From aa13fb780a8461b6b744015ee6b204a789845449 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 1 Jan 2025 16:15:47 +0100 Subject: [PATCH 1/6] wip Signed-off-by: sven1977 --- rllib/BUILD | 8 + rllib/algorithms/algorithm_config.py | 33 +++-- rllib/env/single_agent_env_runner.py | 7 +- .../envs/async_gym_env_vectorization.py | 137 ++++++++++++++++++ rllib/utils/test_utils.py | 4 +- 5 files changed, 170 insertions(+), 19 deletions(-) create mode 100644 rllib/examples/envs/async_gym_env_vectorization.py diff --git a/rllib/BUILD b/rllib/BUILD index 21512a8ea8b92..93528a518c968 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2295,6 +2295,14 @@ py_test( srcs = ["examples/envs/agents_act_in_sequence.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iters=3"] ) +py_test( + name = "examples/envs/async_gym_env_vectorization", + main = "examples/envs/async_gym_env_vectorization.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "medium", + srcs = ["examples/envs/async_gym_env_vectorization.py"], + args = ["--enable-new-api-stack", "--as-test", "--vectorize-mode=BOTH"] +) py_test( name = "examples/envs/custom_env_render_method", main = "examples/envs/custom_env_render_method.py", diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index ab9d6d4fd6640..58ddc2d0f59d3 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -318,6 +318,7 @@ def __init__(self, algo_class: Optional[type] = None): self.env_runner_cls = None self.num_env_runners = 0 self.num_envs_per_env_runner = 1 + self.gym_env_vectorize_mode = gym.envs.registration.VectorizeMode.SYNC self.num_cpus_per_env_runner = 1 self.num_gpus_per_env_runner = 0 self.custom_resources_per_env_runner = {} @@ -904,25 +905,15 @@ def freeze(self) -> None: def validate(self) -> None: """Validates all values in this config.""" - # Check callbacks settings. + self._validate_env_runner_settings() self._validate_callbacks_settings() - # Check framework specific settings. self._validate_framework_settings() - # Check resources specific settings. self._validate_resources_settings() - # Check multi-agent specific settings. self._validate_multi_agent_settings() - # Check input specific settings. self._validate_input_settings() - # Check evaluation specific settings. self._validate_evaluation_settings() - # Check offline specific settings (new API stack). self._validate_offline_settings() - - # Check new API stack specific settings. self._validate_new_api_stack_settings() - - # Check to-be-deprecated settings (however that are still in use). self._validate_to_be_deprecated_settings() def build( @@ -1738,6 +1729,7 @@ def env_runners( env_runner_cls: Optional[type] = NotProvided, num_env_runners: Optional[int] = NotProvided, num_envs_per_env_runner: Optional[int] = NotProvided, + gym_env_vectorize_mode: Optional[str] = NotProvided, num_cpus_per_env_runner: Optional[int] = NotProvided, num_gpus_per_env_runner: Optional[Union[float, int]] = NotProvided, custom_resources_per_env_runner: Optional[dict] = NotProvided, @@ -1795,6 +1787,11 @@ def env_runners( (vector-wise) per EnvRunner. This enables batching when computing actions through RLModule inference, which can improve performance for inference-bottlenecked workloads. + gym_env_vectorize_mode: The gymnasium vectorization mode for vector envs. + Must be a `gymnasium.envs.registration.VectorizeMode` (enum) value. + Default is SYNC. Set this to ASYNC to parallelize the individual sub + environments within the vector. 
This can speed up your EnvRunners + significantly when using heavier environments. num_cpus_per_env_runner: Number of CPUs to allocate per EnvRunner. num_gpus_per_env_runner: Number of GPUs to allocate per EnvRunner. This can be fractional. This is usually needed only if your env itself requires a @@ -1975,7 +1972,8 @@ def env_runners( "larger 0!" ) self.num_envs_per_env_runner = num_envs_per_env_runner - + if gym_env_vectorize_mode is not NotProvided: + self.gym_env_vectorize_mode = gym_env_vectorize_mode if num_cpus_per_env_runner is not NotProvided: self.num_cpus_per_env_runner = num_cpus_per_env_runner if num_gpus_per_env_runner is not NotProvided: @@ -4375,6 +4373,17 @@ def _model_config_auto_includes(self) -> Dict[str, Any]: # ----------------------------------------------------------- # Various validation methods for different types of settings. # ----------------------------------------------------------- + def _validate_env_runner_settings(self) -> None: + allowed_vectorize_modes = set( + gym.envs.registration.VectorizeMode.__members__.keys() + ) + if self.gym_env_vectorize_mode not in allowed_vectorize_modes: + raise ValueError( + f"`gym_env_vectorize_mode` ({self.gym_env_vectorize_mode}) must be a " + "member of `gym.envs.registration.VectorizeMode`! Allowed values " + f"are {allowed_vectorize_modes}." + ) + def _validate_callbacks_settings(self) -> None: """Validates callbacks settings.""" # Old API stack: diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 0dd6f2881974a..677efe5c0357c 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -6,7 +6,6 @@ import gymnasium as gym from gymnasium.wrappers.vector import DictInfoToList -from gymnasium.envs.registration import VectorizeMode from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.callbacks.callbacks import RLlibCallback @@ -641,11 +640,7 @@ def make_env(self) -> None: gym.make_vec( "rllib-single-agent-env-v0", num_envs=self.config.num_envs_per_env_runner, - vectorization_mode=( - VectorizeMode.ASYNC - if self.config.remote_worker_envs - else VectorizeMode.SYNC - ), + vectorization_mode=self.config.gym_env_vectorize_mode.lower(), ) ) diff --git a/rllib/examples/envs/async_gym_env_vectorization.py b/rllib/examples/envs/async_gym_env_vectorization.py new file mode 100644 index 0000000000000..a1745e0c218ae --- /dev/null +++ b/rllib/examples/envs/async_gym_env_vectorization.py @@ -0,0 +1,137 @@ +"""Example demo'ing async gym vector envs, in which sub-envs have their own process. + +Setting up env vectorization works through setting the `config.num_envs_per_env_runner` +value to > 1. However, by default the n sub-environments are stepped through +sequentially, rather than in parallel. + +This script shows the effect of setting the `config.gym_env_vectorize_mode` from its +default value of "SYNC" (all sub envs are located in the same EnvRunner process) +to "ASYNC" (all sub envs in each EnvRunner get their own process). + +This example: + - shows, which config settings to change in order to switch from sub-envs being + stepped in sequence to each sub-envs owning its own process (and compute resource) + and thus the vector being stepped in parallel. + - shows, how this setup can increase EnvRunner performance significantly, especially + for heavier, slower environments. + - uses an artificially slow CartPole-v1 environment for demonstration purposes. 
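+
+The core config change this example toggles boils down to the following sketch
+(the full, runnable setup lives in `base_config` further down in this file):
+
+    config.env_runners(
+        num_envs_per_env_runner=6,  # the script's default, see `parser` below
+        gym_env_vectorize_mode="ASYNC",  # default: "SYNC"
+    )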
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack ` + +Use the `--vectorize-mode=BOTH` option to run both modes (SYNC and ASYNC) +through Tune at the same time and get a better comparison of the throughputs +achieved. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output +when using the + ++--------------------------+------------+------------------------+------+ +| Trial name | status | gym_env_vectorize_mode | iter | +| | | | | +|--------------------------+------------+------------------------+------+ +| PPO_slow-env_6ddf4_00000 | TERMINATED | SYNC | 4 | +| PPO_slow-env_6ddf4_00001 | TERMINATED | ASYNC | 4 | ++--------------------------+------------+------------------------+------+ ++------------------+----------------------+------------------------+ +| total time (s) | episode_return_mean | num_env_steps_sample | +| | | d_lifetime | +|------------------+----------------------+------------------------+ +| 60.8794 | 73.53 | 16040 | +| 19.1203 | 73.86 | 16037 | ++------------------+----------------------+------------------------+ + +You can see that the ASYNC mode, given that the env is sufficiently slow, +achieves much better results when using vectorization. + +You should see no difference, however, when only using +`--num-envs-per-env-runner=1`. +""" +import time + +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray import tune + +parser = add_rllib_example_script_args(default_reward=60.0) +parser.set_defaults( + enable_new_api_stack=True, + env="CartPole-v1", + num_envs_per_env_runner=6, +) +parser.add_argument( + "--vectorize-mode", + type=str, + default="ASYNC", + help="The value `gym.envs.registration.VectorizeMode` to use for env " + "vectorization. SYNC steps through all sub-envs in sequence. ASYNC (default) " + "parallelizes sub-envs through multiprocessing and can speed up EnvRunners " + "significantly. Use the special value `BOTH` to run both ASYNC and SYNC through a " + "Tune grid-search.", +) + + +class SlowEnv(gym.ObservationWrapper): + def observation(self, observation): + time.sleep(0.005) + return observation + + +if __name__ == "__main__": + args = parser.parse_args() + + # Wrap the env with the slowness wrapper. + def _env_creator(cfg): + return SlowEnv(gym.make(args.env, **cfg)) + + tune.register_env("slow-env", _env_creator) + + if args.vectorize_mode == "BOTH" and args.no_tune: + raise ValueError( + "`--vectorize-mode=BOTH` and `--no-tune` not allowed in combination!" + ) + + base_config = ( + PPOConfig() + .environment("slow-env") + .env_runners( + gym_env_vectorize_mode=( + tune.grid_search(["SYNC", "ASYNC"]) + if args.vectorize_mode == "BOTH" + else args.vectorize_mode + ), + ) + ) + + results = run_rllib_example_script_experiment(base_config, args) + + # Compare the throughputs and assert that ASYNC is much faster than SYNC. 
+ if args.vectorize_mode == "BOTH" and args.as_test: + throughput_sync = ( + results[0].metrics["num_env_steps_sampled_lifetime"] + / results[0].metrics["time_total_s"] + ) + throughput_async = ( + results[1].metrics["num_env_steps_sampled_lifetime"] + / results[1].metrics["time_total_s"] + ) + assert throughput_async > throughput_sync diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 95dc9bf95aa63..664b3383c1b4b 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1087,9 +1087,11 @@ def run_rllib_example_script_experiment( enable_env_runner_and_connector_v2=False, ) - # Define EnvRunner/RolloutWorker scaling and behavior. + # Define EnvRunner scaling and behavior. if args.num_env_runners is not None: config.env_runners(num_env_runners=args.num_env_runners) + if args.num_envs_per_env_runner is not None: + config.env_runners(num_envs_per_env_runner=args.num_envs_per_env_runner) # Define compute resources used automatically (only using the --num-learners # and --num-gpus-per-learner args). From 74db8f1407f768daf5f05bceb444d6230d390cac Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 1 Jan 2025 16:44:07 +0100 Subject: [PATCH 2/6] wip Signed-off-by: sven1977 --- doc/source/rllib/rllib-examples.rst | 185 +++++++++++++++------------- 1 file changed, 96 insertions(+), 89 deletions(-) diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index b0b30c6d7edb3..39dd5ff8f6b5e 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -13,20 +13,20 @@ of RLlib, demonstrating the different use cases and features of the library. .. note:: RLlib is currently in a transition state from old- to new API stack. - Some of the example scripts haven't been translated yet to the new stack and are tagged - with the following comment line on top: ``# @OldAPIStack``. The moving of all example - scripts over to the new stack is work in progress. + The Ray team has translated most of the example scripts to the new stack and tag those + still on the old stack with this comment line on top: ``# @OldAPIStack``. + The moving of all example scripts over to the new stack is work in progress. .. note:: - If any (new API stack) example is broken, or if you'd like to add an example to this page, + If you find any new API stack example broken, or if you'd like to add an example to this page, create an issue in the `RLlib GitHub repository `__. Folder structure ---------------- The `examples folder `__ has -several sub-directories, which are described in detail below. +several sub-directories described in detail below. How to run an example script @@ -44,8 +44,8 @@ directory and run the script as-is with python: Use the `--help` command line argument to have each script print out its supported command line options. Most of the scripts share a common subset of generally applicable command line arguments, -for example `--num-env-runners` (to scale the number of EnvRunner actors), `--no-tune` (to switch off running with Ray Tune), -`--wandb-key` (to log to W&B), or `--verbose` (to control log chattiness). +for example `--num-env-runners`, to scale the number of EnvRunner actors, `--no-tune`, to switch off running with Ray Tune, +`--wandb-key`, to log to WandB, or `--verbose`, to control log chattiness. All example sub-folders @@ -55,20 +55,21 @@ All example sub-folders Actions +++++++ - `Nested Action Spaces `__: - Sets up an environment with nested action spaces using custom (single- or multi-agent) configurations. 
This example demonstrates - how RLlib manages complex action structures, such as multi-dimensional or hierarchical action spaces. + Sets up an environment with nested action spaces using custom single- or multi-agent + configurations. This example demonstrates how RLlib manages complex action structures, + such as multi-dimensional or hierarchical action spaces. Checkpoints +++++++++++ -- `Checkpoint by Custom Criteria `__: +- `Checkpoint by custom criteria `__: Shows how to create checkpoints based on custom criteria, giving users control over when to save model snapshots during training. -- `Continue Training From Checkpoint `__: +- `Continue training from checkpoint `__: Illustrates resuming training from a saved checkpoint, useful for extending training sessions or recovering from interruptions. -- `Restore 1 (out of N) Agents from Checkpoint `__: +- `Restore 1 out of N agents from checkpoint `__: Restores one specific agent from a multi-agent checkpoint, allowing selective loading for environments where only certain agents need to resume training. @@ -79,23 +80,25 @@ Connectors .. note:: RLlib's Connector API has been re-written from scratch for the new API stack. Connector-pieces and -pipelines are now referred to as :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` - (as opposed to ``Connector``, which only continue to work on the old API stack). + to distinguish against the ``Connector`` class, which only continue to work on the old API stack. -- `Flatten and One-Hot Observations `__: - Demonstrates how to one-hot discrete observation spaces and/or flatten complex observations (Dict or Tuple), allowing RLlib to process arbitrary - observation data as flattened (1D) vectors. Useful for environments with complex, discrete, or hierarchical observations. +- `Flatten and one-hot observations `__: + Demonstrates how to one-hot discrete observation spaces and/or flatten complex observations, Dict or Tuple, allowing RLlib to process arbitrary + observation data as flattened 1D vectors. Useful for environments with complex, discrete, or hierarchical observations. -- `Observation Frame-Stacking `__: - Implements frame stacking, where consecutive frames are stacked together to provide temporal context to the agent. +- `Observation frame-stacking `__: + Implements frame stacking, where N consecutive frames stack together to provide temporal context to the agent. This technique is common in environments with continuous state changes, like video frames in Atari games. - Using connectors for frame stacking is more efficient as it avoids having to send large observation tensors through the network (ray). + Using connectors for frame stacking is more efficient as it avoids having to send large observation tensors through + ray remote calls. -- `Mean/Std Filtering `__: - Adds mean and standard deviation normalization for observations (shift by the mean and divide by std-dev), improving learning stability - by scaling observations to a normalized range. This can enhance performance in environments with highly variable state magnitudes. +- `Mean/Std filtering `__: + Adds mean and standard deviation normalization for observations, shifting by the mean and dividing by std-dev. + This type of filtering can improve learning stability in environments with highly variable state magnitudes + by scaling observations to a normalized range. 
-- `Prev-Actions, Prev-Rewards Connector `__: +- `Prev-actions, prev-rewards connector `__: Augments observations with previous actions and rewards, giving the agent a short-term memory of past events, which can improve decision-making in partially observable or sequentially dependent tasks. @@ -103,17 +106,17 @@ Connectors Curiosity +++++++++ -- `Count-Based Curiosity `__: +- `Count-based curiosity `__: Implements count-based intrinsic motivation to encourage exploration of less visited states. Using curiosity is beneficial in sparse-reward environments where agents may struggle to find rewarding paths. However, count-based methods are only feasible for environments with small observation spaces. -- `Euclidian Distance-Based Curiosity `__: +- `Euclidian distance-based curiosity `__: Uses Euclidean distance between states and the initial state to measure novelty, encouraging exploration by rewarding the agent for reaching "far away" regions of the environment. Suitable for sparse-reward tasks, where diverse exploration is key to success. -- `Intrinsic-Curiosity-Model (ICM) Based Curiosity `__: +- `Intrinsic-curiosity-model (ICM) Based Curiosity `__: Adds an `Intrinsic Curiosity Model (ICM) `__ that learns to predict the next state as well as the action in between two states to measure novelty. The higher the loss of the ICM, the higher the "novelty" and thus the intrinsic reward. Ideal for complex environments with large observation spaces where reward signals are sparse. @@ -122,7 +125,7 @@ Curiosity Curriculum learning +++++++++++++++++++ -- `Custom Env rendering method `__: +- `Custom env rendering method `__: Demonstrates curriculum learning, where the environment difficulty increases as the agent improves. This approach enables gradual learning, allowing agents to master simpler tasks before progressing to more challenging ones, ideal for environments with hierarchical or staged difficulties. Also see the :doc:`curriculum learning how-to ` from the documentation. @@ -131,22 +134,27 @@ Curriculum learning Environments ++++++++++++ -- `Custom Env Rendering Method `__: +- `Async gym vectorization, parallelizing sub-environments `__: + Shows how the `gym_env_vectorize_mode` config setting can significantly speed up your + :py:class`~ray.rllib.env.env_runner.EnvRunner` actors, if your RL environment is slow and you are + using `num_envs_per_env_runner > 1`. The reason for the performance gain is that each sub-environment runs in its own process. + +- `Custom env rendering method `__: Demonstrates how to add a custom `render()` method to a (custom) environment, allowing visualizations of agent interactions. -- `Custom gymnasium Env `__: +- `Custom gymnasium env `__: Implements a custom `gymnasium `__ environment from scratch, showing how to define observation and action spaces, arbitrary reward functions, as well as, step- and reset logic. -- `Env connecting to RLlib through a TCP client `__: +- `Env connecting to RLlib through a tcp client `__: An external environment, running outside of RLlib and acting as a client, connects to RLlib as a server. The external env performs its own action inference using an ONNX model, sends collected data back to RLlib for training, and receives model updates from time to time from RLlib. -- `Env Rendering and Recording `__: +- `Env rendering and recording `__: Illustrates environment rendering and recording setups within RLlib, capturing visual outputs for later review (ex. on WandB), which is essential for tracking agent behavior in training. 
-- `Env with Protobuf Observations `__: +- `Env with protobuf observations `__: Uses Protobuf for observations, demonstrating an advanced way of handling serialized data in environments. This approach is useful for integrating complex external data sources as observations. @@ -154,10 +162,10 @@ Environments Evaluation ++++++++++ -- `Custom Evaluation `__: +- `Custom evaluation `__: Configures custom evaluation metrics for agent performance, allowing users to define specific success criteria beyond standard RLlib evaluation metrics. -- `Evaluation Parallel to Training `__: +- `Evaluation parallel to training `__: Runs evaluation episodes in parallel with training, reducing training time by offloading evaluation to separate processes. This method is beneficial when you require frequent evaluation without interrupting learning. @@ -165,28 +173,28 @@ Evaluation Fault tolerance +++++++++++++++ -- `Crashing and stalling Env `__: +- `Crashing and stalling env `__: Simulates an environment that randomly crashes or stalls, allowing users to test RLlib's fault-tolerance mechanisms. This script is useful for evaluating how RLlib handles interruptions and recovers from unexpected failures during training. -GPU (for training and sampling) -+++++++++++++++++++++++++++++++ +GPUs for training and sampling +++++++++++++++++++++++++++++++ - `Float16 training and inference `__: - Configures a setup for mixed-precision (float16) training and inference, optimizing performance by reducing memory usage and speeding up computation. + Configures a setup for float16 training and inference, optimizing performance by reducing memory usage and speeding up computation. This is especially useful for large-scale models on compatible GPUs. - `Fractional GPUs per Learner `__: Demonstrates allocating fractional GPUs to individual learners, enabling finer resource allocation in multi-model setups. Useful for saving resources when training smaller models, many of which can fit on a single GPU. -- `Mixed Precision Training and Float16 Inference `__: - Uses mixed precision (float32 and float16) for training, while switching to float16 precision for inference, balancing stability during training +- `Mixed precision training and float16 inference `__: + Uses mixed precision, float32 and float16, for training, while switching to float16 precision for inference, balancing stability during training with performance improvements during evaluation. - `Using GPUs on EnvRunners `__: - Demos how GPUs can be required by :py:class:`~ray.rllib.env.env_runner.EnvRunner` instances as well (single- or multi-agent) through + Demos how :py:class:`~ray.rllib.env.env_runner.EnvRunner` instances, single- or multi-agent, can request GPUs through the `config.env_runners(num_gpus_per_env_runner=..)` setting. @@ -206,30 +214,30 @@ Inference of models or policies - `Policy inference after training `__: Demonstrates performing inference with a trained policy, showing how to load a trained model and use it to make decisions in a simulated environment. -- `Policy Inference after Training (with ConnectorV2) `__: - Runs inference with a trained (LSTM-based) policy using connectors, which preprocess observations and actions, allowing for more modular and flexible inference setups. +- `Policy inference after training, with ConnectorV2 `__: + Runs inference with a trained, LSTM-based policy using connectors, which preprocess observations and actions, allowing for more modular and flexible inference setups. 
Learners ++++++++ -- `Custom Loss Function (simple) `__: +- `Custom loss function, simple `__: Implements a custom loss function for training, demonstrating how users can define tailored loss objectives for specific environments or behaviors. -- `Custom Torch Learning Rate Schedulers `__: +- `Custom torch learning rate schedulers `__: Adds learning rate scheduling to PPO, showing how to adjust the learning rate dynamically using PyTorch schedulers for improved training stability. -- `Separate Learning Rate and Optimizer for Value-Function `__: - Configures a separate learning rate and a separate optimizer for the value function (vs the policy network), enabling differentiated - training dynamics between policy and value estimation in RL algorithms. +- `Separate learning rate and optimizer for value function `__: + Configures a separate learning rate and a separate optimizer for the value function vs the policy network, + enabling differentiated training dynamics between policy and value estimation in RL algorithms. Metrics +++++++ -- `Logging Custom Metrics in EnvRunners `__: +- `Logging custom metrics in EnvRunners `__: Demonstrates adding custom metrics to :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors, providing a way to track specific performance- and environment indicators beyond the standard RLlib metrics. @@ -242,47 +250,47 @@ Multi-agent RL a hand-coded random policy while another agent trains with PPO. This example highlights integrating static and dynamic policies, suitable for environments with a mix of fixed-strategy and adaptive agents. -- `Different Spaces for Agents `__: +- `Different spaces for agents `__: Configures agents with differing observation and action spaces within the same environment, showcasing RLlib's support for heterogeneous agents with varying space requirements in a single multi-agent environment. -- `Grouped Agents (Two-Step Game) `__: - Implements a multi-agent, grouped setup within a two-step game environment (from the `QMIX paper `__). - N agents are grouped into M teams (N >= M) for which policies and rewards are shared. This example demonstrates RLlib's ability to manage - collective objectives and interactions among grouped agents. +- `Grouped agents, two-step game `__: + Implements a multi-agent, grouped setup within a two-step game environment from the `QMIX paper `__. + N agents form M teams in total, where N >= M, and agents in each team share rewards and one policy. + This example demonstrates RLlib's ability to manage collective objectives and interactions among grouped agents. -- `Multi-Agent CartPole `__: +- `Multi-agent CartPole `__: Runs a multi-agent version of the CartPole environment with each agent independently learning to balance its pole. This example serves as a foundational test for multi-agent reinforcement learning scenarios in simple, independent tasks. -- `Multi-Agent Pendulum `__: +- `Multi-agent Pendulum `__: Extends the classic Pendulum environment into a multi-agent setting, where multiple agents attempt to balance their respective pendulums. This example highlights RLlib's support for environments with replicated dynamics but distinct agent policies. -- `PettingZoo Independent Learning `__: +- `PettingZoo independent learning `__: Integrates RLlib with `PettingZoo `__ to facilitate independent learning among multiple agents. Each agent independently optimizes its policy within a shared environment. 
-- `PettingZoo Parameter Sharing `__: +- `PettingZoo parameter sharing `__: Uses `PettingZoo `__ for an environment where all agents share a single policy. -- `PettingZoo Shared Value Function `__: +- `PettingZoo shared value function `__: Also using PettingZoo, this example explores shared value functions among agents. It demonstrates collaborative learning scenarios where agents collectively estimate a value function rather than individual policies. -- `Rock-Paper-Scissors Heuristic vs Learned `__: +- `Rock-paper-scissors heuristic vs learned `__: Simulates a rock-paper-scissors game with one heuristic-driven agent and one learning agent. It provides insights into performance when combining fixed and adaptive strategies in adversarial games. -- `Rock-Paper-Scissors Learned vs Learned `__: - Sets up a rock-paper-scissors game where both agents are trained and therefore learn strategies against each other. +- `Rock-paper-scissors learned vs learned `__: + Sets up a rock-paper-scissors game where you train both agents to learn strategies on how to play against each other. Useful for evaluating performance in simple adversarial settings. -- `Self-Play (League-Based) with OpenSpiel `__: - Uses OpenSpiel to demonstrate league-based self-play, where agents play against various (frozen or still-learning) versions of themselves to - improve through competitive interaction. +- `Self-play, league-based, with OpenSpiel `__: + Uses OpenSpiel to demonstrate league-based self-play, where agents play against various + versions of themselves, frozen or in-training, to improve through competitive interaction. -- `Self-Play with OpenSpiel `__: +- `Self-play with OpenSpiel `__: Similar to the league-based self-play, but simpler. This script leverages OpenSpiel for two-player games, allowing agents to improve through direct self-play without building a complex, structured league. @@ -290,9 +298,9 @@ Multi-agent RL Offline RL ++++++++++ -- `Train with Behavioral Cloning (BC), Finetune with PPO `__: - Combines behavioral cloning pre-training with PPO fine-tuning, providing a two-phase training strategy where imitation learning (offline) - is followed by online reinforcement learning. +- `Train with behavioral cloning (BC), Finetune with PPO `__: + Combines behavioral cloning pre-training with PPO fine-tuning, providing a two-phase + training strategy. Offline imitation learning as a first step followed by online reinforcement learning. Ray Serve and RLlib @@ -307,15 +315,15 @@ Ray Serve and RLlib Ray Tune and RLlib ++++++++++++++++++ -- `Custom Experiment `__: +- `Custom experiment `__: Configures a custom experiment with `Ray Tune `__, demonstrating advanced options for custom training- and evaluation phases -- `Custom Logger `__: +- `Custom logger `__: Shows how to implement a custom logger within `Ray Tune `__, allowing users to define specific logging behaviors and outputs during training. -- `Custom Progress Reporter `__: +- `Custom progress reporter `__: Demonstrates a custom progress reporter in `Ray Tune `__, which enables tracking and displaying specific training metrics or status updates in a customized format. 
@@ -323,31 +331,31 @@ Ray Tune and RLlib RLModules +++++++++ -- `Action Masking `__: - Implements an :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` with action masking, where certain (disallowed) actions are +- `Action masking `__: + Implements an :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` with action masking, where certain disallowed actions are masked based on parts of the observation dict, useful for environments with conditional action availability. -- `Auto-Regressive Actions `__: +- `Auto-regressive actions `__: Configures an RL module that generates actions in an autoregressive manner, where the second component of an action depends on the previously sampled first component of the same action. -- `Custom CNN-Based RLModule `__: +- `Custom CNN-based RLModule `__: Demonstrates a custom CNN architecture realized as an :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`, enabling convolutional feature extraction tailored to the environment's visual observations. -- `Custom LSTM-Based RLModule `__: +- `Custom LSTM-based RLModule `__: Uses a custom LSTM within an :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`, allowing for temporal sequence processing, beneficial for partially observable environments with sequential dependencies. -- `Migrate ModelV2 to RLModule (new API stack) by config `__: +- `Migrate ModelV2 to RLModule by config `__: Shows how to migrate a ModelV2-based setup (old API stack) to the new API stack's :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`, using an (old API stack) :py:class:`~ray.rllib.algorithm.algorithm_config.AlgorithmConfig` instance. -- `Migrate ModelV2 to RLModule (new API stack) by Policy Checkpoint `__: +- `Migrate ModelV2 to RLModule by Policy Checkpoint `__: Migrates a ModelV2 (old API stack) to the new API stack's :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` by directly loading a policy checkpoint, enabling smooth transitions to the new API stack while preserving learned parameters. -- `Pretrain Single-Agent Policy, then Train in Multi-Agent Env `__: +- `Pretrain single-agent policy, then train in multi-agent Env `__: Demonstrates pretraining a single-agent model and transferring it to a multi-agent setting, useful for initializing multi-agent scenarios with pre-trained policies. @@ -356,20 +364,19 @@ Tuned examples -------------- The `tuned examples `__ folder -contains python config files that can be executed analogously to -all other example scripts described here to run tuned learning experiments -for the different algorithms and environment types. +contains python config files that you can execute analogously to all other example scripts described +here to run tuned learning experiments for the different algorithms and environment types. -For example, see this tuned Atari example for PPO, which learns to solve the Pong environment -in roughly 5 minutes. It can be run as follows on a single g5.24xlarge (or g6.24xlarge) machine with -4 GPUs and 96 CPUs: +For example, see this `tuned Atari example for PPO `__, +which learns to solve the Pong environment in roughly 5 minutes. You can run it as follows on a single +g5.24xlarge or g6.24xlarge machine with 4 GPUs and 96 CPUs: .. code-block:: bash $ cd ray/rllib/tuned_examples/ppo $ python atari_ppo.py --env=ale_py:ALE/Pong-v5 --num-learners=4 --num-env-runners=95 -Note that some of the files in this folder are used for RLlib's daily or weekly release tests as well. 
+Note that RLlib's daily or weekly release tests use some of the files in this folder as well. Community examples @@ -387,7 +394,7 @@ Community examples Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. - `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: Using Graph Neural Networks and RLlib to train multiple cooperative and adversarial agents to solve the - "cover the area"-problem, thereby learning how to best communicate (or - in the adversarial case - how to disturb communication) (`code `__). + "cover the area"-problem, thereby learning how to best communicate or - in the adversarial case - how to disturb communication (`code `__). - `Flatland `__: A dense traffic simulating environment with RLlib-generated baselines. - `GFootball `__: @@ -424,7 +431,7 @@ Blog posts - `Attention Nets and More with RLlib’s Trajectory View API `__: - Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. + Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL attention net architectures. - `Reinforcement Learning with RLlib in the Unity Game Engine `__: How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. - `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: From e706ede5e2be04c8353bd1c55112da6303b8e17f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 1 Jan 2025 19:15:50 +0100 Subject: [PATCH 3/6] k Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 3 ++- rllib/env/single_agent_env_runner.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 58ddc2d0f59d3..195ad354cc241 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -4375,7 +4375,8 @@ def _model_config_auto_includes(self) -> Dict[str, Any]: # ----------------------------------------------------------- def _validate_env_runner_settings(self) -> None: allowed_vectorize_modes = set( - gym.envs.registration.VectorizeMode.__members__.keys() + list(gym.envs.registration.VectorizeMode.__members__.keys()) + + list(gym.envs.registration.VectorizeMode.__members__.values()) ) if self.gym_env_vectorize_mode not in allowed_vectorize_modes: raise ValueError( diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 677efe5c0357c..6f6f2ed0d16ea 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -635,12 +635,17 @@ def make_env(self) -> None: env_context=env_ctx, ) gym.register("rllib-single-agent-env-v0", entry_point=entry_point) + vectorize_mode = self.config.gym_env_vectorize_mode self.env = DictInfoToList( gym.make_vec( "rllib-single-agent-env-v0", num_envs=self.config.num_envs_per_env_runner, - vectorization_mode=self.config.gym_env_vectorize_mode.lower(), + vectorization_mode=( + vectorize_mode + if isinstance(vectorize_mode, gym.envs.registration.VectorizeMode) + else gym.envs.registration.VectorizeMode(vectorize_mode.lower()) + ), ) ) From d9ec5cf67eba87b5e5cf377e5ab64b5ff8fa2d60 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 3 Jan 2025 11:40:11 +0100 Subject: [PATCH 4/6] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/algorithms/algorithm_config.py 
b/rllib/algorithms/algorithm_config.py
index 8211806d58a87..6681075e91311 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -318,7 +318,9 @@ def __init__(self, algo_class: Optional[type] = None):
         self.env_runner_cls = None
         self.num_env_runners = 0
         self.num_envs_per_env_runner = 1
-        self.gym_env_vectorize_mode = gym.envs.registration.VectorizeMode.SYNC
+        # TODO (sven): Once the new ormsgpack system is in place, replace the string
+        # with the proper `gym.envs.registration.VectorizeMode.SYNC`.
+        self.gym_env_vectorize_mode = "SYNC"
         self.num_cpus_per_env_runner = 1
         self.num_gpus_per_env_runner = 0
         self.custom_resources_per_env_runner = {}

From 06909a2fbeebf839db4050047f735e81f3ff9d7c Mon Sep 17 00:00:00 2001
From: sven1977
Date: Fri, 3 Jan 2025 12:16:32 +0100
Subject: [PATCH 5/6] fix

Signed-off-by: sven1977
---
 rllib/examples/envs/async_gym_env_vectorization.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/rllib/examples/envs/async_gym_env_vectorization.py b/rllib/examples/envs/async_gym_env_vectorization.py
index a1745e0c218ae..06a2d7d0982a3 100644
--- a/rllib/examples/envs/async_gym_env_vectorization.py
+++ b/rllib/examples/envs/async_gym_env_vectorization.py
@@ -99,6 +99,11 @@ def observation(self, observation):
 if __name__ == "__main__":
     args = parser.parse_args()
 
+    if args.no_tune and args.vectorize_mode == "BOTH":
+        raise ValueError(
+            "Can't run this script with both --no-tune and --vectorize-mode=BOTH!"
+        )
+
     # Wrap the env with the slowness wrapper.
     def _env_creator(cfg):
         return SlowEnv(gym.make(args.env, **cfg))
@@ -125,7 +130,7 @@ def _env_creator(cfg):
     results = run_rllib_example_script_experiment(base_config, args)
 
     # Compare the throughputs and assert that ASYNC is much faster than SYNC.
-    if args.vectorize_mode == "BOTH" and args.as_test:
+    if args.vectorize_mode == "BOTH":
         throughput_sync = (
             results[0].metrics["num_env_steps_sampled_lifetime"]
             / results[0].metrics["time_total_s"]

From 463d28277078ca14334735462665c06cc3ef76bb Mon Sep 17 00:00:00 2001
From: sven1977
Date: Fri, 3 Jan 2025 13:23:13 +0100
Subject: [PATCH 6/6] fixes

Signed-off-by: sven1977
---
 rllib/algorithms/tests/test_callbacks_on_algorithm.py       | 2 +-
 rllib/examples/fault_tolerance/crashing_and_stalling_env.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/rllib/algorithms/tests/test_callbacks_on_algorithm.py b/rllib/algorithms/tests/test_callbacks_on_algorithm.py
index 9d3592771282b..7da44c57c8051 100644
--- a/rllib/algorithms/tests/test_callbacks_on_algorithm.py
+++ b/rllib/algorithms/tests/test_callbacks_on_algorithm.py
@@ -73,7 +73,7 @@ def test_on_env_runners_recreated_callback(self):
         # Train a bit (and have the envs/workers crash).
         for _ in range(3):
             print(algo.train())
-        time.sleep(5.0)
+        time.sleep(15.0)
         algo.restore_workers(algo.env_runner_group)
 
         # After training, the `on_workers_recreated` callback should have captured

diff --git a/rllib/examples/fault_tolerance/crashing_and_stalling_env.py b/rllib/examples/fault_tolerance/crashing_and_stalling_env.py
index bdae74c6d4be3..4425d51d5d9e5 100644
--- a/rllib/examples/fault_tolerance/crashing_and_stalling_env.py
+++ b/rllib/examples/fault_tolerance/crashing_and_stalling_env.py
@@ -94,7 +94,6 @@
 parser.set_defaults(
     enable_new_api_stack=True,
     num_env_runners=4,
-    num_envs_per_env_runner=2,
 )
 # Use `parser` to add your own custom command line options to this script
 # and (if needed) use their values to set up `config` below.
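
Minimal usage sketch of the new `gym_env_vectorize_mode` setting, based on the
config API added in this series (the algorithm, env, and concrete values below
are placeholders; any AlgorithmConfig works the same way):

    from ray.rllib.algorithms.ppo import PPOConfig

    config = (
        PPOConfig()
        .environment("CartPole-v1")
        .env_runners(
            # Vectorize: run 4 sub-environments per EnvRunner.
            num_envs_per_env_runner=4,
            # Step the sub-environments in parallel, each in its own process.
            # The enum form `gym.envs.registration.VectorizeMode.ASYNC` is
            # accepted as well; the default remains "SYNC".
            gym_env_vectorize_mode="ASYNC",
        )
    )
    algo = config.build()
    print(algo.train())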