From 9cf29aeea73b0dbaa543a17f303da3a5982b0232 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 21:13:56 -0700 Subject: [PATCH 1/9] wip --- doc/source/rllib-concepts.rst | 383 +++++++++++++++++++++++++++++++--- doc/source/rllib.rst | 22 +- 2 files changed, 363 insertions(+), 42 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index d91a29f28b9ff..e614596e9f4db 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -1,26 +1,27 @@ -RLlib Concepts -============== +RLlib Concepts and Custom Algorithms +==================================== This page describes the internal concepts used to implement algorithms in RLlib. You might find this useful if modifying or adding new algorithms to RLlib. -Policy Graphs -------------- +Policies +-------- -Policy graph classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. +Policy classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. -Most interaction with deep learning frameworks is isolated to the `PolicyGraph interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policy graphs, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: +Most interaction with deep learning frameworks is isolated to the `Policy interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policies, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: .. code-block:: python - class CustomPolicy(PolicyGraph): - """Example of a custom policy graph written from scratch. + class CustomPolicy(Policy): + """Example of a custom policy written from scratch. - You might find it more convenient to extend TF/TorchPolicyGraph instead - for a real policy. + You might find it more convenient to use the `build_tf_policy` and + `build_torch_policy` helpers instead for a real policy, which are + described in the next sections. """ def __init__(self, observation_space, action_space, config): - PolicyGraph.__init__(self, observation_space, action_space, config) + Policy.__init__(self, observation_space, action_space, config) # example parameter self.w = 1.0 @@ -45,61 +46,372 @@ Most interaction with deep learning frameworks is isolated to the `PolicyGraph i def set_weights(self, weights): self.w = weights["w"] +Building Policies in TensorFlow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section covers how to build a TensorFlow RLlib policy using ``tf_policy_template.build_tf_policy()``. + +To start, you first have to define a loss function. In RLlib, loss functions are defined over batches of trajectory data produced by policy evaluation. A basic policy gradient loss that only tries to maximize the 1-step reward can be defined as follows: + +.. 
code-block:: python + + import tensorflow as tf + from ray.rllib.policy.sample_batch import SampleBatch + + def policy_gradient_loss(policy, batch_tensors): + actions = batch_tensors[SampleBatch.ACTIONS] + rewards = batch_tensors[SampleBatch.REWARDS] + return -tf.reduce_mean(policy.action_dist.logp(actions) * rewards) + +In the above snippet, ``actions`` is a Tensor placeholder of shape ``[batch_size, action_dim...]``, and ``rewards`` is a placeholder of shape ``[batch_size]``. The ``policy.action_dist`` object is an `ActionDistribution `__ that represents the output of the neural network policy model. Passing this loss function to ``build_tf_policy`` is enough to produce a very basic TF policy: + +.. code-block:: python + + from ray.rllib.policy.tf_policy_template import build_tf_policy + + # + MyTFPolicy = build_tf_policy( + name="MyTFPolicy", + loss_fn=policy_gradient_loss) + +We can create a `Trainer <#trainers>`__ and try running this policy on a toy env with two parallel rollout workers: + +.. code-block:: python + + import ray + from ray import tune + from ray.rllib.agents.trainer_template import build_trainer + + # + MyTrainer = build_trainer( + name="MyCustom", + default_policy=MyTFPolicy) + + ray.init() + tune.run(MyTrainer, config={"env": "CartPole-v0", "num_workers": 2}) + + +If you run the above snippet, you'll probably notice that CartPole doesn't learn so well: + +.. code-block:: bash + + == Status == + Using FIFO scheduling algorithm. + Resources requested: 3/4 CPUs, 0/0 GPUs + Memory usage on this node: 4.6/12.3 GB + Result logdir: /home/ubuntu/ray_results/MyAlgTrainer + Number of trials: 1 ({'RUNNING': 1}) + RUNNING trials: + - MyAlgTrainer_CartPole-v0_0: RUNNING, [3 CPUs, 0 GPUs], [pid=26784], + 32 s, 156 iter, 62400 ts, 23.1 rew + +Let's modify our policy loss to include rewards summed over time. To enable this advantage calculation, we need to define a *trajectory postprocessor* for the policy. This can be done by defining ``postprocess_fn``: + +.. code-block:: python + + from ray.rllib.evaluation.postprocessing import compute_advantages, \ + Postprocessing + + def postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): + return compute_advantages( + sample_batch, 0.0, policy.config["gamma"], use_gae=False) + + def policy_gradient_loss(policy, batch_tensors): + actions = batch_tensors[SampleBatch.ACTIONS] + advantages = batch_tensors[Postprocessing.ADVANTAGES] + return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) + + MyTFPolicy = build_tf_policy( + name="MyTFPolicy", + loss_fn=policy_gradient_loss, + postprocess_fn=postprocess_advantages) + +The ``postprocess_advantages()`` function above uses calls RLlib's ``compute_advantages`` function to compute advantages for each timestep. If you re-run the trainer with this improved policy, you'll find that it quickly achieves the max reward of 200. + +You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. This allows placeholders to be dynamically generated in most scenarios. + +**Advanced Example: Proximal Policy Optimization** + +In the above example you saw how to compose a simple policy gradient algorithm with RLlib. 
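+As a quick recap, here are the pieces of that example assembled in one place. This is only a restatement of the snippets already shown above (the ``MyCustomTrainer`` name is just illustrative):
+
+.. code-block:: python
+
+    import ray
+    from ray import tune
+    from ray.rllib.agents.trainer_template import build_trainer
+    from ray.rllib.policy.tf_policy_template import build_tf_policy
+
+    # Reuses the policy_gradient_loss and postprocess_advantages functions
+    # defined earlier in this section.
+    MyTFPolicy = build_tf_policy(
+        name="MyTFPolicy",
+        loss_fn=policy_gradient_loss,
+        postprocess_fn=postprocess_advantages)
+
+    MyTrainer = build_trainer(
+        name="MyCustomTrainer",
+        default_policy=MyTFPolicy)
+
+    ray.init()
+    tune.run(MyTrainer, config={"env": "CartPole-v0", "num_workers": 2})
+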
In this section, we'll dive into how PPO was built with RLlib and how you can modify it. First, check out the `PPO trainer definition `__: + +.. code-block:: python + + PPOTrainer = build_trainer( + name="PPOTrainer", + default_config=DEFAULT_CONFIG, + default_policy=PPOTFPolicy, + make_policy_optimizer=make_optimizer, + validate_config=validate_config, + after_optimizer_step=update_kl, + before_train_step=warn_about_obs_filter, + after_train_result=warn_about_bad_reward_scales) + +Besides some boilerplate for defining the PPO configuration and some warnings, there are two important arguments to take note of here: ``make_policy_optimizer=make_optimizer``, and ``after_optimizer_step=update_kl``. + +The ``make_optimizer`` function chooses which `Policy Optimizer <#policy-optimization>`__ to use for distributed training. You can think of these policy optimizers as coordinating the distributed workflow needed to improve the policy. Depending on the trainer config, PPO can switch between a simple synchronous optimizer (the default), or a multi-GPU optimizer that implements minibatch SGD: + +.. code-block:: python + + def make_optimizer(workers, config): + if config["simple_optimizer"]: + return SyncSamplesOptimizer( + workers, + num_sgd_iter=config["num_sgd_iter"], + train_batch_size=config["train_batch_size"]) + + return LocalMultiGPUOptimizer( + workers, + sgd_batch_size=config["sgd_minibatch_size"], + num_sgd_iter=config["num_sgd_iter"], + num_gpus=config["num_gpus"], + sample_batch_size=config["sample_batch_size"], + num_envs_per_worker=config["num_envs_per_worker"], + train_batch_size=config["train_batch_size"], + standardize_fields=["advantages"], + straggler_mitigation=config["straggler_mitigation"]) + +Suppose we want to customize PPO to use an asynchronous-gradient optimization strategy similar to A3C. To do that, we could define a new function that returns ``AsyncGradientsOptimizer`` and pass in ``make_policy_optimizer=make_async_optimizer`` when building the trainer: + +.. code-block:: python + + from ray.rllib.optimizers import AsyncGradientsOptimizer + + def make_async_optimizer(workers, config): + return AsyncGradientsOptimizer(workers, grads_per_step=100) + + PPOTrainer = build_trainer( + ..., + make_policy_optimizer=make_async_optimizer) + + +Now let's take a look at the ``update_kl`` function. This is used to adaptively adjust the KL penalty coefficient on the PPO loss, which bounds the policy change per training step. You'll notice the code handles both single and multi-agent cases (where there are be multiple policies each with different KL coeffs): + +.. code-block:: python + + def update_kl(trainer, fetches): + if "kl" in fetches: + # single-agent + trainer.workers.local_worker().for_policy( + lambda pi: pi.update_kl(fetches["kl"])) + else: + + def update(pi, pi_id): + if pi_id in fetches: + pi.update_kl(fetches[pi_id]["kl"]) + else: + logger.debug("No data for {}, not updating kl".format(pi_id)) + + # multi-agent + trainer.workers.local_worker().foreach_trainable_policy(update) + +The ``update_kl`` method on the policy is defined in `PPOTFPolicy `__ via the ``KLCoeffMixin``, along with several other advanced features. Let's look at each new feature used by the policy: + +.. 
code-block:: python + + PPOTFPolicy = build_tf_policy( + name="PPOTFPolicy", + get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, + loss_fn=ppo_surrogate_loss, + stats_fn=kl_and_loss_stats, + extra_action_fetches_fn=vf_preds_and_logits_fetches, + postprocess_fn=postprocess_ppo_gae, + gradients_fn=clip_gradients, + before_loss_init=setup_mixins, + mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) + +``stats_fn``: The stats function returns a dictionary of Tensors that will be reported with the training results. This also includes the ``kl`` metric which is used by the trainer to adjust the KL penalty. Note that many of the values below reference ``policy.loss_obj``, which is assigned by ``loss_fn`` (not shown here since the PPO loss is quite complex). RLlib will always call ``stats_fn`` after ``loss_fn``, so you can rely on using values saved by ``loss_fn`` as part of your statistics: + +.. code-block:: python + + def kl_and_loss_stats(policy, batch_tensors): + policy.explained_variance = explained_variance( + batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) + + stats_fetches = { + "cur_kl_coeff": policy.kl_coeff, + "cur_lr": tf.cast(policy.cur_lr, tf.float64), + "total_loss": policy.loss_obj.loss, + "policy_loss": policy.loss_obj.mean_policy_loss, + "vf_loss": policy.loss_obj.mean_vf_loss, + "vf_explained_var": policy.explained_variance, + "kl": policy.loss_obj.mean_kl, + "entropy": policy.loss_obj.mean_entropy, + } + + return stats_fetches + +``extra_actions_fetches_fn``: This function defines extra outputs that will be recorded when generating actions with the policy. For example, this enables saving the raw policy logits in the experience batch, which e.g. means it can be referenced in the PPO loss function via ``batch_tensors[BEHAVIOUR_LOGITS]``. Other values such as the current value prediction can also be emitted for debugging or optimization purposes: + +.. code-block:: python + + def vf_preds_and_logits_fetches(policy): + return { + SampleBatch.VF_PREDS: policy.value_function, + BEHAVIOUR_LOGITS: policy.model.outputs, + } + +``gradients_fn``: If defined, this function returns TF gradients for the loss function. You'd typically only want to override this to apply transformations such as gradient clipping: + +.. code-block:: python + + def clip_gradients(policy, optimizer, loss): + if policy.config["grad_clip"] is not None: + policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + tf.get_variable_scope().name) + grads = tf.gradients(loss, policy.var_list) + policy.grads, _ = tf.clip_by_global_norm(grads, + policy.config["grad_clip"]) + clipped_grads = list(zip(policy.grads, policy.var_list)) + return clipped_grads + else: + return optimizer.compute_gradients( + loss, colocate_gradients_with_ops=True) + +``mixins``: To add arbitrary stateful components, you can add mixin classes to the policy. Methods defined by these mixins will have higher priority than the base policy class, so you can use these to override methods (as in the case of ``LearningRateSchedule``), or define extra methods and attributes (e.g., ``KLCoeffMixin``, ``ValueNetworkMixin``). Like any other Python superclass, these should be initialized at some point, which is what the ``setup_mixins`` function does: + +.. 
code-block:: python + + def setup_mixins(policy, obs_space, action_space, config): + ValueNetworkMixin.__init__(policy, obs_space, action_space, config) + KLCoeffMixin.__init__(policy, config) + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + +In PPO we run ``setup_mixins`` before the loss function is called (i.e., ``before_loss_init``), but other callbacks you can use include ``before_init`` and ``after_init``. + +Finally, we note that you do not have to use ``build_tf_policy`` to define a TensorFlow policy. You can alternatively subclass ``Policy``, ``TFPolicy``, or ``DynamicTFPolicy`` as convenient. + + +Building Policies in PyTorch +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Defining a policy in PyTorch is quite similar to that for TensorFlow (and the process of defining a trainer given a Torch policy is exactly the same). Building on the TF example above, let's look at how the `A3C torch policy `__ is defined: + +.. code-block:: python + + A3CTorchPolicy = build_torch_policy( + name="A3CTorchPolicy", + get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, + loss_fn=actor_critic_loss, + stats_fn=loss_and_entropy_stats, + postprocess_fn=add_advantages, + extra_action_out_fn=model_value_predictions, + extra_grad_process_fn=apply_grad_clipping, + optimizer_fn=torch_optimizer, + mixins=[ValueNetworkMixin]) + +``loss_fn``: Similar to the TF example, the actor critic loss is defined over ``batch_tensors``. We imperatively execute the forward pass by calling ``policy.model()`` on the observations followed by ``policy.dist_class()`` on the output logits. The output Tensors are saved as attributes of the policy object (e.g., ``policy.entropy = dist.entropy.mean()``), and we return the scalar loss: + +.. code-block:: python + + def actor_critic_loss(policy, batch_tensors): + logits, _, values, _ = policy.model({ + SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] + }, []) + dist = policy.dist_class(logits) + log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS]) + policy.entropy = dist.entropy().mean() + ... + return overall_err + +``stats_fn``: The stats function references ``entropy``, ``pi_err``, and ``value_err`` saved from the call to the loss function, similar in the PPO TF example: + +.. code-block:: python + + def loss_and_entropy_stats(policy, batch_tensors): + return { + "policy_entropy": policy.entropy.item(), + "policy_loss": policy.pi_err.item(), + "vf_loss": policy.value_err.item(), + } + +``extra_action_out_fn``: We save value function predictions given model outputs. This makes the value function predictions of the model available in the trajectory as ``batch_tensors[SampleBatch.VF_PREDS]``: + +.. code-block:: python + + def model_value_predictions(policy, model_out): + return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} + +``postprocess_fn`` and ``mixins``: Similar to the PPO example, we need access to the value function during postprocessing (i.e., ``add_advantages`` below calls ``policy._value()``. The value function is exposed through a mixin class that defines the method: + +.. 
code-block:: python + + def add_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): + completed = sample_batch[SampleBatch.DONES][-1] + if completed: + last_r = 0.0 + else: + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) + return compute_advantages(sample_batch, last_r, policy.config["gamma"], + policy.config["lambda"]) + + class ValueNetworkMixin(object): + def _value(self, obs): + with self.lock: + obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device) + _, _, vf, _ = self.model({"obs": obs}, []) + return vf.detach().cpu().numpy().squeeze() + +You can find the full policy definition in `a3c_torch_policy.py `__. + +In summary, the main differences between the PyTorch and TensorFlow policy builder functions is that the TF loss and stats functions are evaluated symbolically (when the policy is initialized), whereas for PyTorch these functions are called imperatively each time they are used. + Policy Evaluation ----------------- -Given an environment and policy graph, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithms. +Given an environment and policy, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `RolloutWorker `__ class that manages all of this, and this class is used in most RLlib algorithms. -You can use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvaluator.as_remote()``). +You can use rollout workers standalone to produce batches of experiences. This can be done by calling ``worker.sample()`` on a worker instance, or ``worker.sample.remote()`` in parallel on worker instances created as Ray actors (see ``RolloutWorkers.create_remote``). -Here is an example of creating a set of policy evaluation actors and using the to gather experiences in parallel. The trajectories are concatenated, the policy learns on the trajectory batch, and then we broadcast the policy weights to the evaluators for the next round of rollouts: +Here is an example of creating a set of rollout workers and using them gather experiences in parallel. The trajectories are concatenated, the policy learns on the trajectory batch, and then we broadcast the policy weights to the workers for the next round of rollouts: .. 
code-block:: python - # Setup policy and remote policy evaluation actors + # Setup policy and rollout workers env = gym.make("CartPole-v0") policy = CustomPolicy(env.observation_space, env.action_space, {}) - remote_evaluators = [ - PolicyEvaluator.as_remote().remote(lambda c: gym.make("CartPole-v0"), - CustomPolicy) - for _ in range(10) - ] + workers = WorkerSet( + policy=CustomPolicy, + env_creator=lambda c: gym.make("CartPole-v0"), + num_workers=10) while True: # Gather a batch of samples T1 = SampleBatch.concat_samples( - ray.get([w.sample.remote() for w in remote_evaluators])) + ray.get([w.sample.remote() for w in workers.remote_workers()])) # Improve the policy using the T1 batch policy.learn_on_batch(T1) # Broadcast weights to the policy evaluation workers weights = ray.put({"default_policy": policy.get_weights()}) - for w in remote_evaluators: + for w in workers.remote_workers(): w.set_weights.remote(weights) Policy Optimization ------------------- -Similar to how a `gradient-descent optimizer `__ can be used to improve a model, RLlib's `policy optimizers `__ implement different strategies for improving a policy graph. +Similar to how a `gradient-descent optimizer `__ can be used to improve a model, RLlib's `policy optimizers `__ implement different strategies for improving a policy. -For example, in A3C you'd want to compute gradients asynchronously on different workers, and apply them to a central policy graph replica. This strategy is implemented by the `AsyncGradientsOptimizer `__. Another alternative is to gather experiences synchronously in parallel and optimize the model centrally, as in `SyncSamplesOptimizer `__. Policy optimizers abstract these strategies away into reusable modules. +For example, in A3C you'd want to compute gradients asynchronously on different workers, and apply them to a central policy replica. This strategy is implemented by the `AsyncGradientsOptimizer `__. Another alternative is to gather experiences synchronously in parallel and optimize the model centrally, as in `SyncSamplesOptimizer `__. Policy optimizers abstract these strategies away into reusable modules. This is how the example in the previous section looks when written using a policy optimizer: .. code-block:: python # Same setup as before - local_evaluator = PolicyEvaluator(lambda c: gym.make("CartPole-v0"), CustomPolicy) - remote_evaluators = [ - PolicyEvaluator.as_remote().remote(lambda c: gym.make("CartPole-v0"), - CustomPolicy) - for _ in range(10) - ] + workers = WorkerSet( + policy=CustomPolicy, + env_creator=lambda c: gym.make("CartPole-v0"), + num_workers=10) # this optimizer implements the IMPALA architecture - optimizer = AsyncSamplesOptimizer( - local_evaluator, remote_evaluators, train_batch_size=500) + optimizer = AsyncSamplesOptimizer(workers, train_batch_size=500) while True: optimizer.step() @@ -110,7 +422,7 @@ Trainers Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the policy evaluators and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. -Example of two equivalent ways of interacting with the PPO trainer: +Example of three equivalent ways of interacting with the PPO trainer: .. code-block:: python @@ -121,3 +433,8 @@ Example of two equivalent ways of interacting with the PPO trainer: .. 
code-block:: bash rllib train --run=PPO --env=CartPole-v0 --config='{"train_batch_size": 4000}' + +.. code-block:: python + + from ray import tune + tune.run(PPOTrainer, config={"env": "CartPole-v0", "train_batch_size": 4000}) diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 06c5800355077..3f15e90015999 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -5,8 +5,7 @@ RLlib is an open-source library for reinforcement learning that offers both high .. image:: rllib-stack.svg -Learn more about RLlib's design by reading the `ICML paper `__. -To get started, take a look over the `custom env example `__ and the `API documentation `__. +To get started, take a look over the `custom env example `__ and the `API documentation `__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms `__. Installation ------------ @@ -50,7 +49,7 @@ Models and Preprocessors * `Custom Preprocessors `__ * `Supervised Model Losses `__ * `Variable-length / Parametric Action Spaces `__ -* `Customizing Policy Graphs `__ +* `Customizing Policies `__ Algorithms ---------- @@ -96,12 +95,17 @@ Offline Datasets * `Input API `__ * `Output API `__ -Concepts --------- -* `Policy Graphs `__ -* `Policy Evaluation `__ -* `Policy Optimization `__ -* `Trainers `__ +Concepts and Custom Algorithms +------------------------------ +* `Policies `__ + + - `Building Policies in TensorFlow `__ + + - `Building Policies in PyTorch `__ + +* `Policy Evaluation `__ +* `Policy Optimization `__ +* `Trainers `__ Examples -------- From 2682c234a5accf905fefe1ed49253795a1659c8e Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 21:15:25 -0700 Subject: [PATCH 2/9] fix index --- doc/source/index.rst | 4 ++-- doc/source/rllib-concepts.rst | 4 ++-- doc/source/rllib-env.rst | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index eba9eaa6ccac7..a90e0224bb02f 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -98,10 +98,10 @@ Ray comes with libraries that accelerate deep learning and reinforcement learnin rllib-models.rst rllib-algorithms.rst rllib-offline.rst - rllib-dev.rst rllib-concepts.rst - rllib-package-ref.rst rllib-examples.rst + rllib-dev.rst + rllib-package-ref.rst .. toctree:: :maxdepth: 1 diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index e614596e9f4db..916896cabb20b 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -359,7 +359,7 @@ Defining a policy in PyTorch is quite similar to that for TensorFlow (and the pr You can find the full policy definition in `a3c_torch_policy.py `__. -In summary, the main differences between the PyTorch and TensorFlow policy builder functions is that the TF loss and stats functions are evaluated symbolically (when the policy is initialized), whereas for PyTorch these functions are called imperatively each time they are used. +In summary, the main differences between the PyTorch and TensorFlow policy builder functions is that the TF loss and stats functions are built symbolically when the policy is initialized, whereas for PyTorch these functions are called imperatively each time they are used. Policy Evaluation ----------------- @@ -420,7 +420,7 @@ This is how the example in the previous section looks when written using a polic Trainers -------- -Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. 
They manage algorithm configuration, setup of the policy evaluators and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. +Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the rollout workers and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. Example of three equivalent ways of interacting with the PPO trainer: diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 2701a689dc2c1..818e3c7e43248 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -275,7 +275,7 @@ Implementing a centralized critic that takes as input the observations and actio .. code-block:: python - def postprocess_trajectory(self, sample_batch, other_agent_batches, episode): + def postprocess_trajectory(policy, sample_batch, other_agent_batches, episode): agents = ["agent_1", "agent_2", "agent_3"] # simple example of 3 agents global_obs_batch = np.stack( [other_agent_batches[agent_id][1]["obs"] for agent_id in agents], From 660cb6fa01698d813e464e3d214b67448be63788 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 21:27:28 -0700 Subject: [PATCH 3/9] fix bugs --- doc/source/rllib-concepts.rst | 2 +- python/ray/rllib/agents/pg/pg.py | 2 +- python/ray/rllib/agents/ppo/ppo.py | 2 +- python/ray/rllib/agents/trainer_template.py | 13 ++++++------- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 916896cabb20b..2d8ee7f74f0ce 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -84,7 +84,7 @@ We can create a `Trainer <#trainers>`__ and try running this policy on a toy env # MyTrainer = build_trainer( - name="MyCustom", + name="MyCustomTrainer", default_policy=MyTFPolicy) ray.init() diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index ffbb899d1b9e5..872d662e0fad0 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -29,7 +29,7 @@ def get_policy_class(config): PGTrainer = build_trainer( - name="PG", + name="PGTrainer", default_config=DEFAULT_CONFIG, default_policy=PGTFPolicy, get_policy_class=get_policy_class) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index d3f5abdaa95c0..a9dc682262cfb 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -156,7 +156,7 @@ def validate_config(config): PPOTrainer = build_trainer( - name="PPO", + name="PPOTrainer", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, make_policy_optimizer=make_optimizer, diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 618bc3b30ace1..e709e60bc2380 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.agents.trainer import Trainer +from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils.annotations import override, DeveloperAPI @@ -44,13 +44,12 @@ def build_trainer(name, a Trainer instance that uses the specified args. 
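+
+    Examples:
+        >>> # Illustrative sketch only: MyTFPolicy stands in for a policy
+        >>> # class created with build_tf_policy() or build_torch_policy().
+        >>> MyTrainer = build_trainer(
+        ...     name="MyCustomTrainer", default_policy=MyTFPolicy)
+        >>> tune.run(MyTrainer, config={"env": "CartPole-v0"})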
""" - if name.endswith("Trainer"): - raise ValueError("Algorithm name should not include *Trainer suffix", - name) + if not name.endswith("Trainer"): + raise ValueError("Algorithm name should have *Trainer suffix", name) class trainer_cls(Trainer): _name = name - _default_config = default_config or Trainer.COMMON_CONFIG + _default_config = default_config or COMMON_CONFIG _policy_graph = default_policy def _init(self, config, env_creator): @@ -92,6 +91,6 @@ def _train(self): after_train_result(self, res) return res - trainer_cls.__name__ = name + "Trainer" - trainer_cls.__qualname__ = name + "Trainer" + trainer_cls.__name__ = name + trainer_cls.__qualname__ = name return trainer_cls From c7c3f95edd47df70c2709ec70fbd275da7532fc2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 21:29:23 -0700 Subject: [PATCH 4/9] todo --- doc/source/rllib-concepts.rst | 21 ++++++++++++--------- python/ray/rllib/agents/ppo/ppo.py | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 2d8ee7f74f0ce..05576792ae9d0 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -131,11 +131,11 @@ Let's modify our policy loss to include rewards summed over time. To enable this The ``postprocess_advantages()`` function above uses calls RLlib's ``compute_advantages`` function to compute advantages for each timestep. If you re-run the trainer with this improved policy, you'll find that it quickly achieves the max reward of 200. -You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. This allows placeholders to be dynamically generated in most scenarios. +You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. -**Advanced Example: Proximal Policy Optimization** +**Example 1: Proximal Policy Optimization** -In the above example you saw how to compose a simple policy gradient algorithm with RLlib. In this section, we'll dive into how PPO was built with RLlib and how you can modify it. First, check out the `PPO trainer definition `__: +In the above section you saw how to compose a simple policy gradient algorithm with RLlib. In this example, we'll dive into how PPO was built with RLlib and how you can modify it. First, check out the `PPO trainer definition `__: .. 
code-block:: python @@ -143,19 +143,19 @@ In the above example you saw how to compose a simple policy gradient algorithm w name="PPOTrainer", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, - make_policy_optimizer=make_optimizer, + make_policy_optimizer=choose_policy_optimizer, validate_config=validate_config, after_optimizer_step=update_kl, before_train_step=warn_about_obs_filter, after_train_result=warn_about_bad_reward_scales) -Besides some boilerplate for defining the PPO configuration and some warnings, there are two important arguments to take note of here: ``make_policy_optimizer=make_optimizer``, and ``after_optimizer_step=update_kl``. +Besides some boilerplate for defining the PPO configuration and some warnings, there are two important arguments to take note of here: ``make_policy_optimizer=choose_policy_optimizer``, and ``after_optimizer_step=update_kl``. -The ``make_optimizer`` function chooses which `Policy Optimizer <#policy-optimization>`__ to use for distributed training. You can think of these policy optimizers as coordinating the distributed workflow needed to improve the policy. Depending on the trainer config, PPO can switch between a simple synchronous optimizer (the default), or a multi-GPU optimizer that implements minibatch SGD: +The ``choose_policy_optimizer`` function chooses which `Policy Optimizer <#policy-optimization>`__ to use for distributed training. You can think of these policy optimizers as coordinating the distributed workflow needed to improve the policy. Depending on the trainer config, PPO can switch between a simple synchronous optimizer (the default), or a multi-GPU optimizer that implements minibatch SGD: .. code-block:: python - def make_optimizer(workers, config): + def choose_policy_optimizer(workers, config): if config["simple_optimizer"]: return SyncSamplesOptimizer( workers, @@ -281,13 +281,16 @@ The ``update_kl`` method on the policy is defined in `PPOTFPolicy `__ is defined: +Defining a policy in PyTorch is quite similar to that for TensorFlow (and the process of defining a trainer given a Torch policy is exactly the same). Building on the TF examples above, let's look at how the `A3C torch policy `__ is defined: .. code-block:: python diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index a9dc682262cfb..3f08e75658999 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -63,7 +63,7 @@ # yapf: enable -def make_optimizer(local_evaluator, remote_evaluators, config): +def choose_policy_optimizer(local_evaluator, remote_evaluators, config): if config["simple_optimizer"]: return SyncSamplesOptimizer( local_evaluator, @@ -159,7 +159,7 @@ def validate_config(config): name="PPOTrainer", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, - make_policy_optimizer=make_optimizer, + make_policy_optimizer=choose_policy_optimizer, validate_config=validate_config, after_optimizer_step=update_kl, before_train_step=warn_about_obs_filter, From 10da8e53390f1c86978f9d8bbb288d0b9d8d0a61 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 22:15:09 -0700 Subject: [PATCH 5/9] add imports --- doc/source/rllib-concepts.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 05576792ae9d0..caf02e35913ce 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -177,7 +177,9 @@ Suppose we want to customize PPO to use an asynchronous-gradient optimization st .. 
code-block:: python + from ray.rllib.agents.ppo.ppo_policy import * from ray.rllib.optimizers import AsyncGradientsOptimizer + from ray.rllib.policy.tf_policy_template import build_tf_policy def make_async_optimizer(workers, config): return AsyncGradientsOptimizer(workers, grads_per_step=100) From 8d883368d84772fb90a45bee5b41832821ad3243 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 19 May 2019 16:22:13 -0700 Subject: [PATCH 6/9] note on get ph --- doc/source/rllib-concepts.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index caf02e35913ce..1eaca7d8d309f 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -131,7 +131,7 @@ Let's modify our policy loss to include rewards summed over time. To enable this The ``postprocess_advantages()`` function above uses calls RLlib's ``compute_advantages`` function to compute advantages for each timestep. If you re-run the trainer with this improved policy, you'll find that it quickly achieves the max reward of 200. -You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. +You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. You can also access these placeholders via ``policy.get_placeholder()``. **Example 1: Proximal Policy Optimization** From aa4bc713df1f64c11539c009df7446c953668c4f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 19 May 2019 16:22:13 -0700 Subject: [PATCH 7/9] note on get ph --- doc/source/rllib-concepts.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index caf02e35913ce..842b3f8890ee6 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -131,7 +131,7 @@ Let's modify our policy loss to include rewards summed over time. To enable this The ``postprocess_advantages()`` function above uses calls RLlib's ``compute_advantages`` function to compute advantages for each timestep. If you re-run the trainer with this improved policy, you'll find that it quickly achieves the max reward of 200. -You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. 
RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. +You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. You can also access these placeholders via ``policy.get_placeholder()`` after loss initialization. **Example 1: Proximal Policy Optimization** From dfaf6167ebdfad9d91adcfbf5170916d832cc810 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 25 May 2019 15:30:00 -0700 Subject: [PATCH 8/9] rename to building custom algs --- doc/source/rllib-concepts.rst | 4 ++-- doc/source/rllib.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 842b3f8890ee6..d16ce4e66b6af 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -1,5 +1,5 @@ -RLlib Concepts and Custom Algorithms -==================================== +RLlib Concepts and Building Custom Algorithms +============================================= This page describes the internal concepts used to implement algorithms in RLlib. You might find this useful if modifying or adding new algorithms to RLlib. diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 3f15e90015999..724a3caf83d5d 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -95,8 +95,8 @@ Offline Datasets * `Input API `__ * `Output API `__ -Concepts and Custom Algorithms ------------------------------- +Building Custom Algorithms +-------------------------- * `Policies `__ - `Building Policies in TensorFlow `__ From 5e8fced1f15b797e533ad790a0864ca3663ff726 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 26 May 2019 13:22:43 -0700 Subject: [PATCH 9/9] add rnn state info --- doc/source/rllib-concepts.rst | 61 +++++++++++++++++++++++++++++++++-- doc/source/rllib.rst | 4 +-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index d16ce4e66b6af..06e890832295a 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -8,7 +8,7 @@ Policies Policy classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. -Most interaction with deep learning frameworks is isolated to the `Policy interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policies, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: +Most interaction with deep learning frameworks is isolated to the `Policy interface `__, allowing RLlib to support multiple frameworks. 
To simplify the definition of policies, RLlib includes `Tensorflow <#building-policies-in-tensorflow>`__ and `PyTorch-specific <#building-policies-in-pytorch>`__ templates. You can also write your own from scratch. Here is an example: .. code-block:: python @@ -46,6 +46,63 @@ Most interaction with deep learning frameworks is isolated to the `Policy interf def set_weights(self, weights): self.w = weights["w"] + +The above basic policy, when run, will produce batches of observations with the basic ``obs``, ``new_obs``, ``actions``, ``rewards``, ``dones``, and ``infos`` columns. There are two more mechanisms to pass along and emit extra information: + +**Policy recurrent state**: Suppose you want to compute actions based on the current timestep of the episode. While it is possible to have the environment provide this as part of the observation, we can instead compute and store it as part of the Policy recurrent state: + +.. code-block:: python + + def get_initial_state(self): + """Returns initial RNN state for the current policy.""" + return [0] # list of single state element (t=0) + # you could also return multiple values, e.g., [0, "foo"] + + def compute_actions(self, + obs_batch, + state_batches, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + assert len(state_batches) == len(self.get_initial_state()) + new_state_batches = [[ + t + 1 for t in state_batches[0] + ]] + return ..., new_state_batches, {} + + def learn_on_batch(self, samples): + # can access array of the state elements at each timestep + # or state_in_1, 2, etc. if there are multiple state elements + assert "state_in_0" in samples.keys() + assert "state_out_0" in samples.keys() + + +**Extra action info output**: You can also emit extra outputs at each step which will be available for learning on. For example, you might want to output the behaviour policy logits as extra action info, which can be used for importance weighting, but in general arbitrary values can be stored here (as long as they are convertible to numpy arrays): + +.. code-block:: python + + def compute_actions(self, + obs_batch, + state_batches, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + action_info_batch = { + "some_value": ["foo" for _ in obs_batch], + "other_value": [12345 for _ in obs_batch], + } + return ..., [], action_info_batch + + def learn_on_batch(self, samples): + # can access array of the extra values at each timestep + assert "some_value" in samples.keys() + assert "other_value" in samples.keys() + + Building Policies in TensorFlow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -427,7 +484,7 @@ Trainers Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the rollout workers and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. -Example of three equivalent ways of interacting with the PPO trainer: +Example of three equivalent ways of interacting with the PPO trainer, all of which log results in ``~/ray_results``: .. 
code-block:: python diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 724a3caf83d5d..e77a0ab427f81 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -95,8 +95,8 @@ Offline Datasets * `Input API `__ * `Output API `__ -Building Custom Algorithms --------------------------- +Concepts and Building Custom Algorithms +--------------------------------------- * `Policies `__ - `Building Policies in TensorFlow `__