From 93b86ed97703b8e5716ac7d9c5e68d13d77676ea Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 14:54:12 -0700 Subject: [PATCH 01/13] wip --- doc/source/rllib-algorithms.rst | 2 +- doc/source/rllib-concepts.rst | 20 ++--- doc/source/rllib-env.rst | 18 ++--- doc/source/rllib-models.rst | 26 +++---- doc/source/rllib-offline.rst | 4 +- doc/source/rllib.rst | 4 +- python/ray/rllib/__init__.py | 8 +- python/ray/rllib/agents/a3c/a3c.py | 8 +- .../rllib/agents/a3c/a3c_tf_policy_graph.py | 24 +++--- python/ray/rllib/agents/ddpg/ddpg.py | 4 +- .../rllib/agents/ddpg/ddpg_policy_graph.py | 34 ++++----- python/ray/rllib/agents/dqn/dqn.py | 8 +- .../ray/rllib/agents/dqn/dqn_policy_graph.py | 46 ++++++------ python/ray/rllib/agents/impala/impala.py | 14 ++-- .../agents/impala/vtrace_policy_graph.py | 34 ++++----- python/ray/rllib/agents/marwil/marwil.py | 8 +- .../agents/marwil/marwil_policy_graph.py | 18 ++--- python/ray/rllib/agents/pg/pg.py | 4 +- python/ray/rllib/agents/ppo/appo.py | 6 +- .../ray/rllib/agents/ppo/appo_policy_graph.py | 6 +- python/ray/rllib/agents/ppo/ppo.py | 4 +- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 2 +- python/ray/rllib/agents/qmix/qmix.py | 4 +- .../rllib/agents/qmix/qmix_policy_graph.py | 18 ++--- python/ray/rllib/agents/trainer.py | 32 ++++---- python/ray/rllib/agents/trainer_template.py | 14 ++-- python/ray/rllib/evaluation/__init__.py | 10 +-- .../evaluation/dynamic_tf_policy_graph.py | 26 +++---- python/ray/rllib/evaluation/episode.py | 4 +- python/ray/rllib/evaluation/interface.py | 2 +- .../rllib/evaluation/keras_policy_graph.py | 12 +-- python/ray/rllib/evaluation/metrics.py | 2 +- .../ray/rllib/evaluation/policy_evaluator.py | 74 +++++++++---------- python/ray/rllib/evaluation/policy_graph.py | 18 ++--- .../rllib/evaluation/sample_batch_builder.py | 2 +- python/ray/rllib/evaluation/sampler.py | 8 +- .../ray/rllib/evaluation/tf_policy_graph.py | 32 ++++---- .../rllib/evaluation/tf_policy_template.py | 30 ++++---- .../rllib/evaluation/torch_policy_graph.py | 24 +++--- .../rllib/evaluation/torch_policy_template.py | 32 ++++---- .../rllib/examples/hierarchical_training.py | 2 +- .../ray/rllib/examples/multiagent_cartpole.py | 10 +-- .../examples/multiagent_custom_policy.py | 6 +- .../rllib/examples/multiagent_two_trainers.py | 14 ++-- .../policy_evaluator_custom_workflow.py | 12 +-- python/ray/rllib/models/model.py | 2 +- .../ray/rllib/offline/off_policy_estimator.py | 4 +- python/ray/rllib/optimizers/multi_gpu_impl.py | 2 +- .../rllib/optimizers/multi_gpu_optimizer.py | 8 +- python/ray/rllib/tests/test_evaluators.py | 2 +- python/ray/rllib/tests/test_external_env.py | 14 ++-- .../tests/test_external_multi_agent_env.py | 16 ++-- python/ray/rllib/tests/test_io.py | 6 +- .../ray/rllib/tests/test_multi_agent_env.py | 72 +++++++++--------- python/ray/rllib/tests/test_nested_spaces.py | 4 +- python/ray/rllib/tests/test_optimizers.py | 6 +- python/ray/rllib/tests/test_perf.py | 4 +- .../ray/rllib/tests/test_policy_evaluator.py | 42 +++++------ 58 files changed, 436 insertions(+), 436 deletions(-) diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 2f1a74b2458b2..5a07280e3972e 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -274,7 +274,7 @@ QMIX Monotonic Value Factorisation (QMIX, VDN, IQN) --------------------------------------------------- `[paper] `__ `[implementation] `__ Q-Mix is a specialized multi-agent algorithm. 
Code here is adapted from https://github.com/oxwhirl/pymarl_alpha to integrate with RLlib multi-agent APIs. To use Q-Mix, you must specify an agent `grouping `__ in the environment (see the `two-step game example `__). Currently, all agents in the group must be homogeneous. The algorithm can be scaled by increasing the number of workers or using Ape-X. -Q-Mix is implemented in `PyTorch `__ and is currently *experimental*. +Q-Mix is implemented in `PyTorch `__ and is currently *experimental*. Tuned examples: `Two-step game `__ diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index d91a29f28b9ff..8ddcd06943a4f 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -3,24 +3,24 @@ RLlib Concepts This page describes the internal concepts used to implement algorithms in RLlib. You might find this useful if modifying or adding new algorithms to RLlib. -Policy Graphs +Policies ------------- -Policy graph classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. +Policy graph classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. -Most interaction with deep learning frameworks is isolated to the `PolicyGraph interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policy graphs, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: +Most interaction with deep learning frameworks is isolated to the `Policy interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policies, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: .. code-block:: python - class CustomPolicy(PolicyGraph): - """Example of a custom policy graph written from scratch. + class CustomPolicy(Policy): + """Example of a custom policy written from scratch. - You might find it more convenient to extend TF/TorchPolicyGraph instead + You might find it more convenient to extend TF/TorchPolicy instead for a real policy. """ def __init__(self, observation_space, action_space, config): - PolicyGraph.__init__(self, observation_space, action_space, config) + Policy.__init__(self, observation_space, action_space, config) # example parameter self.w = 1.0 @@ -48,7 +48,7 @@ Most interaction with deep learning frameworks is isolated to the `PolicyGraph i Policy Evaluation ----------------- -Given an environment and policy graph, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithms. +Given an environment and policy, policy evaluation produces `batches `__ of experiences. 
This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithms. You can use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvaluator.as_remote()``). @@ -81,9 +81,9 @@ Here is an example of creating a set of policy evaluation actors and using the t Policy Optimization ------------------- -Similar to how a `gradient-descent optimizer `__ can be used to improve a model, RLlib's `policy optimizers `__ implement different strategies for improving a policy graph. +Similar to how a `gradient-descent optimizer `__ can be used to improve a model, RLlib's `policy optimizers `__ implement different strategies for improving a policy. -For example, in A3C you'd want to compute gradients asynchronously on different workers, and apply them to a central policy graph replica. This strategy is implemented by the `AsyncGradientsOptimizer `__. Another alternative is to gather experiences synchronously in parallel and optimize the model centrally, as in `SyncSamplesOptimizer `__. Policy optimizers abstract these strategies away into reusable modules. +For example, in A3C you'd want to compute gradients asynchronously on different workers, and apply them to a central policy replica. This strategy is implemented by the `AsyncGradientsOptimizer `__. Another alternative is to gather experiences synchronously in parallel and optimize the model centrally, as in `SyncSamplesOptimizer `__. Policy optimizers abstract these strategies away into reusable modules. This is how the example in the previous section looks when written using a policy optimizer: diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 2701a689dc2c1..3d00ac69bcdea 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -167,8 +167,8 @@ If all the agents will be using the same algorithm class to train, then you can trainer = pg.PGAgent(env="my_multiagent_env", config={ "multiagent": { - "policy_graphs": { - # the first tuple value is None -> uses default policy graph + "policies": { + # the first tuple value is None -> uses default policy "car1": (None, car_obs_space, car_act_space, {"gamma": 0.85}), "car2": (None, car_obs_space, car_act_space, {"gamma": 0.99}), "traffic_light": (None, tl_obs_space, tl_act_space, {}), @@ -234,10 +234,10 @@ This can be implemented as a multi-agent environment with three types of agents. .. code-block:: python "multiagent": { - "policy_graphs": { - "top_level": (custom_policy_graph or None, ...), - "mid_level": (custom_policy_graph or None, ...), - "low_level": (custom_policy_graph or None, ...), + "policies": { + "top_level": (custom_policy or None, ...), + "mid_level": (custom_policy or None, ...), + "low_level": (custom_policy or None, ...), }, "policy_mapping_fn": lambda agent_id: @@ -269,9 +269,9 @@ There is a full example of this in the `example training script `__. +2. Updating the critic: the centralized critic loss can be added to the loss of the custom policy, the same as with any other value function. For an example of defining loss inputs, see the `PGPolicy example `__. 
Grouping Agents ~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index b429e04be417a..0978d15729643 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -101,7 +101,7 @@ Custom TF models should subclass the common RLlib `model class `__. RLlib will automatically run the update ops for the batch norm layers during optimization (see `tf_policy_graph.py `__ and `multi_gpu_impl.py `__ for the exact handling of these updates). +You can use ``tf.layers.batch_normalization(x, training=input_dict["is_training"])`` to add batch norm layers to your custom model: `code example `__. RLlib will automatically run the update ops for the batch norm layers during optimization (see `tf_policy.py `__ and `multi_gpu_impl.py `__ for the exact handling of these updates). Custom Models (PyTorch) ----------------------- @@ -263,7 +263,7 @@ You can mix supervised losses into any RLlib algorithm through custom models. Fo **TensorFlow**: To add a supervised loss to a custom TF model, you need to override the ``custom_loss()`` method. This method takes in the existing policy loss for the algorithm, which you can add your own supervised loss to before returning. For debugging, you can also return a dictionary of scalar tensors in the ``custom_metrics()`` method. Here is a `runnable example `__ of adding an imitation loss to CartPole training that is defined over a `offline dataset `__. -**PyTorch**: There is no explicit API for adding losses to custom torch models. However, you can modify the loss in the policy graph definition directly. Like for TF models, offline datasets can be incorporated by creating an input reader and calling ``reader.next()`` in the loss forward pass. +**PyTorch**: There is no explicit API for adding losses to custom torch models. However, you can modify the loss in the policy definition directly. Like for TF models, offline datasets can be incorporated by creating an input reader and calling ``reader.next()`` in the loss forward pass. Variable-length / Parametric Action Spaces @@ -312,15 +312,15 @@ Custom models can be used to work with environments where (1) the set of valid a Depending on your use case it may make sense to use just the masking, just action embeddings, or both. For a runnable example of this in code, check out `parametric_action_cartpole.py `__. Note that since masking introduces ``tf.float32.min`` values into the model output, this technique might not work with all algorithm options. For example, algorithms might crash if they incorrectly process the ``tf.float32.min`` values. The cartpole example has working configurations for DQN (must set ``hiddens=[]``), PPO (must disable running mean and set ``vf_share_layers=True``), and several other algorithms. -Customizing Policy Graphs +Customizing Policies ------------------------- -For deeper customization of algorithms, you can modify the policy graphs of the trainer classes. Here's an example of extending the DDPG policy graph to specify custom sub-network modules: +For deeper customization of algorithms, you can modify the policies of the trainer classes. Here's an example of extending the DDPG policy to specify custom sub-network modules: .. 
code-block:: python from ray.rllib.models import ModelCatalog - from ray.rllib.agents.ddpg.ddpg_policy_graph import DDPGPolicyGraph as BaseDDPGPolicyGraph + from ray.rllib.agents.ddpg.ddpg_policy import DDPGPolicy as BaseDDPGPolicy class CustomPNetwork(object): def __init__(self, dim_actions, hiddens, activation): @@ -336,7 +336,7 @@ For deeper customization of algorithms, you can modify the policy graphs of the self.value = layers.fully_connected( q_out, num_outputs=1, activation_fn=None) - class CustomDDPGPolicyGraph(BaseDDPGPolicyGraph): + class CustomDDPGPolicy(BaseDDPGPolicy): def _build_p_network(self, obs): return CustomPNetwork( self.dim_actions, @@ -349,26 +349,26 @@ For deeper customization of algorithms, you can modify the policy graphs of the self.config["critic_hiddens"], self.config["critic_hidden_activation"]).value -Then, you can create an trainer with your custom policy graph by: +Then, you can create an trainer with your custom policy by: .. code-block:: python from ray.rllib.agents.ddpg.ddpg import DDPGTrainer - from custom_policy_graph import CustomDDPGPolicyGraph + from custom_policy import CustomDDPGPolicy - DDPGTrainer._policy_graph = CustomDDPGPolicyGraph + DDPGTrainer._policy = CustomDDPGPolicy trainer = DDPGTrainer(...) -In this example we overrode existing methods of the existing DDPG policy graph, i.e., `_build_q_network`, `_build_p_network`, `_build_action_network`, `_build_actor_critic_loss`, but you can also replace the entire graph class entirely. +In this example we overrode existing methods of the existing DDPG policy, i.e., `_build_q_network`, `_build_p_network`, `_build_action_network`, `_build_actor_critic_loss`, but you can also replace the entire graph class entirely. Model-Based Rollouts ~~~~~~~~~~~~~~~~~~~~ -With a custom policy graph, you can also perform model-based rollouts and optionally incorporate the results of those rollouts as training data. For example, suppose you wanted to extend PGPolicyGraph for model-based rollouts. This involves overriding the ``compute_actions`` method of that policy graph: +With a custom policy, you can also perform model-based rollouts and optionally incorporate the results of those rollouts as training data. For example, suppose you wanted to extend PGPolicy for model-based rollouts. This involves overriding the ``compute_actions`` method of that policy: .. code-block:: python - class ModelBasedPolicyGraph(PGPolicyGraph): + class ModelBasedPolicy(PGPolicy): def compute_actions(self, obs_batch, state_batches, diff --git a/doc/source/rllib-offline.rst b/doc/source/rllib-offline.rst index 42dd5f5b4909d..8aee4123c6f76 100644 --- a/doc/source/rllib-offline.rst +++ b/doc/source/rllib-offline.rst @@ -65,7 +65,7 @@ This example plot shows the Q-value metric in addition to importance sampling (I .. image:: offline-q.png -**Estimator Python API:** For greater control over the evaluation process, you can create off-policy estimators in your Python code and call ``estimator.estimate(episode_batch)`` to perform counterfactual estimation as needed. The estimators take in a policy graph object and gamma value for the environment: +**Estimator Python API:** For greater control over the evaluation process, you can create off-policy estimators in your Python code and call ``estimator.estimate(episode_batch)`` to perform counterfactual estimation as needed. The estimators take in a policy object and gamma value for the environment: .. code-block:: python @@ -99,7 +99,7 @@ This `runnable example `__. 
This isn't typically critical for off-policy algorithms (e.g., DQN's `post-processing `__ is only needed if ``n_step > 1`` or ``worker_side_prioritization: True``). For off-policy algorithms, you can also safely set the ``postprocess_inputs: True`` config to auto-postprocess data. +RLlib assumes that input batches are of `postprocessed experiences `__. This isn't typically critical for off-policy algorithms (e.g., DQN's `post-processing `__ is only needed if ``n_step > 1`` or ``worker_side_prioritization: True``). For off-policy algorithms, you can also safely set the ``postprocess_inputs: True`` config to auto-postprocess data. However, for on-policy algorithms like PPO, you'll need to pass in the extra values added during policy evaluation and postprocessing to ``batch_builder.add_values()``, e.g., ``logits``, ``vf_preds``, ``value_target``, and ``advantages`` for PPO. This is needed since the calculation of these values depends on the parameters of the *behaviour* policy, which RLlib does not have access to in the offline setting (in online training, these values are automatically added during policy evaluation). diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 06c5800355077..02b1bc3478ee8 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -50,7 +50,7 @@ Models and Preprocessors * `Custom Preprocessors `__ * `Supervised Model Losses `__ * `Variable-length / Parametric Action Spaces `__ -* `Customizing Policy Graphs `__ +* `Customizing Policies `__ Algorithms ---------- @@ -98,7 +98,7 @@ Offline Datasets Concepts -------- -* `Policy Graphs `__ +* `Policies `__ * `Policy Evaluation `__ * `Policy Optimization `__ * `Trainers `__ diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index 613199cf795f8..5577f4cd0355d 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -8,8 +8,8 @@ # This file is imported from the tune module in order to register RLlib agents. 
from ray.tune.registry import register_trainable -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv @@ -43,8 +43,8 @@ def _register_all(): _register_all() __all__ = [ - "PolicyGraph", - "TFPolicyGraph", + "Policy", + "TFPolicy", "PolicyEvaluator", "SampleBatch", "BaseEnv", diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index eb384058de806..ad3544306e103 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -4,7 +4,7 @@ import time -from ray.rllib.agents.a3c.a3c_tf_policy_graph import A3CPolicyGraph +from ray.rllib.agents.a3c.a3c_tf_policy import A3CPolicy from ray.rllib.agents.trainer import Trainer, with_common_config from ray.rllib.optimizers import AsyncGradientsOptimizer from ray.rllib.utils.annotations import override @@ -43,16 +43,16 @@ class A3CTrainer(Trainer): _name = "A3C" _default_config = DEFAULT_CONFIG - _policy_graph = A3CPolicyGraph + _policy = A3CPolicy @override(Trainer) def _init(self, config, env_creator): if config["use_pytorch"]: - from ray.rllib.agents.a3c.a3c_torch_policy_graph import \ + from ray.rllib.agents.a3c.a3c_torch_policy import \ A3CTorchPolicy policy_cls = A3CTorchPolicy else: - policy_cls = self._policy_graph + policy_cls = self._policy if config["entropy_coeff"] < 0: raise DeprecationWarning("entropy_coeff must be >= 0") diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index e6ae8d17bad38..94063dcdf90be 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -1,4 +1,4 @@ -"""Note: Keep in sync with changes to VTracePolicyGraph.""" +"""Note: Keep in sync with changes to VTracePolicy.""" from __future__ import absolute_import from __future__ import division @@ -11,10 +11,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.explained_variance import explained_variance -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ +from ray.rllib.evaluation.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override @@ -47,13 +47,13 @@ def __init__(self, class A3CPostprocessing(object): """Adds the VF preds and advantages fields to the trajectory.""" - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_action_fetches(self): return dict( - TFPolicyGraph.extra_compute_action_fetches(self), + TFPolicy.extra_compute_action_fetches(self), **{SampleBatch.VF_PREDS: self.vf}) - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -73,7 +73,7 @@ def postprocess_trajectory(self, self.config["lambda"]) -class A3CPolicyGraph(LearningRateSchedule, A3CPostprocessing, TFPolicyGraph): +class A3CPolicy(LearningRateSchedule, A3CPostprocessing, TFPolicy): def __init__(self, observation_space, action_space, 
config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config @@ -114,7 +114,7 @@ def __init__(self, observation_space, action_space, config): self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) - # Initialize TFPolicyGraph + # Initialize TFPolicy loss_in = [ (SampleBatch.CUR_OBS, self.observations), (SampleBatch.ACTIONS, actions), @@ -125,7 +125,7 @@ def __init__(self, observation_space, action_space, config): ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) - TFPolicyGraph.__init__( + TFPolicy.__init__( self, observation_space, action_space, @@ -157,18 +157,18 @@ def __init__(self, observation_space, action_space, config): self.sess.run(tf.global_variables_initializer()) - @override(PolicyGraph) + @override(Policy) def get_initial_state(self): return self.model.state_init - @override(TFPolicyGraph) + @override(TFPolicy) def gradients(self, optimizer, loss): grads = tf.gradients(loss, self.var_list) self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) clipped_grads = list(zip(self.grads, self.var_list)) return clipped_grads - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_grad_fetches(self): return self.stats_fetches diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index 7a140beeea242..69672c986b07a 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.ddpg.ddpg_policy_graph import DDPGPolicyGraph +from ray.rllib.agents.ddpg.ddpg_policy import DDPGPolicy from ray.rllib.utils.annotations import override from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule @@ -163,7 +163,7 @@ class DDPGTrainer(DQNTrainer): """DDPG implementation in TensorFlow.""" _name = "DDPG" _default_config = DEFAULT_CONFIG - _policy_graph = DDPGPolicyGraph + _policy = DDPGPolicy @override(DQNTrainer) def _train(self): diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 675f9187f2c60..eaea8c5dc965f 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -7,15 +7,15 @@ import ray import ray.experimental.tf_utils -from ray.rllib.agents.dqn.dqn_policy_graph import ( +from ray.rllib.agents.dqn.dqn_policy import ( _huber_loss, _minimize_and_clip, _scope_vars, _postprocess_dqn) from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models import ModelCatalog from ray.rllib.utils.annotations import override from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.utils import try_import_tf tf = try_import_tf() @@ -35,7 +35,7 @@ class DDPGPostprocessing(object): """Implements n-step learning and param noise adjustments.""" - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -68,7 +68,7 @@ def postprocess_trajectory(self, return _postprocess_dqn(self, sample_batch) -class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): +class DDPGPolicy(DDPGPostprocessing, 
TFPolicy): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): @@ -281,7 +281,7 @@ def __init__(self, observation_space, action_space, config): self.critic_loss = self.twin_q_model.custom_loss( self.critic_loss, input_dict) - TFPolicyGraph.__init__( + TFPolicy.__init__( self, observation_space, action_space, @@ -301,12 +301,12 @@ def __init__(self, observation_space, action_space, config): # Hard initial update self.update_target(tau=1.0) - @override(TFPolicyGraph) + @override(TFPolicy) def optimizer(self): # we don't use this because we have two separate optimisers return None - @override(TFPolicyGraph) + @override(TFPolicy) def build_apply_op(self, optimizer, grads_and_vars): # for policy gradient, update policy net one time v.s. # update critic net `policy_delay` time(s) @@ -327,7 +327,7 @@ def make_apply_op(): with tf.control_dependencies([tf.assign_add(self.global_step, 1)]): return tf.group(actor_op, critic_op) - @override(TFPolicyGraph) + @override(TFPolicy) def gradients(self, optimizer, loss): if self.config["grad_norm_clipping"] is not None: actor_grads_and_vars = _minimize_and_clip( @@ -360,7 +360,7 @@ def gradients(self, optimizer, loss): + self._critic_grads_and_vars return grads_and_vars - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_action_feed_dict(self): return { # FIXME: what about turning off exploration? Isn't that a good @@ -370,31 +370,31 @@ def extra_compute_action_feed_dict(self): self.pure_exploration_phase: self.cur_pure_exploration_phase, } - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_grad_fetches(self): return { "td_error": self.td_error, LEARNER_STATS_KEY: self.stats, } - @override(TFPolicyGraph) + @override(TFPolicy) def get_weights(self): return self.variables.get_weights() - @override(TFPolicyGraph) + @override(TFPolicy) def set_weights(self, weights): self.variables.set_weights(weights) - @override(PolicyGraph) + @override(Policy) def get_state(self): return [ - TFPolicyGraph.get_state(self), self.cur_noise_scale, + TFPolicy.get_state(self), self.cur_noise_scale, self.cur_pure_exploration_phase ] - @override(PolicyGraph) + @override(Policy) def set_state(self, state): - TFPolicyGraph.set_state(self, state[0]) + TFPolicy.set_state(self, state[0]) self.set_epsilon(state[1]) self.set_pure_exploration_phase(state[2]) diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index d8fb480cbda65..9a9cc2821b5dc 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -8,7 +8,7 @@ from ray import tune from ray.rllib import optimizers from ray.rllib.agents.trainer import Trainer, with_common_config -from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph +from ray.rllib.agents.dqn.dqn_policy import DQNPolicy from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.annotations import override @@ -133,7 +133,7 @@ class DQNTrainer(Trainer): _name = "DQN" _default_config = DEFAULT_CONFIG - _policy_graph = DQNPolicyGraph + _policy = DQNPolicy _optimizer_shared_configs = OPTIMIZER_SHARED_CONFIGS @override(Trainer) @@ -197,10 +197,10 @@ def on_episode_end(info): on_episode_end) self.local_evaluator = self.make_local_evaluator( - env_creator, self._policy_graph) + env_creator, self._policy) def create_remote_evaluators(): - return 
self.make_remote_evaluators(env_creator, self._policy_graph, + return self.make_remote_evaluators(env_creator, self._policy, config["num_workers"]) if config["optimizer_class"] != "AsyncReplayOptimizer": diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index 1e682ce80cfac..dc4bcb41a04e2 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -12,8 +12,8 @@ from ray.rllib.models import ModelCatalog, Categorical from ray.rllib.utils.annotations import override from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.utils import try_import_tf @@ -105,14 +105,14 @@ def __init__(self, class DQNPostprocessing(object): """Implements n-step learning and param noise adjustments.""" - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_action_fetches(self): return dict( - TFPolicyGraph.extra_compute_action_fetches(self), **{ + TFPolicy.extra_compute_action_fetches(self), **{ "q_values": self.q_values, }) - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -345,7 +345,7 @@ def __init__(self, q_values, observations, num_actions, stochastic, eps, self.action_prob = None -class DQNPolicyGraph(LearningRateSchedule, DQNPostprocessing, TFPolicyGraph): +class DQNPolicy(LearningRateSchedule, DQNPostprocessing, TFPolicy): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) if not isinstance(action_space, Discrete): @@ -446,7 +446,7 @@ def __init__(self, observation_space, action_space, config): update_target_expr.append(var_target.assign(var)) self.update_target_expr = tf.group(*update_target_expr) - # initialize TFPolicyGraph + # initialize TFPolicy self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), @@ -459,7 +459,7 @@ def __init__(self, observation_space, action_space, config): LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) - TFPolicyGraph.__init__( + TFPolicy.__init__( self, observation_space, action_space, @@ -477,12 +477,12 @@ def __init__(self, observation_space, action_space, config): "cur_lr": tf.cast(self.cur_lr, tf.float64), }, **self.loss.stats) - @override(TFPolicyGraph) + @override(TFPolicy) def optimizer(self): return tf.train.AdamOptimizer( learning_rate=self.cur_lr, epsilon=self.config["adam_epsilon"]) - @override(TFPolicyGraph) + @override(TFPolicy) def gradients(self, optimizer, loss): if self.config["grad_norm_clipping"] is not None: grads_and_vars = _minimize_and_clip( @@ -496,27 +496,27 @@ def gradients(self, optimizer, loss): grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None] return grads_and_vars - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_action_feed_dict(self): return { self.stochastic: True, self.eps: self.cur_epsilon, } - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_grad_fetches(self): return { "td_error": self.loss.td_error, LEARNER_STATS_KEY: self.stats_fetches, } - @override(PolicyGraph) + @override(Policy) def get_state(self): - return [TFPolicyGraph.get_state(self), self.cur_epsilon] + return 
[TFPolicy.get_state(self), self.cur_epsilon] - @override(PolicyGraph) + @override(Policy) def set_state(self, state): - TFPolicyGraph.set_state(self, state[0]) + TFPolicy.set_state(self, state[0]) self.set_epsilon(state[1]) def _build_parameter_noise(self, pnet_params): @@ -633,11 +633,11 @@ def _adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): rewards[i] += gamma**j * rewards[i + j] -def _postprocess_dqn(policy_graph, batch): +def _postprocess_dqn(policy, batch): # N-step Q adjustments - if policy_graph.config["n_step"] > 1: - _adjust_nstep(policy_graph.config["n_step"], - policy_graph.config["gamma"], batch[SampleBatch.CUR_OBS], + if policy.config["n_step"] > 1: + _adjust_nstep(policy.config["n_step"], + policy.config["gamma"], batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS], batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS], batch[SampleBatch.DONES]) @@ -645,13 +645,13 @@ def _postprocess_dqn(policy_graph, batch): batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS]) # Prioritize on the worker side - if batch.count > 0 and policy_graph.config["worker_side_prioritization"]: - td_errors = policy_graph.compute_td_error( + if batch.count > 0 and policy.config["worker_side_prioritization"]: + td_errors = policy.compute_td_error( batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS], batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS], batch[SampleBatch.DONES], batch[PRIO_WEIGHTS]) new_priorities = ( - np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"]) + np.abs(td_errors) + policy.config["prioritized_replay_eps"]) batch.data[PRIO_WEIGHTS] = new_priorities return batch diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py index ffe74c087a3ea..77b47c4dedf9c 100644 --- a/python/ray/rllib/agents/impala/impala.py +++ b/python/ray/rllib/agents/impala/impala.py @@ -4,8 +4,8 @@ import time -from ray.rllib.agents.a3c.a3c_tf_policy_graph import A3CPolicyGraph -from ray.rllib.agents.impala.vtrace_policy_graph import VTracePolicyGraph +from ray.rllib.agents.a3c.a3c_tf_policy import A3CPolicy +from ray.rllib.agents.impala.vtrace_policy import VTracePolicy from ray.rllib.agents.trainer import Trainer, with_common_config from ray.rllib.optimizers import AsyncSamplesOptimizer from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator @@ -105,14 +105,14 @@ class ImpalaTrainer(Trainer): _name = "IMPALA" _default_config = DEFAULT_CONFIG - _policy_graph = VTracePolicyGraph + _policy = VTracePolicy @override(Trainer) def _init(self, config, env_creator): for k in OPTIMIZER_SHARED_CONFIGS: if k not in config["optimizer"]: config["optimizer"][k] = config[k] - policy_cls = self._get_policy_graph() + policy_cls = self._get_policy() self.local_evaluator = self.make_local_evaluator( self.env_creator, policy_cls) @@ -158,9 +158,9 @@ def _train(self): prev_steps) return result - def _get_policy_graph(self): + def _get_policy(self): if self.config["vtrace"]: - policy_cls = self._policy_graph + policy_cls = self._policy else: - policy_cls = A3CPolicyGraph + policy_cls = A3CPolicy return policy_cls diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py index 56b6de42ed5a3..ed44545fb576d 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py +++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py @@ -1,6 +1,6 @@ -"""Adapted from A3CPolicyGraph to add V-trace. +"""Adapted from A3CPolicy to add V-trace. 
-Keep in sync with changes to A3CPolicyGraph and VtraceSurrogatePolicyGraph.""" +Keep in sync with changes to A3CPolicy and VtraceSurrogatePolicy.""" from __future__ import absolute_import from __future__ import division @@ -11,9 +11,9 @@ import numpy as np from ray.rllib.agents.impala import vtrace from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ +from ray.rllib.evaluation.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.models.action_dist import MultiCategorical from ray.rllib.models.catalog import ModelCatalog @@ -110,13 +110,13 @@ def __init__(self, class VTracePostprocessing(object): """Adds the policy logits to the trajectory.""" - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_action_fetches(self): return dict( - TFPolicyGraph.extra_compute_action_fetches(self), + TFPolicy.extra_compute_action_fetches(self), **{BEHAVIOUR_LOGITS: self.model.outputs}) - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -126,8 +126,8 @@ def postprocess_trajectory(self, return sample_batch -class VTracePolicyGraph(LearningRateSchedule, VTracePostprocessing, - TFPolicyGraph): +class VTracePolicy(LearningRateSchedule, VTracePostprocessing, + TFPolicy): def __init__(self, observation_space, action_space, @@ -285,7 +285,7 @@ def make_time_major(tensor, drop_last=False): "max_KL": tf.reduce_max(kls[0]), } - # Initialize TFPolicyGraph + # Initialize TFPolicy loss_in = [ (SampleBatch.ACTIONS, actions), (SampleBatch.DONES, dones), @@ -297,7 +297,7 @@ def make_time_major(tensor, drop_last=False): ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) - TFPolicyGraph.__init__( + TFPolicy.__init__( self, observation_space, action_space, @@ -332,15 +332,15 @@ def make_time_major(tensor, drop_last=False): }, **self.KL_stats), } - @override(TFPolicyGraph) + @override(TFPolicy) def copy(self, existing_inputs): - return VTracePolicyGraph( + return VTracePolicy( self.observation_space, self.action_space, self.config, existing_inputs=existing_inputs) - @override(TFPolicyGraph) + @override(TFPolicy) def optimizer(self): if self.config["opt_type"] == "adam": return tf.train.AdamOptimizer(self.cur_lr) @@ -349,17 +349,17 @@ def optimizer(self): self.config["momentum"], self.config["epsilon"]) - @override(TFPolicyGraph) + @override(TFPolicy) def gradients(self, optimizer, loss): grads = tf.gradients(loss, self.var_list) self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) clipped_grads = list(zip(self.grads, self.var_list)) return clipped_grads - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_grad_fetches(self): return self.stats_fetches - @override(PolicyGraph) + @override(Policy) def get_initial_state(self): return self.model.state_init diff --git a/python/ray/rllib/agents/marwil/marwil.py b/python/ray/rllib/agents/marwil/marwil.py index b1e535b645307..d6c6eadeaa9cb 100644 --- a/python/ray/rllib/agents/marwil/marwil.py +++ b/python/ray/rllib/agents/marwil/marwil.py @@ -3,7 +3,7 @@ from __future__ import print_function from ray.rllib.agents.trainer import Trainer, with_common_config -from ray.rllib.agents.marwil.marwil_policy_graph import MARWILPolicyGraph +from ray.rllib.agents.marwil.marwil_policy import MARWILPolicy 
from ray.rllib.optimizers import SyncBatchReplayOptimizer from ray.rllib.utils.annotations import override @@ -44,14 +44,14 @@ class MARWILTrainer(Trainer): _name = "MARWIL" _default_config = DEFAULT_CONFIG - _policy_graph = MARWILPolicyGraph + _policy = MARWILPolicy @override(Trainer) def _init(self, config, env_creator): self.local_evaluator = self.make_local_evaluator( - env_creator, self._policy_graph) + env_creator, self._policy) self.remote_evaluators = self.make_remote_evaluators( - env_creator, self._policy_graph, config["num_workers"]) + env_creator, self._policy, config["num_workers"]) self.optimizer = SyncBatchReplayOptimizer( self.local_evaluator, self.remote_evaluators, diff --git a/python/ray/rllib/agents/marwil/marwil_policy_graph.py b/python/ray/rllib/agents/marwil/marwil_policy_graph.py index 2c647db9aa969..a6b38e4f8667a 100644 --- a/python/ray/rllib/agents/marwil/marwil_policy_graph.py +++ b/python/ray/rllib/agents/marwil/marwil_policy_graph.py @@ -9,9 +9,9 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.utils.annotations import override -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph -from ray.rllib.agents.dqn.dqn_policy_graph import _scope_vars +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.agents.dqn.dqn_policy import _scope_vars from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -59,7 +59,7 @@ def __init__(self, state_values, cumulative_rewards, logits, actions, class MARWILPostprocessing(object): """Adds the advantages field to the trajectory.""" - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -79,7 +79,7 @@ def postprocess_trajectory(self, return batch -class MARWILPolicyGraph(MARWILPostprocessing, TFPolicyGraph): +class MARWILPolicy(MARWILPostprocessing, TFPolicy): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config @@ -127,14 +127,14 @@ def __init__(self, observation_space, action_space, config): self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) - # initialize TFPolicyGraph + # initialize TFPolicy self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (Postprocessing.ADVANTAGES, self.cum_rew_t), ] - TFPolicyGraph.__init__( + TFPolicy.__init__( self, observation_space, action_space, @@ -166,10 +166,10 @@ def _build_policy_loss(self, state_values, cum_rwds, logits, actions, return ReweightedImitationLoss(state_values, cum_rwds, logits, actions, action_space, self.config["beta"]) - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_grad_fetches(self): return {LEARNER_STATS_KEY: self.stats_fetches} - @override(PolicyGraph) + @override(Policy) def get_initial_state(self): return self.model.state_init diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index ffbb899d1b9e5..71e2ab3fbd69c 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy +from 
ray.rllib.agents.pg.pg_policy import PGTFPolicy # yapf: disable # __sphinx_doc_begin__ @@ -22,7 +22,7 @@ def get_policy_class(config): if config["use_pytorch"]: - from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy + from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy return PGTorchPolicy else: return PGTFPolicy diff --git a/python/ray/rllib/agents/ppo/appo.py b/python/ray/rllib/agents/ppo/appo.py index b32531dd7d5ce..0438b27142213 100644 --- a/python/ray/rllib/agents/ppo/appo.py +++ b/python/ray/rllib/agents/ppo/appo.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOTFPolicy +from ray.rllib.agents.ppo.appo_policy import AsyncPPOTFPolicy from ray.rllib.agents.trainer import with_base_config from ray.rllib.agents import impala from ray.rllib.utils.annotations import override @@ -57,8 +57,8 @@ class APPOTrainer(impala.ImpalaTrainer): _name = "APPO" _default_config = DEFAULT_CONFIG - _policy_graph = AsyncPPOTFPolicy + _policy = AsyncPPOTFPolicy @override(impala.ImpalaTrainer) - def _get_policy_graph(self): + def _get_policy(self): return AsyncPPOTFPolicy diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index 5aa76913194fa..a5e5f58245683 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -1,6 +1,6 @@ -"""Adapted from VTracePolicyGraph to use the PPO surrogate loss. +"""Adapted from VTracePolicy to use the PPO surrogate loss. -Keep in sync with changes to VTracePolicyGraph.""" +Keep in sync with changes to VTracePolicy.""" from __future__ import absolute_import from __future__ import division @@ -15,7 +15,7 @@ from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_template import build_tf_policy -from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule +from ray.rllib.evaluation.tf_policy import LearningRateSchedule from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index d3f5abdaa95c0..6c9f6744179d2 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -5,7 +5,7 @@ import logging from ray.rllib.agents import with_common_config -from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy +from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer @@ -143,7 +143,7 @@ def validate_config(config): raise ValueError( "Episode truncation is not supported without a value " "function. 
Consider setting batch_mode=complete_episodes.") - if (config["multiagent"]["policy_graphs"] + if (config["multiagent"]["policies"] and not config["simple_optimizer"]): logger.info( "In multi-agent mode, policies will be optimized sequentially " diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 334ca788c9361..42e749c83f204 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule +from ray.rllib.evaluation.tf_policy import LearningRateSchedule from ray.rllib.evaluation.tf_policy_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance diff --git a/python/ray/rllib/agents/qmix/qmix.py b/python/ray/rllib/agents/qmix/qmix.py index 420a567d8eff4..885094f75c203 100644 --- a/python/ray/rllib/agents/qmix/qmix.py +++ b/python/ray/rllib/agents/qmix/qmix.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.qmix.qmix_policy_graph import QMixPolicyGraph +from ray.rllib.agents.qmix.qmix_policy import QMixPolicy # yapf: disable # __sphinx_doc_begin__ @@ -95,7 +95,7 @@ class QMixTrainer(DQNTrainer): _name = "QMIX" _default_config = DEFAULT_CONFIG - _policy_graph = QMixPolicyGraph + _policy = QMixPolicy _optimizer_shared_configs = [ "learning_starts", "buffer_size", "train_batch_size" ] diff --git a/python/ray/rllib/agents/qmix/qmix_policy_graph.py b/python/ray/rllib/agents/qmix/qmix_policy_graph.py index b7c9a7ad81208..334e1dddb0a88 100644 --- a/python/ray/rllib/agents/qmix/qmix_policy_graph.py +++ b/python/ray/rllib/agents/qmix/qmix_policy_graph.py @@ -14,7 +14,7 @@ from ray.rllib.agents.qmix.mixers import VDNMixer, QMixer from ray.rllib.agents.qmix.model import RNNModel, _get_size from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.models.action_dist import TupleActions from ray.rllib.models.catalog import ModelCatalog @@ -130,7 +130,7 @@ def forward(self, rewards, actions, terminated, mask, obs, next_obs, return loss, mask, masked_td_error, chosen_action_qvals, targets -class QMixPolicyGraph(PolicyGraph): +class QMixPolicy(Policy): """QMix impl. Assumes homogeneous agents for now. 
You must use MultiAgentEnv.with_agent_groups() to group agents @@ -213,7 +213,7 @@ def __init__(self, obs_space, action_space, config): alpha=config["optim_alpha"], eps=config["optim_eps"]) - @override(PolicyGraph) + @override(Policy) def compute_actions(self, obs_batch, state_batches=None, @@ -243,7 +243,7 @@ def compute_actions(self, return TupleActions(list(actions.transpose([1, 0]))), hiddens, {} - @override(PolicyGraph) + @override(Policy) def learn_on_batch(self, samples): obs_batch, action_mask = self._unpack_observation( samples[SampleBatch.CUR_OBS]) @@ -314,22 +314,22 @@ def to_batches(arr): } return {LEARNER_STATS_KEY: stats} - @override(PolicyGraph) + @override(Policy) def get_initial_state(self): return [ s.expand([self.n_agents, -1]).numpy() for s in self.model.state_init() ] - @override(PolicyGraph) + @override(Policy) def get_weights(self): return {"model": self.model.state_dict()} - @override(PolicyGraph) + @override(Policy) def set_weights(self, weights): self.model.load_state_dict(weights["model"]) - @override(PolicyGraph) + @override(Policy) def get_state(self): return { "model": self.model.state_dict(), @@ -340,7 +340,7 @@ def get_state(self): "cur_epsilon": self.cur_epsilon, } - @override(PolicyGraph) + @override(Policy) def set_state(self, state): self.model.load_state_dict(state["model"]) self.target_model.load_state_dict(state["target_model"]) diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index 8e6db02707d80..e48e2166cebf5 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -220,9 +220,9 @@ # === Multiagent === "multiagent": { - # Map from policy ids to tuples of (policy_graph_cls, obs_space, + # Map from policy ids to tuples of (policy_cls, obs_space, # act_space, config). See policy_evaluator.py for more info. - "policy_graphs": {}, + "policies": {}, # Function mapping agent ids to policy ids. "policy_mapping_fn": None, # Optional whitelist of policies to train, or None for all policies. @@ -436,7 +436,7 @@ def get_scope(): # Make local evaluation evaluators self.evaluation_ev = self.make_local_evaluator( self.env_creator, - self._policy_graph, + self._policy, extra_config=extra_config) self.evaluation_metrics = self._evaluate() @@ -578,10 +578,10 @@ def _default_config(self): @PublicAPI def get_policy(self, policy_id=DEFAULT_POLICY_ID): - """Return policy graph for the specified id, or None. + """Return policy for the specified id, or None. Arguments: - policy_id (str): id of policy graph to return. + policy_id (str): id of policy to return. 
""" return self.local_evaluator.get_policy(policy_id) @@ -608,14 +608,14 @@ def set_weights(self, weights): @DeveloperAPI def make_local_evaluator(self, env_creator, - policy_graph, + policy, extra_config=None): """Convenience method to return configured local evaluator.""" return self._make_evaluator( PolicyEvaluator, env_creator, - policy_graph, + policy, 0, merge_dicts( # important: allow local tf to use more CPUs for optimization @@ -627,7 +627,7 @@ def make_local_evaluator(self, extra_config or {})) @DeveloperAPI - def make_remote_evaluators(self, env_creator, policy_graph, count): + def make_remote_evaluators(self, env_creator, policy, count): """Convenience method to return a number of remote evaluators.""" remote_args = { @@ -639,7 +639,7 @@ def make_remote_evaluators(self, env_creator, policy_graph, count): cls = PolicyEvaluator.as_remote(**remote_args).remote return [ - self._make_evaluator(cls, env_creator, policy_graph, i + 1, + self._make_evaluator(cls, env_creator, policy, i + 1, self.config) for i in range(count) ] @@ -760,7 +760,7 @@ def _has_policy_optimizer(self): return hasattr(self, "optimizer") and isinstance( self.optimizer, PolicyOptimizer) - def _make_evaluator(self, cls, env_creator, policy_graph, worker_index, + def _make_evaluator(self, cls, env_creator, policy, worker_index, config): def session_creator(): logger.debug("Creating TF session {}".format( @@ -803,18 +803,18 @@ def session_creator(): else: input_evaluation = config["input_evaluation"] - # Fill in the default policy graph if 'None' is specified in multiagent - if self.config["multiagent"]["policy_graphs"]: - tmp = self.config["multiagent"]["policy_graphs"] + # Fill in the default policy if 'None' is specified in multiagent + if self.config["multiagent"]["policies"]: + tmp = self.config["multiagent"]["policies"] _validate_multiagent_config(tmp, allow_none_graph=True) for k, v in tmp.items(): if v[0] is None: - tmp[k] = (policy_graph, v[1], v[2], v[3]) - policy_graph = tmp + tmp[k] = (policy, v[1], v[2], v[3]) + policy = tmp return cls( env_creator, - policy_graph, + policy, policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"], policies_to_train=self.config["multiagent"]["policies_to_train"], tf_session_creator=(session_creator diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 618bc3b30ace1..314202a1e8428 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -21,7 +21,7 @@ def build_trainer(name, Arguments: name (str): name of the trainer (e.g., "PPO") - default_policy (cls): the default PolicyGraph class to use + default_policy (cls): the default Policy class to use default_config (dict): the default config dict of the algorithm, otherwises uses the Trainer default config make_policy_optimizer (func): optional function that returns a @@ -30,7 +30,7 @@ def build_trainer(name, validate_config (func): optional callback that checks a given config for correctness. It may mutate the config as needed. get_policy_class (func): optional callback that takes a config and - returns the policy graph class to override the default with + returns the policy class to override the default with before_train_step (func): optional callback to run before each train() call. It takes the trainer instance as an argument. 
after_optimizer_step (func): optional callback to run after each @@ -51,19 +51,19 @@ def build_trainer(name, class trainer_cls(Trainer): _name = name _default_config = default_config or Trainer.COMMON_CONFIG - _policy_graph = default_policy + _policy = default_policy def _init(self, config, env_creator): if validate_config: validate_config(config) if get_policy_class is None: - policy_graph = default_policy + policy = default_policy else: - policy_graph = get_policy_class(config) + policy = get_policy_class(config) self.local_evaluator = self.make_local_evaluator( - env_creator, policy_graph) + env_creator, policy) self.remote_evaluators = self.make_remote_evaluators( - env_creator, policy_graph, config["num_workers"]) + env_creator, policy, config["num_workers"]) if make_policy_optimizer: self.optimizer = make_policy_optimizer( self.local_evaluator, self.remote_evaluators, config) diff --git a/python/ray/rllib/evaluation/__init__.py b/python/ray/rllib/evaluation/__init__.py index 7e56bb7479a04..a08a656f86279 100644 --- a/python/ray/rllib/evaluation/__init__.py +++ b/python/ray/rllib/evaluation/__init__.py @@ -1,9 +1,9 @@ from ray.rllib.evaluation.episode import MultiAgentEpisode from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.interface import EvaluatorInterface -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph -from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.evaluation.torch_policy import TorchPolicy from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.evaluation.sample_batch_builder import ( SampleBatchBuilder, MultiAgentSampleBatchBuilder) @@ -12,8 +12,8 @@ from ray.rllib.evaluation.metrics import collect_metrics __all__ = [ - "EvaluatorInterface", "PolicyEvaluator", "PolicyGraph", "TFPolicyGraph", - "TorchPolicyGraph", "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", + "EvaluatorInterface", "PolicyEvaluator", "Policy", "TFPolicy", + "TorchPolicy", "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", "MultiAgentSampleBatchBuilder", "SyncSampler", "AsyncSampler", "compute_advantages", "collect_metrics", "MultiAgentEpisode" ] diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 73e08fcf90930..a82e751825a8f 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -6,9 +6,9 @@ import logging import numpy as np -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf @@ -20,8 +20,8 @@ logger = logging.getLogger(__name__) -class DynamicTFPolicyGraph(TFPolicyGraph): - """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. +class DynamicTFPolicy(TFPolicy): + """A TFPolicy that auto-defines placeholders dynamically at runtime. Initialization of this class occurs in two phases. * Phase 1: the model is created and model variables are initialized. 
@@ -42,7 +42,7 @@ def __init__(self, make_action_sampler=None, existing_inputs=None, get_batch_divisibility_req=None): - """Initialize a dynamic TF policy graph. + """Initialize a dynamic TF policy. Arguments: observation_space (gym.Space): Observation space of the policy. @@ -51,16 +51,16 @@ def __init__(self, loss_fn (func): function that returns a loss tensor the policy graph, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of - TF fetches given the policy graph and batch input tensors + TF fetches given the policy and batch input tensors grad_stats_fn (func): optional function that returns a dict of - TF fetches given the policy graph and loss gradient tensors + TF fetches given the policy and loss gradient tensors before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ make_action_sampler (func): optional function that returns a tuple of action and action prob tensors. The function takes (policy, input_dict, obs_space, action_space, config) as its arguments - existing_inputs (OrderedDict): when copying a policy graph, this + existing_inputs (OrderedDict): when copying a policy, this specifies an existing dict of placeholders to use instead of defining new ones get_batch_divisibility_req (func): optional function that returns @@ -134,7 +134,7 @@ def __init__(self, batch_divisibility_req = get_batch_divisibility_req(self) else: batch_divisibility_req = 1 - TFPolicyGraph.__init__( + TFPolicy.__init__( self, obs_space, action_space, @@ -158,7 +158,7 @@ def __init__(self, if not existing_inputs: self._initialize_loss() - @override(TFPolicyGraph) + @override(TFPolicy) def copy(self, existing_inputs): """Creates a copy of self using existing input placeholders.""" @@ -194,7 +194,7 @@ def copy(self, existing_inputs): if instance._stats_fn: instance._stats_fetches.update( instance._stats_fn(instance, input_dict)) - TFPolicyGraph._initialize_loss( + TFPolicy._initialize_loss( instance, loss, [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) if instance._grad_stats_fn: @@ -202,7 +202,7 @@ def copy(self, existing_inputs): instance._grad_stats_fn(instance, instance._grads)) return instance - @override(PolicyGraph) + @override(Policy) def get_initial_state(self): if self.model: return self.model.state_init @@ -269,7 +269,7 @@ def fake_array(tensor): self._stats_fetches.update(self._stats_fn(self, batch_tensors)) for k in sorted(batch_tensors.accessed_keys): loss_inputs.append((k, batch_tensors[k])) - TFPolicyGraph._initialize_loss(self, loss, loss_inputs) + TFPolicy._initialize_loss(self, loss, loss_inputs) if self._grad_stats_fn: self._stats_fetches.update(self._grad_stats_fn(self, self._grads)) self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/evaluation/episode.py b/python/ray/rllib/evaluation/episode.py index b7afa222b149f..8d7641b9c3131 100644 --- a/python/ray/rllib/evaluation/episode.py +++ b/python/ray/rllib/evaluation/episode.py @@ -27,7 +27,7 @@ class MultiAgentEpisode(object): user_data (dict): Dict that you can use for temporary storage. Use case 1: Model-based rollouts in multi-agent: - A custom compute_actions() function in a policy graph can inspect the + A custom compute_actions() function in a policy can inspect the current episode state and perform a number of rollouts based on the policies and state of other agents in the environment. 
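A small illustration of the `user_data` scratch dict described above, written against the `postprocess_trajectory(sample_batch, other_agent_batches, episode)` signature that appears later in this patch; the `"episode_return"` key is made up for the example:

.. code-block:: python

    # Sketch: accumulate a per-episode value in episode.user_data while
    # postprocessing each trajectory fragment for this policy.
    def postprocess_trajectory(self, sample_batch, other_agent_batches=None,
                               episode=None):
        if episode is not None:
            total = episode.user_data.get("episode_return", 0.0)
            episode.user_data["episode_return"] = total + float(
                sample_batch["rewards"].sum())
        return sample_batch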
@@ -80,7 +80,7 @@ def soft_reset(self): @DeveloperAPI def policy_for(self, agent_id=_DUMMY_AGENT_ID): - """Returns the policy graph for the specified agent. + """Returns the policy for the specified agent. If the agent is new, the policy mapping fn will be called to bind the agent to a policy for the duration of the episode. diff --git a/python/ray/rllib/evaluation/interface.py b/python/ray/rllib/evaluation/interface.py index eb705a99b5305..6bc626da11753 100644 --- a/python/ray/rllib/evaluation/interface.py +++ b/python/ray/rllib/evaluation/interface.py @@ -62,7 +62,7 @@ def compute_gradients(self, samples): Returns: (grads, info): A list of gradients that can be applied on a compatible evaluator. In the multi-agent case, returns a dict - of gradients keyed by policy graph ids. An info dictionary of + of gradients keyed by policy ids. An info dictionary of extra metadata is also returned. Examples: diff --git a/python/ray/rllib/evaluation/keras_policy_graph.py b/python/ray/rllib/evaluation/keras_policy_graph.py index 88d8e0a9be325..e10f6ec1d178f 100644 --- a/python/ray/rllib/evaluation/keras_policy_graph.py +++ b/python/ray/rllib/evaluation/keras_policy_graph.py @@ -4,19 +4,19 @@ import numpy as np -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy def _sample(probs): return [np.random.choice(len(pr), p=pr) for pr in probs] -class KerasPolicyGraph(PolicyGraph): - """Initialize the Keras Policy Graph. +class KerasPolicy(Policy): + """Initialize the Keras Policy. - This is a Policy Graph used for models with actor and critics. + This is a Policy used for models with actor and critics. Note: This class is built for specific usage of Actor-Critic models, - and is less general compared to TFPolicyGraph and TorchPolicyGraphs. + and is less general compared to TFPolicy and TorchPolicies. Args: observation_space (gym.Space): Observation space of the policy. @@ -32,7 +32,7 @@ def __init__(self, config, actor=None, critic=None): - PolicyGraph.__init__(self, observation_space, action_space, config) + Policy.__init__(self, observation_space, action_space, config) self.actor = actor self.critic = critic self.models = [self.actor, self.critic] diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py index a92c64bc9e4b5..99de4dd3b2e47 100644 --- a/python/ray/rllib/evaluation/metrics.py +++ b/python/ray/rllib/evaluation/metrics.py @@ -21,7 +21,7 @@ @DeveloperAPI def get_learner_stats(grad_info): - """Return optimization stats reported from the policy graph. + """Return optimization stats reported from the policy. 
Example: >>> grad_info = evaluator.learn_on_batch(samples) diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 48e19dfcb96e1..01087afc66cb0 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -18,8 +18,8 @@ from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \ DEFAULT_POLICY_ID from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator @@ -52,9 +52,9 @@ def get_global_evaluator(): @DeveloperAPI class PolicyEvaluator(EvaluatorInterface): - """Common ``PolicyEvaluator`` implementation that wraps a ``PolicyGraph``. + """Common ``PolicyEvaluator`` implementation that wraps a ``Policy``. - This class wraps a policy graph instance and an environment class to + This class wraps a policy instance and an environment class to collect experiences from the environment. You can create many replicas of this class as Ray actors to scale RL training. @@ -65,7 +65,7 @@ class PolicyEvaluator(EvaluatorInterface): >>> # Create a policy evaluator and using it to collect experiences. >>> evaluator = PolicyEvaluator( ... env_creator=lambda _: gym.make("CartPole-v0"), - ... policy_graph=PGTFPolicy) + ... policy=PGTFPolicy) >>> print(evaluator.sample()) SampleBatch({ "obs": [[...]], "actions": [[...]], "rewards": [[...]], @@ -76,7 +76,7 @@ class PolicyEvaluator(EvaluatorInterface): ... evaluator_cls=PolicyEvaluator, ... evaluator_args={ ... "env_creator": lambda _: gym.make("CartPole-v0"), - ... "policy_graph": PGTFPolicy, + ... "policy": PGTFPolicy, ... }, ... num_workers=10) >>> for _ in range(10): optimizer.step() @@ -84,7 +84,7 @@ class PolicyEvaluator(EvaluatorInterface): >>> # Creating a multi-agent policy evaluator >>> evaluator = PolicyEvaluator( ... env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25), - ... policy_graphs={ + ... policies={ ... # Use an ensemble of two policies for car agents ... "car_policy1": ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}), @@ -113,7 +113,7 @@ def as_remote(cls, num_cpus=None, num_gpus=None, resources=None): @DeveloperAPI def __init__(self, env_creator, - policy_graph, + policy, policy_mapping_fn=None, policies_to_train=None, tf_session_creator=None, @@ -147,9 +147,9 @@ def __init__(self, Arguments: env_creator (func): Function that returns a gym.Env given an EnvContext wrapped configuration. - policy_graph (class|dict): Either a class implementing - PolicyGraph, or a dictionary of policy id strings to - (PolicyGraph, obs_space, action_space, config) tuples. If a + policy (class|dict): Either a class implementing + Policy, or a dictionary of policy id strings to + (Policy, obs_space, action_space, config) tuples. If a dict is specified, then we are in multi-agent mode and a policy_mapping_fn should also be set. policy_mapping_fn (func): A function that maps agent ids to @@ -159,7 +159,7 @@ def __init__(self, policies_to_train (list): Optional whitelist of policies to train, or None for all policies. tf_session_creator (func): A function that returns a TF session. 
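Putting the renamed constructor argument together with the evaluator calls shown above, a single-agent round trip looks roughly like this (a sketch assuming `ray` and TensorFlow are available and that the default PG config suffices, as in the docstring example):

.. code-block:: python

    import gym

    from ray.rllib.agents.pg.pg_policy import PGTFPolicy
    from ray.rllib.evaluation import PolicyEvaluator
    from ray.rllib.evaluation.metrics import get_learner_stats

    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy=PGTFPolicy)  # formerly policy_graph=PGTFPolicy

    batch = ev.sample()                    # one SampleBatch of experiences
    grad_info = ev.learn_on_batch(batch)   # update the policy in place
    print(get_learner_stats(grad_info))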
- This is optional and only useful with TFPolicyGraph. + This is optional and only useful with TFPolicy. batch_steps (int): The target number of env transitions to include in each sample batch returned from this evaluator. batch_mode (str): One of the following batch modes: @@ -196,7 +196,7 @@ def __init__(self, model_config (dict): Config to use when creating the policy model. policy_config (dict): Config to pass to the policy. In the multi-agent case, this config will be merged with the - per-policy configs specified by `policy_graph`. + per-policy configs specified by `policy`. worker_index (int): For remote evaluators, this should be set to a non-zero and unique value. This index is passed to created envs through EnvContext so that envs can be configured per worker. @@ -301,7 +301,7 @@ def make_env(vector_index): vector_index=vector_index, remote=remote_worker_envs))) self.tf_sess = None - policy_dict = _validate_and_canonicalize(policy_graph, self.env) + policy_dict = _validate_and_canonicalize(policy, self.env) self.policies_to_train = policies_to_train or list(policy_dict.keys()) if _has_tensorflow_graph(policy_dict): if (ray.is_initialized() @@ -330,7 +330,7 @@ def make_env(vector_index): or isinstance(self.env, ExternalMultiAgentEnv)) or isinstance(self.env, BaseEnv)): raise ValueError( - "Have multiple policy graphs {}, but the env ".format( + "Have multiple policies {}, but the env ".format( self.policy_map) + "{} is not a subclass of BaseEnv, MultiAgentEnv or " "ExternalMultiAgentEnv?".format(self.env)) @@ -608,17 +608,17 @@ def foreach_env(self, func): @DeveloperAPI def get_policy(self, policy_id=DEFAULT_POLICY_ID): - """Return policy graph for the specified id, or None. + """Return policy for the specified id, or None. Arguments: - policy_id (str): id of policy graph to return. + policy_id (str): id of policy to return. """ return self.policy_map.get(policy_id) @DeveloperAPI def for_policy(self, func, policy_id=DEFAULT_POLICY_ID): - """Apply the given function to the specified policy graph.""" + """Apply the given function to the specified policy.""" return func(self.policy_map[policy_id]) @@ -708,7 +708,7 @@ def _build_policy_map(self, policy_dict, policy_config): preprocessors = {} for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()): - logger.debug("Creating policy graph for {}".format(name)) + logger.debug("Creating policy for {}".format(name)) merged_conf = merge_dicts(policy_config, conf) if self.preprocessing_enabled: preprocessor = ModelCatalog.get_preprocessor_for_space( @@ -720,7 +720,7 @@ def _build_policy_map(self, policy_dict, policy_config): if isinstance(obs_space, gym.spaces.Dict) or \ isinstance(obs_space, gym.spaces.Tuple): raise ValueError( - "Found raw Tuple|Dict space as input to policy graph. " + "Found raw Tuple|Dict space as input to policy. 
" "Please preprocess these observations with a " "Tuple|DictFlatteningPreprocessor.") if tf: @@ -738,12 +738,12 @@ def __del__(self): self.sampler.shutdown = True -def _validate_and_canonicalize(policy_graph, env): - if isinstance(policy_graph, dict): - _validate_multiagent_config(policy_graph) - return policy_graph - elif not issubclass(policy_graph, PolicyGraph): - raise ValueError("policy_graph must be a rllib.PolicyGraph class") +def _validate_and_canonicalize(policy, env): + if isinstance(policy, dict): + _validate_multiagent_config(policy) + return policy + elif not issubclass(policy, Policy): + raise ValueError("policy must be a rllib.Policy class") else: if (isinstance(env, MultiAgentEnv) and not hasattr(env, "observation_space")): @@ -751,37 +751,37 @@ def _validate_and_canonicalize(policy_graph, env): "MultiAgentEnv must have observation_space defined if run " "in a single-agent configuration.") return { - DEFAULT_POLICY_ID: (policy_graph, env.observation_space, + DEFAULT_POLICY_ID: (policy, env.observation_space, env.action_space, {}) } -def _validate_multiagent_config(policy_graph, allow_none_graph=False): - for k, v in policy_graph.items(): +def _validate_multiagent_config(policy, allow_none_graph=False): + for k, v in policy.items(): if not isinstance(k, str): - raise ValueError("policy_graph keys must be strs, got {}".format( + raise ValueError("policy keys must be strs, got {}".format( type(k))) if not isinstance(v, tuple) or len(v) != 4: raise ValueError( - "policy_graph values must be tuples of " + "policy values must be tuples of " "(cls, obs_space, action_space, config), got {}".format(v)) if allow_none_graph and v[0] is None: pass - elif not issubclass(v[0], PolicyGraph): + elif not issubclass(v[0], Policy): raise ValueError( - "policy_graph tuple value 0 must be a rllib.PolicyGraph " + "policy tuple value 0 must be a rllib.Policy " "class or None, got {}".format(v[0])) if not isinstance(v[1], gym.Space): raise ValueError( - "policy_graph tuple value 1 (observation_space) must be a " + "policy tuple value 1 (observation_space) must be a " "gym.Space, got {}".format(type(v[1]))) if not isinstance(v[2], gym.Space): raise ValueError( - "policy_graph tuple value 2 (action_space) must be a " + "policy tuple value 2 (action_space) must be a " "gym.Space, got {}".format(type(v[2]))) if not isinstance(v[3], dict): raise ValueError( - "policy_graph tuple value 3 (config) must be a dict, " + "policy tuple value 3 (config) must be a dict, " "got {}".format(type(v[3]))) @@ -805,6 +805,6 @@ def _monitor(env, path): def _has_tensorflow_graph(policy_dict): for policy, _, _, _ in policy_dict.values(): - if issubclass(policy, TFPolicyGraph): + if issubclass(policy, TFPolicy): return True return False diff --git a/python/ray/rllib/evaluation/policy_graph.py b/python/ray/rllib/evaluation/policy_graph.py index a577550975f90..72393e7826c53 100644 --- a/python/ray/rllib/evaluation/policy_graph.py +++ b/python/ray/rllib/evaluation/policy_graph.py @@ -9,16 +9,16 @@ @DeveloperAPI -class PolicyGraph(object): - """An agent policy and loss, i.e., a TFPolicyGraph or other subclass. +class Policy(object): + """An agent policy and loss, i.e., a TFPolicy or other subclass. This object defines how to act in the environment, and also losses used to improve the policy based on its experiences. Note that both policy and loss are defined together for convenience, though the policy itself is logically separate. 
- All policies can directly extend PolicyGraph, however TensorFlow users may - find TFPolicyGraph simpler to implement. TFPolicyGraph also enables RLlib + All policies can directly extend Policy, however TensorFlow users may + find TFPolicy simpler to implement. TFPolicy also enables RLlib to apply TensorFlow-specific optimizations such as fusing multiple policy graphs and multi-GPU support. @@ -31,7 +31,7 @@ class PolicyGraph(object): def __init__(self, observation_space, action_space, config): """Initialize the graph. - This is the standard constructor for policy graphs. The policy graph + This is the standard constructor for policies. The policy class you pass into PolicyEvaluator will be constructed with these arguments. @@ -143,8 +143,8 @@ def postprocess_trajectory(self, sample_batch (SampleBatch): batch of experiences for the policy, which will contain at most one episode trajectory. other_agent_batches (dict): In a multi-agent env, this contains a - mapping of agent ids to (policy_graph, agent_batch) tuples - containing the policy graph and experiences of the other agent. + mapping of agent ids to (policy, agent_batch) tuples + containing the policy and experiences of the other agent. episode (MultiAgentEpisode): this provides access to all of the internal episode state, which may be useful for model-based or multi-agent algorithms. @@ -245,7 +245,7 @@ def on_global_var_update(self, global_vars): @DeveloperAPI def export_model(self, export_dir): - """Export PolicyGraph to local directory for serving. + """Export Policy to local directory for serving. Arguments: export_dir (str): Local writable directory. @@ -254,7 +254,7 @@ def export_model(self, export_dir): @DeveloperAPI def export_checkpoint(self, export_dir): - """Export PolicyGraph checkpoint to local directory. + """Export Policy checkpoint to local directory. Argument: export_dir (str): Local writable directory. diff --git a/python/ray/rllib/evaluation/sample_batch_builder.py b/python/ray/rllib/evaluation/sample_batch_builder.py index c6d69d7d97f13..675cfc24d1a43 100644 --- a/python/ray/rllib/evaluation/sample_batch_builder.py +++ b/python/ray/rllib/evaluation/sample_batch_builder.py @@ -79,7 +79,7 @@ def __init__(self, policy_map, clip_rewards, postp_callback): """Initialize a MultiAgentSampleBatchBuilder. Arguments: - policy_map (dict): Maps policy ids to policy graph instances. + policy_map (dict): Maps policy ids to policy instances. clip_rewards (bool): Whether to clip rewards before postprocessing. postp_callback: function to call on each postprocessed batch. 
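Because `other_agent_batches` now maps agent ids to `(policy, agent_batch)` tuples, a postprocessor can look at what other agents experienced in the same episode. A hedged sketch; the printout is illustrative only:

.. code-block:: python

    # Sketch: inspect the other agents' batches during postprocessing.
    def postprocess_trajectory(self, sample_batch, other_agent_batches=None,
                               episode=None):
        if other_agent_batches:
            for agent_id, (other_policy, other_batch) in \
                    other_agent_batches.items():
                # e.g., note how many steps each other agent contributed
                print(agent_id, type(other_policy).__name__, other_batch.count)
        return sample_batch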
""" diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py index 25dd1ef5d9a78..368c6f01ce10a 100644 --- a/python/ray/rllib/evaluation/sampler.py +++ b/python/ray/rllib/evaluation/sampler.py @@ -12,7 +12,7 @@ from ray.rllib.evaluation.episode import MultiAgentEpisode, _flatten_action from ray.rllib.evaluation.sample_batch_builder import \ MultiAgentSampleBatchBuilder -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.env.base_env import BaseEnv, ASYNC_RESET_RETURN from ray.rllib.env.atari_wrappers import get_wrapper_by_cls, MonitorEnv from ray.rllib.models.action_dist import TupleActions @@ -20,7 +20,7 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.debug import log_once, summarize from ray.rllib.utils.tf_run_builder import TFRunBuilder -from ray.rllib.evaluation.policy_graph import clip_action +from ray.rllib.evaluation.policy import clip_action logger = logging.getLogger(__name__) @@ -236,7 +236,7 @@ def _env_runner(base_env, extra_batch_callback, policies, policy_mapping_fn, Args: base_env (BaseEnv): env implementing BaseEnv. extra_batch_callback (fn): function to send extra batch data to. - policies (dict): Map of policy ids to PolicyGraph instances. + policies (dict): Map of policy ids to Policy instances. policy_mapping_fn (func): Function that maps agent ids to policy ids. This is called when an agent first enters the environment. The agent is then "bound" to the returned policy for the episode. @@ -528,7 +528,7 @@ def _do_policy_eval(tf_sess, to_eval, policies, active_episodes): rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data]) policy = _get_or_raise(policies, policy_id) if builder and (policy.compute_actions.__code__ is - TFPolicyGraph.compute_actions.__code__): + TFPolicy.compute_actions.__code__): # TODO(ekl): how can we make info batch available to TF code? pending_fetches[policy_id] = policy._build_compute_actions( builder, [t.obs for t in eval_data], diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index b921e6cfb0d1a..c6cc619af524d 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -10,7 +10,7 @@ import ray import ray.experimental.tf_utils from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.models.lstm import chop_into_sequences from ray.rllib.utils.annotations import override, DeveloperAPI @@ -24,11 +24,11 @@ @DeveloperAPI -class TFPolicyGraph(PolicyGraph): +class TFPolicy(Policy): """An agent policy and loss implemented in TensorFlow. Extending this class enables RLlib to perform TensorFlow specific - optimizations on the policy graph, e.g., parallelization across gpus or + optimizations on the policy, e.g., parallelization across gpus or fusing multiple graphs together in the multi-agent setting. Input tensors are typically shaped like [BATCH_SIZE, ...]. @@ -39,7 +39,7 @@ class TFPolicyGraph(PolicyGraph): model (rllib.models.Model): RLlib model used for the policy. 
Examples: - >>> policy = TFPolicyGraphSubclass( + >>> policy = TFPolicySubclass( sess, obs_input, action_sampler, loss, loss_inputs) >>> print(policy.compute_actions([1, 0, 2])) @@ -68,7 +68,7 @@ def __init__(self, max_seq_len=20, batch_divisibility_req=1, update_ops=None): - """Initialize the policy graph. + """Initialize the policy. Arguments: observation_space (gym.Space): Observation space of the env. @@ -179,7 +179,7 @@ def _initialize_loss(self, loss, loss_inputs): self._sess.run(tf.global_variables_initializer()) - @override(PolicyGraph) + @override(Policy) def compute_actions(self, obs_batch, state_batches=None, @@ -194,36 +194,36 @@ def compute_actions(self, prev_reward_batch) return builder.get(fetches) - @override(PolicyGraph) + @override(Policy) def compute_gradients(self, postprocessed_batch): assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "compute_gradients") fetches = self._build_compute_gradients(builder, postprocessed_batch) return builder.get(fetches) - @override(PolicyGraph) + @override(Policy) def apply_gradients(self, gradients): assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "apply_gradients") fetches = self._build_apply_gradients(builder, gradients) builder.get(fetches) - @override(PolicyGraph) + @override(Policy) def learn_on_batch(self, postprocessed_batch): assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "learn_on_batch") fetches = self._build_learn_on_batch(builder, postprocessed_batch) return builder.get(fetches) - @override(PolicyGraph) + @override(Policy) def get_weights(self): return self._variables.get_flat() - @override(PolicyGraph) + @override(Policy) def set_weights(self, weights): return self._variables.set_flat(weights) - @override(PolicyGraph) + @override(Policy) def export_model(self, export_dir): """Export tensorflow graph to export_dir for serving.""" with self._sess.graph.as_default(): @@ -234,7 +234,7 @@ def export_model(self, export_dir): signature_def_map=signature_def_map) builder.save() - @override(PolicyGraph) + @override(Policy) def export_checkpoint(self, export_dir, filename_prefix="model"): """Export tensorflow checkpoint to export_dir.""" try: @@ -491,7 +491,7 @@ def _get_loss_inputs_dict(self, batch): @DeveloperAPI class LearningRateSchedule(object): - """Mixin for TFPolicyGraph that adds a learning rate schedule.""" + """Mixin for TFPolicy that adds a learning rate schedule.""" @DeveloperAPI def __init__(self, lr, lr_schedule): @@ -502,13 +502,13 @@ def __init__(self, lr, lr_schedule): self.lr_schedule = PiecewiseSchedule( lr_schedule, outside_value=lr_schedule[-1][-1]) - @override(PolicyGraph) + @override(Policy) def on_global_var_update(self, global_vars): super(LearningRateSchedule, self).on_global_var_update(global_vars) self.cur_lr.load( self.lr_schedule.value(global_vars["timestep"]), session=self._sess) - @override(TFPolicyGraph) + @override(TFPolicy) def optimizer(self): return tf.train.AdamOptimizer(self.cur_lr) diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index b2549e973a656..717c9b32b038c 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -2,9 +2,9 @@ from __future__ import division from __future__ import print_function -from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph -from ray.rllib.evaluation.policy_graph import PolicyGraph -from 
ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.dynamic_tf_policy import DynamicTFPolicy +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.utils.annotations import override, DeveloperAPI @@ -39,7 +39,7 @@ def build_tf_policy(name, extra_action_fetches_fn (func): optional function that returns a dict of TF fetches given the policy object postprocess_fn (func): optional experience postprocessing function - that takes the same args as PolicyGraph.postprocess_trajectory() + that takes the same args as Policy.postprocess_trajectory() optimizer_fn (func): optional function that returns a tf.Optimizer given the policy and config gradients_fn (func): optional function that returns a list of gradients @@ -57,18 +57,18 @@ def build_tf_policy(name, arguments mixins (list): list of any class mixins for the returned policy class. These mixins will be applied in order and will have higher - precedence than the DynamicTFPolicyGraph class + precedence than the DynamicTFPolicy class get_batch_divisibility_req (func): optional function that returns the divisibility requirement for sample batches Returns: - a DynamicTFPolicyGraph instance that uses the specified args + a DynamicTFPolicy instance that uses the specified args """ if not name.endswith("TFPolicy"): raise ValueError("Name should match *TFPolicy", name) - base = DynamicTFPolicyGraph + base = DynamicTFPolicy while mixins: class new_base(mixins.pop(), base): @@ -97,7 +97,7 @@ def before_loss_init_wrapper(policy, obs_space, action_space, else: self._extra_action_fetches = extra_action_fetches_fn(self) - DynamicTFPolicyGraph.__init__( + DynamicTFPolicy.__init__( self, obs_space, action_space, @@ -111,7 +111,7 @@ def before_loss_init_wrapper(policy, obs_space, action_space, if after_init: after_init(self, obs_space, action_space, config) - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -121,24 +121,24 @@ def postprocess_trajectory(self, return postprocess_fn(self, sample_batch, other_agent_batches, episode) - @override(TFPolicyGraph) + @override(TFPolicy) def optimizer(self): if optimizer_fn: return optimizer_fn(self, self.config) else: - return TFPolicyGraph.optimizer(self) + return TFPolicy.optimizer(self) - @override(TFPolicyGraph) + @override(TFPolicy) def gradients(self, optimizer, loss): if gradients_fn: return gradients_fn(self, optimizer, loss) else: - return TFPolicyGraph.gradients(self, optimizer, loss) + return TFPolicy.gradients(self, optimizer, loss) - @override(TFPolicyGraph) + @override(TFPolicy) def extra_compute_action_fetches(self): return dict( - TFPolicyGraph.extra_compute_action_fetches(self), + TFPolicy.extra_compute_action_fetches(self), **self._extra_action_fetches) graph_cls.__name__ = name diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index ccf1b9eeb81d6..cd0602935a0a3 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -13,15 +13,15 @@ pass # soft dep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.utils.annotations import override from ray.rllib.utils.tracking_dict import UsageTrackingDict -class TorchPolicyGraph(PolicyGraph): +class TorchPolicy(Policy): """Template for a PyTorch policy 
and loss to use with RLlib. - This is similar to TFPolicyGraph, but for PyTorch. + This is similar to TFPolicy, but for PyTorch. Attributes: observation_space (gym.Space): observation space of the policy. @@ -32,7 +32,7 @@ class TorchPolicyGraph(PolicyGraph): def __init__(self, observation_space, action_space, model, loss, action_distribution_cls): - """Build a policy graph from policy and loss torch modules. + """Build a policy from policy and loss torch modules. Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES is set. Only single GPU is supported for now. @@ -43,7 +43,7 @@ def __init__(self, observation_space, action_space, model, loss, model (nn.Module): PyTorch policy module. Given observations as input, this module must return a list of outputs where the first item is action logits, and the rest can be any value. - loss (func): Function that takes (policy_graph, batch_tensors) + loss (func): Function that takes (policy, batch_tensors) and returns a single scalar loss. action_distribution_cls (ActionDistribution): Class for action distribution. @@ -59,7 +59,7 @@ def __init__(self, observation_space, action_space, model, loss, self._optimizer = self.optimizer() self._action_dist_cls = action_distribution_cls - @override(PolicyGraph) + @override(Policy) def compute_actions(self, obs_batch, state_batches=None, @@ -80,7 +80,7 @@ def compute_actions(self, [h.cpu().numpy() for h in state], self.extra_action_out(model_out)) - @override(PolicyGraph) + @override(Policy) def learn_on_batch(self, postprocessed_batch): batch_tensors = self._lazy_tensor_dict(postprocessed_batch) @@ -96,7 +96,7 @@ def learn_on_batch(self, postprocessed_batch): grad_info.update(grad_process_info) return {LEARNER_STATS_KEY: grad_info} - @override(PolicyGraph) + @override(Policy) def compute_gradients(self, postprocessed_batch): batch_tensors = self._lazy_tensor_dict(postprocessed_batch) @@ -120,7 +120,7 @@ def compute_gradients(self, postprocessed_batch): grad_info.update(grad_process_info) return grads, {LEARNER_STATS_KEY: grad_info} - @override(PolicyGraph) + @override(Policy) def apply_gradients(self, gradients): with self.lock: for g, p in zip(gradients, self._model.parameters()): @@ -128,17 +128,17 @@ def apply_gradients(self, gradients): p.grad = torch.from_numpy(g).to(self.device) self._optimizer.step() - @override(PolicyGraph) + @override(Policy) def get_weights(self): with self.lock: return {k: v.cpu() for k, v in self._model.state_dict().items()} - @override(PolicyGraph) + @override(Policy) def set_weights(self, weights): with self.lock: self._model.load_state_dict(weights) - @override(PolicyGraph) + @override(Policy) def get_initial_state(self): return [s.numpy() for s in self._model.state_init()] diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index 7f65c2b963b8e..ff78d136629ac 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -2,8 +2,8 @@ from __future__ import division from __future__ import print_function -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.torch_policy import TorchPolicy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override, DeveloperAPI @@ -32,7 +32,7 @@ def build_torch_policy(name, stats_fn (func): optional function that 
returns a dict of values given the policy and batch input tensors postprocess_fn (func): optional experience postprocessing function - that takes the same args as PolicyGraph.postprocess_trajectory() + that takes the same args as Policy.postprocess_trajectory() extra_action_out_fn (func): optional function that returns a dict of extra values to include in experiences extra_grad_process_fn (func): optional function that is called after @@ -49,16 +49,16 @@ def build_torch_policy(name, model and action dist from the catalog will be used mixins (list): list of any class mixins for the returned policy class. These mixins will be applied in order and will have higher - precedence than the TorchPolicyGraph class + precedence than the TorchPolicy class Returns: - a TorchPolicyGraph instance that uses the specified args + a TorchPolicy instance that uses the specified args """ if not name.endswith("TorchPolicy"): raise ValueError("Name should match *TorchPolicy", name) - base = TorchPolicyGraph + base = TorchPolicy while mixins: class new_base(mixins.pop(), base): @@ -84,13 +84,13 @@ def __init__(self, obs_space, action_space, config): self.model = ModelCatalog.get_torch_model( obs_space, logit_dim, self.config["model"]) - TorchPolicyGraph.__init__(self, obs_space, action_space, + TorchPolicy.__init__(self, obs_space, action_space, self.model, loss_fn, self.dist_class) if after_init: after_init(self, obs_space, action_space, config) - @override(PolicyGraph) + @override(Policy) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, @@ -100,33 +100,33 @@ def postprocess_trajectory(self, return postprocess_fn(self, sample_batch, other_agent_batches, episode) - @override(TorchPolicyGraph) + @override(TorchPolicy) def extra_grad_process(self): if extra_grad_process_fn: return extra_grad_process_fn(self) else: - return TorchPolicyGraph.extra_grad_process(self) + return TorchPolicy.extra_grad_process(self) - @override(TorchPolicyGraph) + @override(TorchPolicy) def extra_action_out(self, model_out): if extra_action_out_fn: return extra_action_out_fn(self, model_out) else: - return TorchPolicyGraph.extra_action_out(self, model_out) + return TorchPolicy.extra_action_out(self, model_out) - @override(TorchPolicyGraph) + @override(TorchPolicy) def optimizer(self): if optimizer_fn: return optimizer_fn(self, self.config) else: - return TorchPolicyGraph.optimizer(self) + return TorchPolicy.optimizer(self) - @override(TorchPolicyGraph) + @override(TorchPolicy) def extra_grad_info(self, batch_tensors): if stats_fn: return stats_fn(self, batch_tensors) else: - return TorchPolicyGraph.extra_grad_info(self, batch_tensors) + return TorchPolicy.extra_grad_info(self, batch_tensors) graph_cls.__name__ = name graph_cls.__qualname__ = name diff --git a/python/ray/rllib/examples/hierarchical_training.py b/python/ray/rllib/examples/hierarchical_training.py index c6d2db96837f7..2fe61953dc967 100644 --- a/python/ray/rllib/examples/hierarchical_training.py +++ b/python/ray/rllib/examples/hierarchical_training.py @@ -209,7 +209,7 @@ def policy_mapping_fn(agent_id): "log_level": "INFO", "entropy_coeff": 0.01, "multiagent": { - "policy_graphs": { + "policies": { "high_level_policy": (None, maze.observation_space, Discrete(4), { "gamma": 0.9 diff --git a/python/ray/rllib/examples/multiagent_cartpole.py b/python/ray/rllib/examples/multiagent_cartpole.py index 6e0f93711540e..efa77ecbf7a56 100644 --- a/python/ray/rllib/examples/multiagent_cartpole.py +++ b/python/ray/rllib/examples/multiagent_cartpole.py @@ -6,7 +6,7 
@@ Control the number of agents and policies via --num-agents and --num-policies. This works with hundreds of agents and policies, but note that initializing -many TF policy graphs will take some time. +many TF policies will take some time. Also, TF evals might slow down with large numbers of policies. To debug TF execution, set the TF_TIMELINE_DIR environment variable. @@ -90,12 +90,12 @@ def gen_policy(i): } return (None, obs_space, act_space, config) - # Setup PPO with an ensemble of `num_policies` different policy graphs - policy_graphs = { + # Setup PPO with an ensemble of `num_policies` different policies + policies = { "policy_{}".format(i): gen_policy(i) for i in range(args.num_policies) } - policy_ids = list(policy_graphs.keys()) + policy_ids = list(policies.keys()) tune.run( "PPO", @@ -105,7 +105,7 @@ def gen_policy(i): "log_level": "DEBUG", "num_sgd_iter": 10, "multiagent": { - "policy_graphs": policy_graphs, + "policies": policies, "policy_mapping_fn": tune.function( lambda agent_id: random.choice(policy_ids)), }, diff --git a/python/ray/rllib/examples/multiagent_custom_policy.py b/python/ray/rllib/examples/multiagent_custom_policy.py index 855051d52ef4c..2ef7378eb3871 100644 --- a/python/ray/rllib/examples/multiagent_custom_policy.py +++ b/python/ray/rllib/examples/multiagent_custom_policy.py @@ -22,7 +22,7 @@ import ray from ray import tune -from ray.rllib.evaluation import PolicyGraph +from ray.rllib.evaluation import Policy from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.registry import register_env @@ -30,7 +30,7 @@ parser.add_argument("--num-iters", type=int, default=20) -class RandomPolicy(PolicyGraph): +class RandomPolicy(Policy): """Hand-coded policy that returns random actions.""" def compute_actions(self, @@ -65,7 +65,7 @@ def learn_on_batch(self, samples): config={ "env": "multi_cartpole", "multiagent": { - "policy_graphs": { + "policies": { "pg_policy": (None, obs_space, act_space, {}), "random": (RandomPolicy, obs_space, act_space, {}), }, diff --git a/python/ray/rllib/examples/multiagent_two_trainers.py b/python/ray/rllib/examples/multiagent_two_trainers.py index 1d4257e4eb9d2..7c78e1dd625e0 100644 --- a/python/ray/rllib/examples/multiagent_two_trainers.py +++ b/python/ray/rllib/examples/multiagent_two_trainers.py @@ -16,9 +16,9 @@ import ray from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph +from ray.rllib.agents.dqn.dqn_policy import DQNPolicy from ray.rllib.agents.ppo.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy +from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.logger import pretty_print from ray.tune.registry import register_env @@ -36,11 +36,11 @@ obs_space = single_env.observation_space act_space = single_env.action_space - # You can also have multiple policy graphs per trainer, but here we just + # You can also have multiple policies per trainer, but here we just # show one each for PPO and DQN. 
- policy_graphs = { + policies = { "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}), - "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}), + "dqn_policy": (DQNPolicy, obs_space, act_space, {}), } def policy_mapping_fn(agent_id): @@ -53,7 +53,7 @@ def policy_mapping_fn(agent_id): env="multi_cartpole", config={ "multiagent": { - "policy_graphs": policy_graphs, + "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["ppo_policy"], }, @@ -66,7 +66,7 @@ def policy_mapping_fn(agent_id): env="multi_cartpole", config={ "multiagent": { - "policy_graphs": policy_graphs, + "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, diff --git a/python/ray/rllib/examples/policy_evaluator_custom_workflow.py b/python/ray/rllib/examples/policy_evaluator_custom_workflow.py index b077871292469..ef62227dae192 100644 --- a/python/ray/rllib/examples/policy_evaluator_custom_workflow.py +++ b/python/ray/rllib/examples/policy_evaluator_custom_workflow.py @@ -1,7 +1,7 @@ """Example of using policy evaluator classes directly to implement training. Instead of using the built-in Trainer classes provided by RLlib, here we define -a custom PolicyGraph class and manually coordinate distributed sample +a custom Policy class and manually coordinate distributed sample collection and policy optimization. """ @@ -14,7 +14,7 @@ import ray from ray import tune -from ray.rllib.evaluation import PolicyGraph, PolicyEvaluator, SampleBatch +from ray.rllib.evaluation import Policy, PolicyEvaluator, SampleBatch from ray.rllib.evaluation.metrics import collect_metrics parser = argparse.ArgumentParser() @@ -23,15 +23,15 @@ parser.add_argument("--num-workers", type=int, default=2) -class CustomPolicy(PolicyGraph): - """Example of a custom policy graph written from scratch. +class CustomPolicy(Policy): + """Example of a custom policy written from scratch. - You might find it more convenient to extend TF/TorchPolicyGraph instead + You might find it more convenient to extend TF/TorchPolicy instead for a real policy. """ def __init__(self, observation_space, action_space, config): - PolicyGraph.__init__(self, observation_space, action_space, config) + Policy.__init__(self, observation_space, action_space, config) # example parameter self.w = 1.0 diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py index 4996f3cdf4376..901ffa8024bfe 100644 --- a/python/ray/rllib/models/model.py +++ b/python/ray/rllib/models/model.py @@ -161,7 +161,7 @@ def custom_loss(self, policy_loss, loss_inputs): You can find an runnable example in examples/custom_loss.py. Arguments: - policy_loss (Tensor): scalar policy loss from the policy graph. + policy_loss (Tensor): scalar policy loss from the policy. loss_inputs (dict): map of input placeholders for rollout data. Returns: diff --git a/python/ray/rllib/offline/off_policy_estimator.py b/python/ray/rllib/offline/off_policy_estimator.py index d09fe6baf052f..c738049984bd4 100644 --- a/python/ray/rllib/offline/off_policy_estimator.py +++ b/python/ray/rllib/offline/off_policy_estimator.py @@ -23,7 +23,7 @@ def __init__(self, policy, gamma): """Creates an off-policy estimator. Arguments: - policy (PolicyGraph): Policy graph to evaluate. + policy (Policy): Policy graph to evaluate. gamma (float): Discount of the MDP. 
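The estimators constructed here are driven by the `input_evaluation` setting referenced just below (a policy must emit `action_prob` for importance-sampling estimates). A hedged config sketch for reading logged experiences back in, with placeholder paths and the `"simulation"` value used in test_io.py:

.. code-block:: python

    # First run: log experiences to JSON files (path is a placeholder).
    record_config = {"output": "/tmp/cartpole-out"}

    # Second run: train from the logged data and score the policy by
    # running rollouts in the simulator instead of off-policy estimation.
    replay_config = {
        "input": "/tmp/cartpole-out",
        "input_evaluation": ["simulation"],
        # Use [] to disable estimation entirely if the policy does not
        # return the "action_prob" key:
        # "input_evaluation": [],
    }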
""" self.policy = policy @@ -71,7 +71,7 @@ def action_prob(self, batch): raise ValueError( "Off-policy estimation is not possible unless the policy " "returns action probabilities when computing actions (i.e., " - "the 'action_prob' key is output by the policy graph). You " + "the 'action_prob' key is output by the policy). You " "can set `input_evaluation: []` to resolve this.") return info["action_prob"] diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py index 8d1bbd4fb54d9..aad301b29eee6 100644 --- a/python/ray/rllib/optimizers/multi_gpu_impl.py +++ b/python/ray/rllib/optimizers/multi_gpu_impl.py @@ -48,7 +48,7 @@ class LocalSyncParallelOptimizer(object): processed. If this is larger than the total data size, it will be clipped. build_graph: Function that takes the specified inputs and returns a - TF Policy Graph instance. + TF Policy instance. """ def __init__(self, diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index de2671e6a932d..c11ee1e51ca70 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -9,7 +9,7 @@ import ray from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.tf_policy import TFPolicy from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer from ray.rllib.optimizers.rollout import collect_samples, \ @@ -34,9 +34,9 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): details, see `multi_gpu_impl.LocalSyncParallelOptimizer`. This optimizer is Tensorflow-specific and require the underlying - PolicyGraph to be a TFPolicyGraph instance that support `.copy()`. + Policy to be a TFPolicy instance that support `.copy()`. - Note that all replicas of the TFPolicyGraph will merge their + Note that all replicas of the TFPolicy will merge their extra_compute_grad and apply_grad feed_dicts and fetches. This may result in unexpected behavior. """ @@ -83,7 +83,7 @@ def __init__(self, self.local_evaluator.foreach_trainable_policy(lambda p, i: (i, p))) logger.debug("Policies to train: {}".format(self.policies)) for policy_id, policy in self.policies.items(): - if not isinstance(policy, TFPolicyGraph): + if not isinstance(policy, TFPolicy): raise ValueError( "Only TF policies are supported with multi-GPU. 
Try using " "the simple optimizer instead.") diff --git a/python/ray/rllib/tests/test_evaluators.py b/python/ray/rllib/tests/test_evaluators.py index 36ded2b4e800b..7f2ef740e4f55 100644 --- a/python/ray/rllib/tests/test_evaluators.py +++ b/python/ray/rllib/tests/test_evaluators.py @@ -7,7 +7,7 @@ import ray from ray.rllib.agents.dqn import DQNTrainer from ray.rllib.agents.a3c import A3CTrainer -from ray.rllib.agents.dqn.dqn_policy_graph import _adjust_nstep +from ray.rllib.agents.dqn.dqn_policy import _adjust_nstep from ray.tune.registry import register_env import gym diff --git a/python/ray/rllib/tests/test_external_env.py b/python/ray/rllib/tests/test_external_env.py index 3379639612f6f..31ac75135f54a 100644 --- a/python/ray/rllib/tests/test_external_env.py +++ b/python/ray/rllib/tests/test_external_env.py @@ -13,8 +13,8 @@ from ray.rllib.agents.pg import PGTrainer from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_env import ExternalEnv -from ray.rllib.tests.test_policy_evaluator import (BadPolicyGraph, - MockPolicyGraph, MockEnv) +from ray.rllib.tests.test_policy_evaluator import (BadPolicy, + MockPolicy, MockEnv) from ray.tune.registry import register_env @@ -121,7 +121,7 @@ class TestExternalEnv(unittest.TestCase): def testExternalEnvCompleteEpisodes(self): ev = PolicyEvaluator( env_creator=lambda _: SimpleServing(MockEnv(25)), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=40, batch_mode="complete_episodes") for _ in range(3): @@ -131,7 +131,7 @@ def testExternalEnvCompleteEpisodes(self): def testExternalEnvTruncateEpisodes(self): ev = PolicyEvaluator( env_creator=lambda _: SimpleServing(MockEnv(25)), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=40, batch_mode="truncate_episodes") for _ in range(3): @@ -141,7 +141,7 @@ def testExternalEnvTruncateEpisodes(self): def testExternalEnvOffPolicy(self): ev = PolicyEvaluator( env_creator=lambda _: SimpleOffPolicyServing(MockEnv(25), 42), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=40, batch_mode="complete_episodes") for _ in range(3): @@ -153,7 +153,7 @@ def testExternalEnvOffPolicy(self): def testExternalEnvBadActions(self): ev = PolicyEvaluator( env_creator=lambda _: SimpleServing(MockEnv(25)), - policy_graph=BadPolicyGraph, + policy=BadPolicy, sample_async=True, batch_steps=40, batch_mode="truncate_episodes") @@ -198,7 +198,7 @@ def testTrainCartpoleMulti(self): def testExternalEnvHorizonNotSupported(self): ev = PolicyEvaluator( env_creator=lambda _: SimpleServing(MockEnv(25)), - policy_graph=MockPolicyGraph, + policy=MockPolicy, episode_horizon=20, batch_steps=10, batch_mode="complete_episodes") diff --git a/python/ray/rllib/tests/test_external_multi_agent_env.py b/python/ray/rllib/tests/test_external_multi_agent_env.py index c01e6fa0b7aeb..fcb3de634cbea 100644 --- a/python/ray/rllib/tests/test_external_multi_agent_env.py +++ b/python/ray/rllib/tests/test_external_multi_agent_env.py @@ -8,11 +8,11 @@ import unittest import ray -from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy +from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv -from ray.rllib.tests.test_policy_evaluator import MockPolicyGraph +from ray.rllib.tests.test_policy_evaluator import MockPolicy from ray.rllib.tests.test_external_env import make_simple_serving from 
ray.rllib.tests.test_multi_agent_env import BasicMultiAgent, MultiCartpole from ray.rllib.evaluation.metrics import collect_metrics @@ -25,7 +25,7 @@ def testExternalMultiAgentEnvCompleteEpisodes(self): agents = 4 ev = PolicyEvaluator( env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=40, batch_mode="complete_episodes") for _ in range(3): @@ -37,7 +37,7 @@ def testExternalMultiAgentEnvTruncateEpisodes(self): agents = 4 ev = PolicyEvaluator( env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=40, batch_mode="truncate_episodes") for _ in range(3): @@ -51,9 +51,9 @@ def testExternalMultiAgentEnvSample(self): obs_space = gym.spaces.Discrete(2) ev = PolicyEvaluator( env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), - "p1": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), + "p1": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), batch_steps=50) @@ -72,7 +72,7 @@ def testTrainExternalMultiCartpoleManyPolicies(self): policy_ids = list(policies.keys()) ev = PolicyEvaluator( env_creator=lambda _: MultiCartpole(n), - policy_graph=policies, + policy=policies, policy_mapping_fn=lambda agent_id: random.choice(policy_ids), batch_steps=100) optimizer = SyncSamplesOptimizer(ev, []) diff --git a/python/ray/rllib/tests/test_io.py b/python/ray/rllib/tests/test_io.py index 0706be1019cca..c98e4553dcf18 100644 --- a/python/ray/rllib/tests/test_io.py +++ b/python/ray/rllib/tests/test_io.py @@ -15,7 +15,7 @@ import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy +from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.offline import IOContext, JsonWriter, JsonReader from ray.rllib.offline.json_writer import _to_json @@ -167,7 +167,7 @@ def gen_policy(): "num_workers": 0, "output": self.test_dir, "multiagent": { - "policy_graphs": { + "policies": { "policy_1": gen_policy(), "policy_2": gen_policy(), }, @@ -188,7 +188,7 @@ def gen_policy(): "input_evaluation": ["simulation"], "train_batch_size": 2000, "multiagent": { - "policy_graphs": { + "policies": { "policy_1": gen_policy(), "policy_2": gen_policy(), }, diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index 72130712d5556..8ef39491f18eb 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -8,14 +8,14 @@ import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy -from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph +from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.dqn.dqn_policy import DQNPolicy from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) from ray.rllib.tests.test_policy_evaluator import (MockEnv, MockEnv2, - MockPolicyGraph) + MockPolicy) from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv from 
ray.rllib.env.multi_agent_env import MultiAgentEnv @@ -329,9 +329,9 @@ def testMultiAgentSample(self): obs_space = gym.spaces.Discrete(2) ev = PolicyEvaluator( env_creator=lambda _: BasicMultiAgent(5), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), - "p1": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), + "p1": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), batch_steps=50) @@ -347,9 +347,9 @@ def testMultiAgentSampleSyncRemote(self): obs_space = gym.spaces.Discrete(2) ev = PolicyEvaluator( env_creator=lambda _: BasicMultiAgent(5), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), - "p1": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), + "p1": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), batch_steps=50, @@ -364,9 +364,9 @@ def testMultiAgentSampleAsyncRemote(self): obs_space = gym.spaces.Discrete(2) ev = PolicyEvaluator( env_creator=lambda _: BasicMultiAgent(5), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), - "p1": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), + "p1": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), batch_steps=50, @@ -380,9 +380,9 @@ def testMultiAgentSampleWithHorizon(self): obs_space = gym.spaces.Discrete(2) ev = PolicyEvaluator( env_creator=lambda _: BasicMultiAgent(5), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), - "p1": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), + "p1": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), episode_horizon=10, # test with episode horizon set @@ -395,9 +395,9 @@ def testSampleFromEarlyDoneEnv(self): obs_space = gym.spaces.Discrete(2) ev = PolicyEvaluator( env_creator=lambda _: EarlyDoneMultiAgent(), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), - "p1": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), + "p1": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), batch_mode="complete_episodes", @@ -411,8 +411,8 @@ def testMultiAgentSampleRoundRobin(self): obs_space = gym.spaces.Discrete(10) ev = PolicyEvaluator( env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True), - policy_graph={ - "p0": (MockPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (MockPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p0", batch_steps=50) @@ -445,7 +445,7 @@ def testMultiAgentSampleRoundRobin(self): def testCustomRNNStateValues(self): h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}} - class StatefulPolicyGraph(PolicyGraph): + class StatefulPolicy(Policy): def compute_actions(self, obs_batch, state_batches, @@ -460,7 +460,7 @@ def get_initial_state(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=StatefulPolicyGraph, + policy=StatefulPolicy, batch_steps=5) batch = ev.sample() self.assertEqual(batch.count, 5) @@ -470,7 +470,7 @@ def get_initial_state(self): self.assertEqual(batch["state_out_0"][1], h) def testReturningModelBasedRolloutsData(self): - class ModelBasedPolicyGraph(PGTFPolicy): + 
class ModelBasedPolicy(PGTFPolicy): def compute_actions(self, obs_batch, state_batches, @@ -505,9 +505,9 @@ def compute_actions(self, act_space = single_env.action_space ev = PolicyEvaluator( env_creator=lambda _: MultiCartpole(2), - policy_graph={ - "p0": (ModelBasedPolicyGraph, obs_space, act_space, {}), - "p1": (ModelBasedPolicyGraph, obs_space, act_space, {}), + policy={ + "p0": (ModelBasedPolicy, obs_space, act_space, {}), + "p1": (ModelBasedPolicy, obs_space, act_space, {}), }, policy_mapping_fn=lambda agent_id: "p0", batch_steps=5) @@ -547,7 +547,7 @@ def gen_policy(): config={ "num_workers": 0, "multiagent": { - "policy_graphs": { + "policies": { "policy_1": gen_policy(), "policy_2": gen_policy(), }, @@ -579,17 +579,17 @@ def _testWithOptimizer(self, optimizer_cls): # happen since the replay buffer doesn't encode extra fields like # "advantages" that PG uses. policies = { - "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config), - "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config), + "p1": (DQNPolicy, obs_space, act_space, dqn_config), + "p2": (DQNPolicy, obs_space, act_space, dqn_config), } else: policies = { "p1": (PGTFPolicy, obs_space, act_space, {}), - "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config), + "p2": (DQNPolicy, obs_space, act_space, dqn_config), } ev = PolicyEvaluator( env_creator=lambda _: MultiCartpole(n), - policy_graph=policies, + policy=policies, policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2], batch_steps=50) if optimizer_cls == AsyncGradientsOptimizer: @@ -600,7 +600,7 @@ def policy_mapper(agent_id): remote_evs = [ PolicyEvaluator.as_remote().remote( env_creator=lambda _: MultiCartpole(n), - policy_graph=policies, + policy=policies, policy_mapping_fn=policy_mapper, batch_steps=50) ] @@ -610,12 +610,12 @@ def policy_mapper(agent_id): for i in range(200): ev.foreach_policy(lambda p, _: p.set_epsilon( max(0.02, 1 - i * .02)) - if isinstance(p, DQNPolicyGraph) else None) + if isinstance(p, DQNPolicy) else None) optimizer.step() result = collect_metrics(ev, remote_evs) if i % 20 == 0: ev.foreach_policy(lambda p, _: p.update_target() if isinstance( - p, DQNPolicyGraph) else None) + p, DQNPolicy) else None) print("Iter {}, rew {}".format(i, result["policy_reward_mean"])) print("Total reward", result["episode_reward_mean"]) @@ -645,7 +645,7 @@ def testTrainMultiCartpoleManyPolicies(self): policy_ids = list(policies.keys()) ev = PolicyEvaluator( env_creator=lambda _: MultiCartpole(n), - policy_graph=policies, + policy=policies, policy_mapping_fn=lambda agent_id: random.choice(policy_ids), batch_steps=100) optimizer = SyncSamplesOptimizer(ev, []) diff --git a/python/ray/rllib/tests/test_nested_spaces.py b/python/ray/rllib/tests/test_nested_spaces.py index b70bd9a2908e0..0220ba01722c1 100644 --- a/python/ray/rllib/tests/test_nested_spaces.py +++ b/python/ray/rllib/tests/test_nested_spaces.py @@ -12,7 +12,7 @@ import ray from ray.rllib.agents.a3c import A2CTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy +from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.vector_env import VectorEnv @@ -331,7 +331,7 @@ def testMultiAgentComplexSpaces(self): "sample_batch_size": 5, "train_batch_size": 5, "multiagent": { - "policy_graphs": { + "policies": { "tuple_policy": ( PGTFPolicy, TUPLE_SPACE, act_space, {"model": {"custom_model": "tuple_spy"}}), diff --git 
a/python/ray/rllib/tests/test_optimizers.py b/python/ray/rllib/tests/test_optimizers.py index 5436baeafa909..f851cfc33f128 100644 --- a/python/ray/rllib/tests/test_optimizers.py +++ b/python/ray/rllib/tests/test_optimizers.py @@ -9,7 +9,7 @@ import ray from ray.rllib.agents.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy +from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.optimizers import AsyncGradientsOptimizer, AsyncSamplesOptimizer @@ -240,12 +240,12 @@ def make_sess(): local = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=PPOTFPolicy, + policy=PPOTFPolicy, tf_session_creator=make_sess) remotes = [ PolicyEvaluator.as_remote().remote( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=PPOTFPolicy, + policy=PPOTFPolicy, tf_session_creator=make_sess) ] return local, remotes diff --git a/python/ray/rllib/tests/test_perf.py b/python/ray/rllib/tests/test_perf.py index f437c9628dfda..e31530f44ced6 100644 --- a/python/ray/rllib/tests/test_perf.py +++ b/python/ray/rllib/tests/test_perf.py @@ -8,7 +8,7 @@ import ray from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator -from ray.rllib.tests.test_policy_evaluator import MockPolicyGraph +from ray.rllib.tests.test_policy_evaluator import MockPolicy class TestPerf(unittest.TestCase): @@ -19,7 +19,7 @@ def testBaselinePerformance(self): for _ in range(20): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=100) start = time.time() count = 0 diff --git a/python/ray/rllib/tests/test_policy_evaluator.py b/python/ray/rllib/tests/test_policy_evaluator.py index 6283a5b663144..c332434b628ef 100644 --- a/python/ray/rllib/tests/test_policy_evaluator.py +++ b/python/ray/rllib/tests/test_policy_evaluator.py @@ -14,14 +14,14 @@ from ray.rllib.agents.a3c import A2CTrainer from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.metrics import collect_metrics -from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.policy import Policy from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.env.vector_env import VectorEnv from ray.tune.registry import register_env -class MockPolicyGraph(PolicyGraph): +class MockPolicy(Policy): def compute_actions(self, obs_batch, state_batches, @@ -39,7 +39,7 @@ def postprocess_trajectory(self, return compute_advantages(batch, 100.0, 0.9, use_gae=False) -class BadPolicyGraph(PolicyGraph): +class BadPolicy(Policy): def compute_actions(self, obs_batch, state_batches, @@ -133,7 +133,7 @@ class TestPolicyEvaluator(unittest.TestCase): def testBasic(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=MockPolicyGraph) + policy=MockPolicy) batch = ev.sample() for key in [ "obs", "actions", "rewards", "dones", "advantages", @@ -158,7 +158,7 @@ def to_prev(vec): def testBatchIds(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=MockPolicyGraph) + policy=MockPolicy) batch1 = ev.sample() batch2 = ev.sample() self.assertEqual(len(set(batch1["unroll_id"])), 1) @@ -229,7 +229,7 @@ def testRewardClipping(self): # clipping on ev = PolicyEvaluator( env_creator=lambda _: MockEnv2(episode_length=10), - 
policy_graph=MockPolicyGraph, + policy=MockPolicy, clip_rewards=True, batch_mode="complete_episodes") self.assertEqual(max(ev.sample()["rewards"]), 1) @@ -239,7 +239,7 @@ def testRewardClipping(self): # clipping off ev2 = PolicyEvaluator( env_creator=lambda _: MockEnv2(episode_length=10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, clip_rewards=False, batch_mode="complete_episodes") self.assertEqual(max(ev2.sample()["rewards"]), 100) @@ -249,7 +249,7 @@ def testRewardClipping(self): def testHardHorizon(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(episode_length=10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="complete_episodes", batch_steps=10, episode_horizon=4, @@ -263,7 +263,7 @@ def testHardHorizon(self): def testSoftHorizon(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(episode_length=10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="complete_episodes", batch_steps=10, episode_horizon=4, @@ -277,11 +277,11 @@ def testSoftHorizon(self): def testMetrics(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(episode_length=10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="complete_episodes") remote_ev = PolicyEvaluator.as_remote().remote( env_creator=lambda _: MockEnv(episode_length=10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="complete_episodes") ev.sample() ray.get(remote_ev.sample.remote()) @@ -293,7 +293,7 @@ def testAsync(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), sample_async=True, - policy_graph=MockPolicyGraph) + policy=MockPolicy) batch = ev.sample() for key in ["obs", "actions", "rewards", "dones", "advantages"]: self.assertIn(key, batch) @@ -302,7 +302,7 @@ def testAsync(self): def testAutoVectorization(self): ev = PolicyEvaluator( env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="truncate_episodes", batch_steps=2, num_envs=8) @@ -325,7 +325,7 @@ def testAutoVectorization(self): def testBatchesLargerWhenVectorized(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(episode_length=8), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="truncate_episodes", batch_steps=4, num_envs=4) @@ -340,7 +340,7 @@ def testBatchesLargerWhenVectorized(self): def testVectorEnvSupport(self): ev = PolicyEvaluator( env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_mode="truncate_episodes", batch_steps=10) for _ in range(8): @@ -357,7 +357,7 @@ def testVectorEnvSupport(self): def testTruncateEpisodes(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=15, batch_mode="truncate_episodes") batch = ev.sample() @@ -366,7 +366,7 @@ def testTruncateEpisodes(self): def testCompleteEpisodes(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=5, batch_mode="complete_episodes") batch = ev.sample() @@ -375,7 +375,7 @@ def testCompleteEpisodes(self): def testCompleteEpisodesPacking(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(10), - policy_graph=MockPolicyGraph, + policy=MockPolicy, batch_steps=15, batch_mode="complete_episodes") batch = ev.sample() @@ -387,7 +387,7 @@ def testCompleteEpisodesPacking(self): def testFilterSync(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - 
policy_graph=MockPolicyGraph, + policy=MockPolicy, sample_async=True, observation_filter="ConcurrentMeanStdFilter") time.sleep(2) @@ -400,7 +400,7 @@ def testFilterSync(self): def testGetFilters(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=MockPolicyGraph, + policy=MockPolicy, sample_async=True, observation_filter="ConcurrentMeanStdFilter") self.sample_and_flush(ev) @@ -415,7 +415,7 @@ def testGetFilters(self): def testSyncFilter(self): ev = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=MockPolicyGraph, + policy=MockPolicy, sample_async=True, observation_filter="ConcurrentMeanStdFilter") obs_f = self.sample_and_flush(ev) From 99ff8f88beee788b3fe1ea0835cca76c9474b499 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 14:55:34 -0700 Subject: [PATCH 02/13] rename files --- .../rllib/agents/a3c/{a3c_tf_policy_graph.py => a3c_tf_policy.py} | 0 .../agents/a3c/{a3c_torch_policy_graph.py => a3c_torch_policy.py} | 0 .../rllib/agents/ddpg/{ddpg_policy_graph.py => ddpg_policy.py} | 0 .../ray/rllib/agents/dqn/{dqn_policy_graph.py => dqn_policy.py} | 0 .../agents/impala/{vtrace_policy_graph.py => vtrace_policy.py} | 0 .../agents/marwil/{marwil_policy_graph.py => marwil_policy.py} | 0 python/ray/rllib/agents/pg/{pg_policy_graph.py => pg_policy.py} | 0 .../agents/pg/{torch_pg_policy_graph.py => torch_pg_policy.py} | 0 .../ray/rllib/agents/ppo/{appo_policy_graph.py => appo_policy.py} | 0 .../ray/rllib/agents/ppo/{ppo_policy_graph.py => ppo_policy.py} | 0 .../rllib/agents/qmix/{qmix_policy_graph.py => qmix_policy.py} | 0 .../{dynamic_tf_policy_graph.py => dynamic_tf_policy.py} | 0 .../rllib/evaluation/{keras_policy_graph.py => keras_policy.py} | 0 python/ray/rllib/evaluation/{policy_graph.py => policy.py} | 0 python/ray/rllib/evaluation/{tf_policy_graph.py => tf_policy.py} | 0 .../rllib/evaluation/{torch_policy_graph.py => torch_policy.py} | 0 16 files changed, 0 insertions(+), 0 deletions(-) rename python/ray/rllib/agents/a3c/{a3c_tf_policy_graph.py => a3c_tf_policy.py} (100%) rename python/ray/rllib/agents/a3c/{a3c_torch_policy_graph.py => a3c_torch_policy.py} (100%) rename python/ray/rllib/agents/ddpg/{ddpg_policy_graph.py => ddpg_policy.py} (100%) rename python/ray/rllib/agents/dqn/{dqn_policy_graph.py => dqn_policy.py} (100%) rename python/ray/rllib/agents/impala/{vtrace_policy_graph.py => vtrace_policy.py} (100%) rename python/ray/rllib/agents/marwil/{marwil_policy_graph.py => marwil_policy.py} (100%) rename python/ray/rllib/agents/pg/{pg_policy_graph.py => pg_policy.py} (100%) rename python/ray/rllib/agents/pg/{torch_pg_policy_graph.py => torch_pg_policy.py} (100%) rename python/ray/rllib/agents/ppo/{appo_policy_graph.py => appo_policy.py} (100%) rename python/ray/rllib/agents/ppo/{ppo_policy_graph.py => ppo_policy.py} (100%) rename python/ray/rllib/agents/qmix/{qmix_policy_graph.py => qmix_policy.py} (100%) rename python/ray/rllib/evaluation/{dynamic_tf_policy_graph.py => dynamic_tf_policy.py} (100%) rename python/ray/rllib/evaluation/{keras_policy_graph.py => keras_policy.py} (100%) rename python/ray/rllib/evaluation/{policy_graph.py => policy.py} (100%) rename python/ray/rllib/evaluation/{tf_policy_graph.py => tf_policy.py} (100%) rename python/ray/rllib/evaluation/{torch_policy_graph.py => torch_policy.py} (100%) diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py similarity index 100% rename from 
python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py rename to python/ray/rllib/agents/a3c/a3c_tf_policy.py diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy.py similarity index 100% rename from python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py rename to python/ray/rllib/agents/a3c/a3c_torch_policy.py diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy.py similarity index 100% rename from python/ray/rllib/agents/ddpg/ddpg_policy_graph.py rename to python/ray/rllib/agents/ddpg/ddpg_policy.py diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy.py similarity index 100% rename from python/ray/rllib/agents/dqn/dqn_policy_graph.py rename to python/ray/rllib/agents/dqn/dqn_policy.py diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy.py similarity index 100% rename from python/ray/rllib/agents/impala/vtrace_policy_graph.py rename to python/ray/rllib/agents/impala/vtrace_policy.py diff --git a/python/ray/rllib/agents/marwil/marwil_policy_graph.py b/python/ray/rllib/agents/marwil/marwil_policy.py similarity index 100% rename from python/ray/rllib/agents/marwil/marwil_policy_graph.py rename to python/ray/rllib/agents/marwil/marwil_policy.py diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy.py similarity index 100% rename from python/ray/rllib/agents/pg/pg_policy_graph.py rename to python/ray/rllib/agents/pg/pg_policy.py diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy.py similarity index 100% rename from python/ray/rllib/agents/pg/torch_pg_policy_graph.py rename to python/ray/rllib/agents/pg/torch_pg_policy.py diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy.py similarity index 100% rename from python/ray/rllib/agents/ppo/appo_policy_graph.py rename to python/ray/rllib/agents/ppo/appo_policy.py diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy.py similarity index 100% rename from python/ray/rllib/agents/ppo/ppo_policy_graph.py rename to python/ray/rllib/agents/ppo/ppo_policy.py diff --git a/python/ray/rllib/agents/qmix/qmix_policy_graph.py b/python/ray/rllib/agents/qmix/qmix_policy.py similarity index 100% rename from python/ray/rllib/agents/qmix/qmix_policy_graph.py rename to python/ray/rllib/agents/qmix/qmix_policy.py diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy.py similarity index 100% rename from python/ray/rllib/evaluation/dynamic_tf_policy_graph.py rename to python/ray/rllib/evaluation/dynamic_tf_policy.py diff --git a/python/ray/rllib/evaluation/keras_policy_graph.py b/python/ray/rllib/evaluation/keras_policy.py similarity index 100% rename from python/ray/rllib/evaluation/keras_policy_graph.py rename to python/ray/rllib/evaluation/keras_policy.py diff --git a/python/ray/rllib/evaluation/policy_graph.py b/python/ray/rllib/evaluation/policy.py similarity index 100% rename from python/ray/rllib/evaluation/policy_graph.py rename to python/ray/rllib/evaluation/policy.py diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy.py similarity index 100% rename from python/ray/rllib/evaluation/tf_policy_graph.py rename to 
python/ray/rllib/evaluation/tf_policy.py diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy.py similarity index 100% rename from python/ray/rllib/evaluation/torch_policy_graph.py rename to python/ray/rllib/evaluation/torch_policy.py From e58c854f8029a3cec09338b0f17b8c4ba0e514ae Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:00:38 -0700 Subject: [PATCH 03/13] format --- python/ray/rllib/agents/ddpg/ddpg_policy.py | 4 ++-- python/ray/rllib/agents/dqn/dqn_policy.py | 8 +++---- .../ray/rllib/agents/impala/vtrace_policy.py | 3 +-- python/ray/rllib/agents/ppo/ppo.py | 3 +-- python/ray/rllib/agents/trainer.py | 23 ++++++++++--------- .../ray/rllib/evaluation/policy_evaluator.py | 15 +++++------- .../rllib/evaluation/torch_policy_template.py | 4 ++-- python/ray/rllib/tests/test_external_env.py | 4 ++-- .../ray/rllib/tests/test_multi_agent_env.py | 5 ++-- .../ray/rllib/tests/test_policy_evaluator.py | 6 ++--- 10 files changed, 35 insertions(+), 40 deletions(-) diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy.py b/python/ray/rllib/agents/ddpg/ddpg_policy.py index eaea8c5dc965f..aa8b91f8143b6 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy.py @@ -7,8 +7,8 @@ import ray import ray.experimental.tf_utils -from ray.rllib.agents.dqn.dqn_policy import ( - _huber_loss, _minimize_and_clip, _scope_vars, _postprocess_dqn) +from ray.rllib.agents.dqn.dqn_policy import (_huber_loss, _minimize_and_clip, + _scope_vars, _postprocess_dqn) from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models import ModelCatalog diff --git a/python/ray/rllib/agents/dqn/dqn_policy.py b/python/ray/rllib/agents/dqn/dqn_policy.py index dc4bcb41a04e2..903d4d72be42e 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy.py +++ b/python/ray/rllib/agents/dqn/dqn_policy.py @@ -636,10 +636,10 @@ def _adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): def _postprocess_dqn(policy, batch): # N-step Q adjustments if policy.config["n_step"] > 1: - _adjust_nstep(policy.config["n_step"], - policy.config["gamma"], batch[SampleBatch.CUR_OBS], - batch[SampleBatch.ACTIONS], batch[SampleBatch.REWARDS], - batch[SampleBatch.NEXT_OBS], batch[SampleBatch.DONES]) + _adjust_nstep(policy.config["n_step"], policy.config["gamma"], + batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS], + batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS], + batch[SampleBatch.DONES]) if PRIO_WEIGHTS not in batch: batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS]) diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index ed44545fb576d..ccf8a87f3d04d 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -126,8 +126,7 @@ def postprocess_trajectory(self, return sample_batch -class VTracePolicy(LearningRateSchedule, VTracePostprocessing, - TFPolicy): +class VTracePolicy(LearningRateSchedule, VTracePostprocessing, TFPolicy): def __init__(self, observation_space, action_space, diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 6c9f6744179d2..b395d935f1197 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -143,8 +143,7 @@ def validate_config(config): raise ValueError( "Episode truncation is not supported without a value " "function. 
Consider setting batch_mode=complete_episodes.") - if (config["multiagent"]["policies"] - and not config["simple_optimizer"]): + if (config["multiagent"]["policies"] and not config["simple_optimizer"]): logger.info( "In multi-agent mode, policies will be optimized sequentially " "by the multi-GPU optimizer. Consider setting " diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index e48e2166cebf5..ea7961ed04d43 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -435,9 +435,7 @@ def get_scope(): "using evaluation_config: {}".format(extra_config)) # Make local evaluation evaluators self.evaluation_ev = self.make_local_evaluator( - self.env_creator, - self._policy, - extra_config=extra_config) + self.env_creator, self._policy, extra_config=extra_config) self.evaluation_metrics = self._evaluate() @override(Trainable) @@ -606,10 +604,7 @@ def set_weights(self, weights): self.local_evaluator.set_weights(weights) @DeveloperAPI - def make_local_evaluator(self, - env_creator, - policy, - extra_config=None): + def make_local_evaluator(self, env_creator, policy, extra_config=None): """Convenience method to return configured local evaluator.""" return self._make_evaluator( @@ -639,8 +634,8 @@ def make_remote_evaluators(self, env_creator, policy, count): cls = PolicyEvaluator.as_remote(**remote_args).remote return [ - self._make_evaluator(cls, env_creator, policy, i + 1, - self.config) for i in range(count) + self._make_evaluator(cls, env_creator, policy, i + 1, self.config) + for i in range(count) ] @DeveloperAPI @@ -700,6 +695,13 @@ def resource_help(cls, config): @staticmethod def _validate_config(config): + if "policy_graphs" in config["multiagent"]: + logger.warning( + "The `policy_graphs` config has been renamed to `policies`.") + # Backwards compatibility + config["multiagent"]["policies"] = config["multiagent"][ + "policy_graphs"] + del config["multiagent"]["policy_graphs"] if "gpu" in config: raise ValueError( "The `gpu` config is deprecated, please use `num_gpus=0|1` " @@ -760,8 +762,7 @@ def _has_policy_optimizer(self): return hasattr(self, "optimizer") and isinstance( self.optimizer, PolicyOptimizer) - def _make_evaluator(self, cls, env_creator, policy, worker_index, - config): + def _make_evaluator(self, cls, env_creator, policy, worker_index, config): def session_creator(): logger.debug("Creating TF session {}".format( config["tf_session_args"])) diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 01087afc66cb0..1aaf72236143e 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -768,21 +768,18 @@ def _validate_multiagent_config(policy, allow_none_graph=False): if allow_none_graph and v[0] is None: pass elif not issubclass(v[0], Policy): - raise ValueError( - "policy tuple value 0 must be a rllib.Policy " - "class or None, got {}".format(v[0])) + raise ValueError("policy tuple value 0 must be a rllib.Policy " + "class or None, got {}".format(v[0])) if not isinstance(v[1], gym.Space): raise ValueError( "policy tuple value 1 (observation_space) must be a " "gym.Space, got {}".format(type(v[1]))) if not isinstance(v[2], gym.Space): - raise ValueError( - "policy tuple value 2 (action_space) must be a " - "gym.Space, got {}".format(type(v[2]))) + raise ValueError("policy tuple value 2 (action_space) must be a " + "gym.Space, got {}".format(type(v[2]))) if not isinstance(v[3], dict): - raise 
ValueError( - "policy tuple value 3 (config) must be a dict, " - "got {}".format(type(v[3]))) + raise ValueError("policy tuple value 3 (config) must be a dict, " + "got {}".format(type(v[3]))) def _validate_env(env): diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index ff78d136629ac..cdba19b3cf0ed 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -84,8 +84,8 @@ def __init__(self, obs_space, action_space, config): self.model = ModelCatalog.get_torch_model( obs_space, logit_dim, self.config["model"]) - TorchPolicy.__init__(self, obs_space, action_space, - self.model, loss_fn, self.dist_class) + TorchPolicy.__init__(self, obs_space, action_space, self.model, + loss_fn, self.dist_class) if after_init: after_init(self, obs_space, action_space, config) diff --git a/python/ray/rllib/tests/test_external_env.py b/python/ray/rllib/tests/test_external_env.py index 31ac75135f54a..3b2158959267c 100644 --- a/python/ray/rllib/tests/test_external_env.py +++ b/python/ray/rllib/tests/test_external_env.py @@ -13,8 +13,8 @@ from ray.rllib.agents.pg import PGTrainer from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_env import ExternalEnv -from ray.rllib.tests.test_policy_evaluator import (BadPolicy, - MockPolicy, MockEnv) +from ray.rllib.tests.test_policy_evaluator import (BadPolicy, MockPolicy, + MockEnv) from ray.tune.registry import register_env diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index 8ef39491f18eb..281d797f7f54f 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -614,8 +614,9 @@ def policy_mapper(agent_id): optimizer.step() result = collect_metrics(ev, remote_evs) if i % 20 == 0: - ev.foreach_policy(lambda p, _: p.update_target() if isinstance( - p, DQNPolicy) else None) + ev.foreach_policy( + lambda p, _: p.update_target() if isinstance(p, DQNPolicy) else None + ) print("Iter {}, rew {}".format(i, result["policy_reward_mean"])) print("Total reward", result["episode_reward_mean"]) diff --git a/python/ray/rllib/tests/test_policy_evaluator.py b/python/ray/rllib/tests/test_policy_evaluator.py index c332434b628ef..ba66ee94552d3 100644 --- a/python/ray/rllib/tests/test_policy_evaluator.py +++ b/python/ray/rllib/tests/test_policy_evaluator.py @@ -132,8 +132,7 @@ def get_unwrapped(self): class TestPolicyEvaluator(unittest.TestCase): def testBasic(self): ev = PolicyEvaluator( - env_creator=lambda _: gym.make("CartPole-v0"), - policy=MockPolicy) + env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy) batch = ev.sample() for key in [ "obs", "actions", "rewards", "dones", "advantages", @@ -157,8 +156,7 @@ def to_prev(vec): def testBatchIds(self): ev = PolicyEvaluator( - env_creator=lambda _: gym.make("CartPole-v0"), - policy=MockPolicy) + env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy) batch1 = ev.sample() batch2 = ev.sample() self.assertEqual(len(set(batch1["unroll_id"])), 1) From 00e84f95d812f5cf3a73b681401fead20a8c373f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:11:28 -0700 Subject: [PATCH 04/13] wip --- python/ray/rllib/__init__.py | 4 + python/ray/rllib/evaluation/__init__.py | 6 +- .../ray/rllib/evaluation/dynamic_tf_policy.py | 275 ---------- python/ray/rllib/evaluation/keras_policy.py | 65 --- 
python/ray/rllib/evaluation/policy.py | 287 ---------- python/ray/rllib/evaluation/tf_policy.py | 514 ------------------ .../rllib/evaluation/tf_policy_template.py | 10 +- python/ray/rllib/evaluation/torch_policy.py | 174 ------ .../rllib/evaluation/torch_policy_template.py | 133 ----- .../ray/rllib/offline/off_policy_estimator.py | 2 +- python/ray/rllib/utils/__init__.py | 7 +- 11 files changed, 19 insertions(+), 1458 deletions(-) delete mode 100644 python/ray/rllib/evaluation/dynamic_tf_policy.py delete mode 100644 python/ray/rllib/evaluation/keras_policy.py delete mode 100644 python/ray/rllib/evaluation/policy.py delete mode 100644 python/ray/rllib/evaluation/tf_policy.py delete mode 100644 python/ray/rllib/evaluation/torch_policy.py delete mode 100644 python/ray/rllib/evaluation/torch_policy_template.py diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index 5577f4cd0355d..f8d41ff447858 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -9,7 +9,9 @@ from ray.tune.registry import register_trainable from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv @@ -44,7 +46,9 @@ def _register_all(): __all__ = [ "Policy", + "PolicyGraph", "TFPolicy", + "TFPolicyGraph", "PolicyEvaluator", "SampleBatch", "BaseEnv", diff --git a/python/ray/rllib/evaluation/__init__.py b/python/ray/rllib/evaluation/__init__.py index a08a656f86279..95e9aba2180a1 100644 --- a/python/ray/rllib/evaluation/__init__.py +++ b/python/ray/rllib/evaluation/__init__.py @@ -2,8 +2,11 @@ from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.interface import EvaluatorInterface from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.evaluation.torch_policy import TorchPolicy +from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.evaluation.sample_batch_builder import ( SampleBatchBuilder, MultiAgentSampleBatchBuilder) @@ -13,7 +16,8 @@ __all__ = [ "EvaluatorInterface", "PolicyEvaluator", "Policy", "TFPolicy", - "TorchPolicy", "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", + "TorchPolicy", "PolicyGraph", "TFPolicyGraph", "TorchPolicyGraph", + "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", "MultiAgentSampleBatchBuilder", "SyncSampler", "AsyncSampler", "compute_advantages", "collect_metrics", "MultiAgentEpisode" ] diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy.py b/python/ray/rllib/evaluation/dynamic_tf_policy.py deleted file mode 100644 index a82e751825a8f..0000000000000 --- a/python/ray/rllib/evaluation/dynamic_tf_policy.py +++ /dev/null @@ -1,275 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from collections import OrderedDict -import logging -import numpy as np - -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy import TFPolicy -from ray.rllib.models.catalog 
import ModelCatalog -from ray.rllib.utils.annotations import override -from ray.rllib.utils import try_import_tf -from ray.rllib.utils.debug import log_once, summarize -from ray.rllib.utils.tracking_dict import UsageTrackingDict - -tf = try_import_tf() - -logger = logging.getLogger(__name__) - - -class DynamicTFPolicy(TFPolicy): - """A TFPolicy that auto-defines placeholders dynamically at runtime. - - Initialization of this class occurs in two phases. - * Phase 1: the model is created and model variables are initialized. - * Phase 2: a fake batch of data is created, sent to the trajectory - postprocessor, and then used to create placeholders for the loss - function. The loss and stats functions are initialized with these - placeholders. - """ - - def __init__(self, - obs_space, - action_space, - config, - loss_fn, - stats_fn=None, - grad_stats_fn=None, - before_loss_init=None, - make_action_sampler=None, - existing_inputs=None, - get_batch_divisibility_req=None): - """Initialize a dynamic TF policy. - - Arguments: - observation_space (gym.Space): Observation space of the policy. - action_space (gym.Space): Action space of the policy. - config (dict): Policy-specific configuration data. - loss_fn (func): function that returns a loss tensor the policy - graph, and dict of experience tensor placeholders - stats_fn (func): optional function that returns a dict of - TF fetches given the policy and batch input tensors - grad_stats_fn (func): optional function that returns a dict of - TF fetches given the policy and loss gradient tensors - before_loss_init (func): optional function to run prior to loss - init that takes the same arguments as __init__ - make_action_sampler (func): optional function that returns a - tuple of action and action prob tensors. 
The function takes - (policy, input_dict, obs_space, action_space, config) as its - arguments - existing_inputs (OrderedDict): when copying a policy, this - specifies an existing dict of placeholders to use instead of - defining new ones - get_batch_divisibility_req (func): optional function that returns - the divisibility requirement for sample batches - """ - self.config = config - self._loss_fn = loss_fn - self._stats_fn = stats_fn - self._grad_stats_fn = grad_stats_fn - - # Setup standard placeholders - if existing_inputs is not None: - obs = existing_inputs[SampleBatch.CUR_OBS] - prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS] - prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS] - else: - obs = tf.placeholder( - tf.float32, - shape=[None] + list(obs_space.shape), - name="observation") - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder( - tf.float32, [None], name="prev_reward") - - input_dict = { - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - } - - # Create the model network and action outputs - if make_action_sampler: - assert not existing_inputs, \ - "Cloning not supported with custom action sampler" - self.model = None - self.dist_class = None - self.action_dist = None - action_sampler, action_prob = make_action_sampler( - self, input_dict, obs_space, action_space, config) - else: - self.dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - if existing_inputs: - existing_state_in = [ - v for k, v in existing_inputs.items() - if k.startswith("state_in_") - ] - if existing_state_in: - existing_seq_lens = existing_inputs["seq_lens"] - else: - existing_seq_lens = None - else: - existing_state_in = [] - existing_seq_lens = None - self.model = ModelCatalog.get_model( - input_dict, - obs_space, - action_space, - logit_dim, - self.config["model"], - state_in=existing_state_in, - seq_lens=existing_seq_lens) - self.action_dist = self.dist_class(self.model.outputs) - action_sampler = self.action_dist.sample() - action_prob = self.action_dist.sampled_action_prob() - - # Phase 1 init - sess = tf.get_default_session() - if get_batch_divisibility_req: - batch_divisibility_req = get_batch_divisibility_req(self) - else: - batch_divisibility_req = 1 - TFPolicy.__init__( - self, - obs_space, - action_space, - sess, - obs_input=obs, - action_sampler=action_sampler, - action_prob=action_prob, - loss=None, # dynamically initialized on run - loss_inputs=[], - model=self.model, - state_inputs=self.model and self.model.state_in, - state_outputs=self.model and self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, - seq_lens=self.model and self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"], - batch_divisibility_req=batch_divisibility_req) - - # Phase 2 init - before_loss_init(self, obs_space, action_space, config) - if not existing_inputs: - self._initialize_loss() - - @override(TFPolicy) - def copy(self, existing_inputs): - """Creates a copy of self using existing input placeholders.""" - - # Note that there might be RNN state inputs at the end of the list - if self._state_inputs: - num_state_inputs = len(self._state_inputs) + 1 - else: - num_state_inputs = 0 - if len(self._loss_inputs) + num_state_inputs != len(existing_inputs): - raise ValueError("Tensor list mismatch", self._loss_inputs, - self._state_inputs, existing_inputs) - for i, (k, v) in 
enumerate(self._loss_inputs): - if v.shape.as_list() != existing_inputs[i].shape.as_list(): - raise ValueError("Tensor shape mismatch", i, k, v.shape, - existing_inputs[i].shape) - # By convention, the loss inputs are followed by state inputs and then - # the seq len tensor - rnn_inputs = [] - for i in range(len(self._state_inputs)): - rnn_inputs.append(("state_in_{}".format(i), - existing_inputs[len(self._loss_inputs) + i])) - if rnn_inputs: - rnn_inputs.append(("seq_lens", existing_inputs[-1])) - input_dict = OrderedDict( - [(k, existing_inputs[i]) - for i, (k, _) in enumerate(self._loss_inputs)] + rnn_inputs) - instance = self.__class__( - self.observation_space, - self.action_space, - self.config, - existing_inputs=input_dict) - loss = instance._loss_fn(instance, input_dict) - if instance._stats_fn: - instance._stats_fetches.update( - instance._stats_fn(instance, input_dict)) - TFPolicy._initialize_loss( - instance, loss, [(k, existing_inputs[i]) - for i, (k, _) in enumerate(self._loss_inputs)]) - if instance._grad_stats_fn: - instance._stats_fetches.update( - instance._grad_stats_fn(instance, instance._grads)) - return instance - - @override(Policy) - def get_initial_state(self): - if self.model: - return self.model.state_init - else: - return [] - - def _initialize_loss(self): - def fake_array(tensor): - shape = tensor.shape.as_list() - shape[0] = 1 - return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype) - - dummy_batch = { - SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), - SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), - SampleBatch.CUR_OBS: fake_array(self._obs_input), - SampleBatch.NEXT_OBS: fake_array(self._obs_input), - SampleBatch.ACTIONS: fake_array(self._prev_action_input), - SampleBatch.REWARDS: np.array([0], dtype=np.float32), - SampleBatch.DONES: np.array([False], dtype=np.bool), - } - state_init = self.get_initial_state() - for i, h in enumerate(state_init): - dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0) - dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0) - if state_init: - dummy_batch["seq_lens"] = np.array([1], dtype=np.int32) - for k, v in self.extra_compute_action_fetches().items(): - dummy_batch[k] = fake_array(v) - - # postprocessing might depend on variable init, so run it first here - self._sess.run(tf.global_variables_initializer()) - postprocessed_batch = self.postprocess_trajectory( - SampleBatch(dummy_batch)) - - batch_tensors = UsageTrackingDict({ - SampleBatch.PREV_ACTIONS: self._prev_action_input, - SampleBatch.PREV_REWARDS: self._prev_reward_input, - SampleBatch.CUR_OBS: self._obs_input, - }) - loss_inputs = [ - (SampleBatch.PREV_ACTIONS, self._prev_action_input), - (SampleBatch.PREV_REWARDS, self._prev_reward_input), - (SampleBatch.CUR_OBS, self._obs_input), - ] - - for k, v in postprocessed_batch.items(): - if k in batch_tensors: - continue - elif v.dtype == np.object: - continue # can't handle arbitrary objects in TF - shape = (None, ) + v.shape[1:] - dtype = np.float32 if v.dtype == np.float64 else v.dtype - placeholder = tf.placeholder(dtype, shape=shape, name=k) - batch_tensors[k] = placeholder - - if log_once("loss_init"): - logger.info( - "Initializing loss function with dummy input:\n\n{}\n".format( - summarize(batch_tensors))) - - loss = self._loss_fn(self, batch_tensors) - if self._stats_fn: - self._stats_fetches.update(self._stats_fn(self, batch_tensors)) - for k in sorted(batch_tensors.accessed_keys): - loss_inputs.append((k, batch_tensors[k])) - TFPolicy._initialize_loss(self, 
loss, loss_inputs) - if self._grad_stats_fn: - self._stats_fetches.update(self._grad_stats_fn(self, self._grads)) - self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/evaluation/keras_policy.py b/python/ray/rllib/evaluation/keras_policy.py deleted file mode 100644 index e10f6ec1d178f..0000000000000 --- a/python/ray/rllib/evaluation/keras_policy.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from ray.rllib.evaluation.policy import Policy - - -def _sample(probs): - return [np.random.choice(len(pr), p=pr) for pr in probs] - - -class KerasPolicy(Policy): - """Initialize the Keras Policy. - - This is a Policy used for models with actor and critics. - Note: This class is built for specific usage of Actor-Critic models, - and is less general compared to TFPolicy and TorchPolicies. - - Args: - observation_space (gym.Space): Observation space of the policy. - action_space (gym.Space): Action space of the policy. - config (dict): Policy-specific configuration data. - actor (Model): A model that holds the policy. - critic (Model): A model that holds the value function. - """ - - def __init__(self, - observation_space, - action_space, - config, - actor=None, - critic=None): - Policy.__init__(self, observation_space, action_space, config) - self.actor = actor - self.critic = critic - self.models = [self.actor, self.critic] - - def compute_actions(self, obs, *args, **kwargs): - state = np.array(obs) - policy = self.actor.predict(state) - value = self.critic.predict(state) - return _sample(policy), [], {"vf_preds": value.flatten()} - - def learn_on_batch(self, batch, *args): - self.actor.fit( - batch["obs"], - batch["adv_targets"], - epochs=1, - verbose=0, - steps_per_epoch=20) - self.critic.fit( - batch["obs"], - batch["value_targets"], - epochs=1, - verbose=0, - steps_per_epoch=20) - return {} - - def get_weights(self): - return [model.get_weights() for model in self.models] - - def set_weights(self, weights): - return [model.set_weights(w) for model, w in zip(self.models, weights)] diff --git a/python/ray/rllib/evaluation/policy.py b/python/ray/rllib/evaluation/policy.py deleted file mode 100644 index 72393e7826c53..0000000000000 --- a/python/ray/rllib/evaluation/policy.py +++ /dev/null @@ -1,287 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import gym - -from ray.rllib.utils.annotations import DeveloperAPI - - -@DeveloperAPI -class Policy(object): - """An agent policy and loss, i.e., a TFPolicy or other subclass. - - This object defines how to act in the environment, and also losses used to - improve the policy based on its experiences. Note that both policy and - loss are defined together for convenience, though the policy itself is - logically separate. - - All policies can directly extend Policy, however TensorFlow users may - find TFPolicy simpler to implement. TFPolicy also enables RLlib - to apply TensorFlow-specific optimizations such as fusing multiple policy - graphs and multi-GPU support. - - Attributes: - observation_space (gym.Space): Observation space of the policy. - action_space (gym.Space): Action space of the policy. - """ - - @DeveloperAPI - def __init__(self, observation_space, action_space, config): - """Initialize the graph. - - This is the standard constructor for policies. 
The policy - class you pass into PolicyEvaluator will be constructed with - these arguments. - - Args: - observation_space (gym.Space): Observation space of the policy. - action_space (gym.Space): Action space of the policy. - config (dict): Policy-specific configuration data. - """ - - self.observation_space = observation_space - self.action_space = action_space - - @DeveloperAPI - def compute_actions(self, - obs_batch, - state_batches, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs): - """Compute actions for the current policy. - - Arguments: - obs_batch (np.ndarray): batch of observations - state_batches (list): list of RNN state input batches, if any - prev_action_batch (np.ndarray): batch of previous action values - prev_reward_batch (np.ndarray): batch of previous rewards - info_batch (info): batch of info objects - episodes (list): MultiAgentEpisode for each obs in obs_batch. - This provides access to all of the internal episode state, - which may be useful for model-based or multiagent algorithms. - kwargs: forward compatibility placeholder - - Returns: - actions (np.ndarray): batch of output actions, with shape like - [BATCH_SIZE, ACTION_SHAPE]. - state_outs (list): list of RNN state output batches, if any, with - shape like [STATE_SIZE, BATCH_SIZE]. - info (dict): dictionary of extra feature batches, if any, with - shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}. - """ - raise NotImplementedError - - @DeveloperAPI - def compute_single_action(self, - obs, - state, - prev_action=None, - prev_reward=None, - info=None, - episode=None, - clip_actions=False, - **kwargs): - """Unbatched version of compute_actions. - - Arguments: - obs (obj): single observation - state_batches (list): list of RNN state inputs, if any - prev_action (obj): previous action value, if any - prev_reward (int): previous reward, if any - info (dict): info object, if any - episode (MultiAgentEpisode): this provides access to all of the - internal episode state, which may be useful for model-based or - multi-agent algorithms. - clip_actions (bool): should the action be clipped - kwargs: forward compatibility placeholder - - Returns: - actions (obj): single action - state_outs (list): list of RNN state outputs, if any - info (dict): dictionary of extra features, if any - """ - - prev_action_batch = None - prev_reward_batch = None - info_batch = None - episodes = None - if prev_action is not None: - prev_action_batch = [prev_action] - if prev_reward is not None: - prev_reward_batch = [prev_reward] - if info is not None: - info_batch = [info] - if episode is not None: - episodes = [episode] - [action], state_out, info = self.compute_actions( - [obs], [[s] for s in state], - prev_action_batch=prev_action_batch, - prev_reward_batch=prev_reward_batch, - info_batch=info_batch, - episodes=episodes) - if clip_actions: - action = clip_action(action, self.action_space) - return action, [s[0] for s in state_out], \ - {k: v[0] for k, v in info.items()} - - @DeveloperAPI - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - """Implements algorithm-specific trajectory postprocessing. - - This will be called on each trajectory fragment computed during policy - evaluation. Each fragment is guaranteed to be only from one episode. - - Arguments: - sample_batch (SampleBatch): batch of experiences for the policy, - which will contain at most one episode trajectory. 
- other_agent_batches (dict): In a multi-agent env, this contains a - mapping of agent ids to (policy, agent_batch) tuples - containing the policy and experiences of the other agent. - episode (MultiAgentEpisode): this provides access to all of the - internal episode state, which may be useful for model-based or - multi-agent algorithms. - - Returns: - SampleBatch: postprocessed sample batch. - """ - return sample_batch - - @DeveloperAPI - def learn_on_batch(self, samples): - """Fused compute gradients and apply gradients call. - - Either this or the combination of compute/apply grads must be - implemented by subclasses. - - Returns: - grad_info: dictionary of extra metadata from compute_gradients(). - - Examples: - >>> batch = ev.sample() - >>> ev.learn_on_batch(samples) - """ - - grads, grad_info = self.compute_gradients(samples) - self.apply_gradients(grads) - return grad_info - - @DeveloperAPI - def compute_gradients(self, postprocessed_batch): - """Computes gradients against a batch of experiences. - - Either this or learn_on_batch() must be implemented by subclasses. - - Returns: - grads (list): List of gradient output values - info (dict): Extra policy-specific values - """ - raise NotImplementedError - - @DeveloperAPI - def apply_gradients(self, gradients): - """Applies previously computed gradients. - - Either this or learn_on_batch() must be implemented by subclasses. - """ - raise NotImplementedError - - @DeveloperAPI - def get_weights(self): - """Returns model weights. - - Returns: - weights (obj): Serializable copy or view of model weights - """ - raise NotImplementedError - - @DeveloperAPI - def set_weights(self, weights): - """Sets model weights. - - Arguments: - weights (obj): Serializable copy or view of model weights - """ - raise NotImplementedError - - @DeveloperAPI - def get_initial_state(self): - """Returns initial RNN state for the current policy.""" - return [] - - @DeveloperAPI - def get_state(self): - """Saves all local state. - - Returns: - state (obj): Serialized local state. - """ - return self.get_weights() - - @DeveloperAPI - def set_state(self, state): - """Restores all local state. - - Arguments: - state (obj): Serialized local state. - """ - self.set_weights(state) - - @DeveloperAPI - def on_global_var_update(self, global_vars): - """Called on an update to global vars. - - Arguments: - global_vars (dict): Global variables broadcast from the driver. - """ - pass - - @DeveloperAPI - def export_model(self, export_dir): - """Export Policy to local directory for serving. - - Arguments: - export_dir (str): Local writable directory. - """ - raise NotImplementedError - - @DeveloperAPI - def export_checkpoint(self, export_dir): - """Export Policy checkpoint to local directory. - - Argument: - export_dir (str): Local writable directory. - """ - raise NotImplementedError - - -def clip_action(action, space): - """Called to clip actions to the specified range of this policy. - - Arguments: - action: Single action. - space: Action space the actions should be present in. - - Returns: - Clipped batch of actions. 
- """ - - if isinstance(space, gym.spaces.Box): - return np.clip(action, space.low, space.high) - elif isinstance(space, gym.spaces.Tuple): - if type(action) not in (tuple, list): - raise ValueError("Expected tuple space for actions {}: {}".format( - action, space)) - out = [] - for a, s in zip(action, space.spaces): - out.append(clip_action(a, s)) - return out - else: - return action diff --git a/python/ray/rllib/evaluation/tf_policy.py b/python/ray/rllib/evaluation/tf_policy.py deleted file mode 100644 index c6cc619af524d..0000000000000 --- a/python/ray/rllib/evaluation/tf_policy.py +++ /dev/null @@ -1,514 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import errno -import logging -import numpy as np - -import ray -import ray.experimental.tf_utils -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.models.lstm import chop_into_sequences -from ray.rllib.utils.annotations import override, DeveloperAPI -from ray.rllib.utils.debug import log_once, summarize -from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule -from ray.rllib.utils.tf_run_builder import TFRunBuilder -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() -logger = logging.getLogger(__name__) - - -@DeveloperAPI -class TFPolicy(Policy): - """An agent policy and loss implemented in TensorFlow. - - Extending this class enables RLlib to perform TensorFlow specific - optimizations on the policy, e.g., parallelization across gpus or - fusing multiple graphs together in the multi-agent setting. - - Input tensors are typically shaped like [BATCH_SIZE, ...]. - - Attributes: - observation_space (gym.Space): observation space of the policy. - action_space (gym.Space): action space of the policy. - model (rllib.models.Model): RLlib model used for the policy. - - Examples: - >>> policy = TFPolicySubclass( - sess, obs_input, action_sampler, loss, loss_inputs) - - >>> print(policy.compute_actions([1, 0, 2])) - (array([0, 1, 1]), [], {}) - - >>> print(policy.postprocess_trajectory(SampleBatch({...}))) - SampleBatch({"action": ..., "advantages": ..., ...}) - """ - - @DeveloperAPI - def __init__(self, - observation_space, - action_space, - sess, - obs_input, - action_sampler, - loss, - loss_inputs, - model=None, - action_prob=None, - state_inputs=None, - state_outputs=None, - prev_action_input=None, - prev_reward_input=None, - seq_lens=None, - max_seq_len=20, - batch_divisibility_req=1, - update_ops=None): - """Initialize the policy. - - Arguments: - observation_space (gym.Space): Observation space of the env. - action_space (gym.Space): Action space of the env. - sess (Session): TensorFlow session to use. - obs_input (Tensor): input placeholder for observations, of shape - [BATCH_SIZE, obs...]. - action_sampler (Tensor): Tensor for sampling an action, of shape - [BATCH_SIZE, action...] - loss (Tensor): scalar policy loss output tensor. - loss_inputs (list): a (name, placeholder) tuple for each loss - input argument. Each placeholder name must correspond to a - SampleBatch column key returned by postprocess_trajectory(), - and has shape [BATCH_SIZE, data...]. These keys will be read - from postprocessed sample batches and fed into the specified - placeholders during loss computation. - model (rllib.models.Model): used to integrate custom losses and - stats from user-defined RLlib models. 
- action_prob (Tensor): probability of the sampled action. - state_inputs (list): list of RNN state input Tensors. - state_outputs (list): list of RNN state output Tensors. - prev_action_input (Tensor): placeholder for previous actions - prev_reward_input (Tensor): placeholder for previous rewards - seq_lens (Tensor): placeholder for RNN sequence lengths, of shape - [NUM_SEQUENCES]. Note that NUM_SEQUENCES << BATCH_SIZE. See - models/lstm.py for more information. - max_seq_len (int): max sequence length for LSTM training. - batch_divisibility_req (int): pad all agent experiences batches to - multiples of this value. This only has an effect if not using - a LSTM model. - update_ops (list): override the batchnorm update ops to run when - applying gradients. Otherwise we run all update ops found in - the current variable scope. - """ - - self.observation_space = observation_space - self.action_space = action_space - self.model = model - self._sess = sess - self._obs_input = obs_input - self._prev_action_input = prev_action_input - self._prev_reward_input = prev_reward_input - self._sampler = action_sampler - self._is_training = self._get_is_training_placeholder() - self._action_prob = action_prob - self._state_inputs = state_inputs or [] - self._state_outputs = state_outputs or [] - self._seq_lens = seq_lens - self._max_seq_len = max_seq_len - self._batch_divisibility_req = batch_divisibility_req - self._update_ops = update_ops - self._stats_fetches = {} - - if loss is not None: - self._initialize_loss(loss, loss_inputs) - else: - self._loss = None - - if len(self._state_inputs) != len(self._state_outputs): - raise ValueError( - "Number of state input and output tensors must match, got: " - "{} vs {}".format(self._state_inputs, self._state_outputs)) - if len(self.get_initial_state()) != len(self._state_inputs): - raise ValueError( - "Length of initial state must match number of state inputs, " - "got: {} vs {}".format(self.get_initial_state(), - self._state_inputs)) - if self._state_inputs and self._seq_lens is None: - raise ValueError( - "seq_lens tensor must be given if state inputs are defined") - - def _initialize_loss(self, loss, loss_inputs): - self._loss_inputs = loss_inputs - self._loss_input_dict = dict(self._loss_inputs) - for i, ph in enumerate(self._state_inputs): - self._loss_input_dict["state_in_{}".format(i)] = ph - - if self.model: - self._loss = self.model.custom_loss(loss, self._loss_input_dict) - self._stats_fetches.update({"model": self.model.custom_stats()}) - else: - self._loss = loss - - self._optimizer = self.optimizer() - self._grads_and_vars = [ - (g, v) for (g, v) in self.gradients(self._optimizer, self._loss) - if g is not None - ] - self._grads = [g for (g, v) in self._grads_and_vars] - self._variables = ray.experimental.tf_utils.TensorFlowVariables( - self._loss, self._sess) - - # gather update ops for any batch norm layers - if not self._update_ops: - self._update_ops = tf.get_collection( - tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name) - if self._update_ops: - logger.debug("Update ops to run on apply gradient: {}".format( - self._update_ops)) - with tf.control_dependencies(self._update_ops): - self._apply_op = self.build_apply_op(self._optimizer, - self._grads_and_vars) - - if log_once("loss_used"): - logger.debug( - "These tensors were used in the loss_fn:\n\n{}\n".format( - summarize(self._loss_input_dict))) - - self._sess.run(tf.global_variables_initializer()) - - @override(Policy) - def compute_actions(self, - obs_batch, - state_batches=None, 
- prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs): - builder = TFRunBuilder(self._sess, "compute_actions") - fetches = self._build_compute_actions(builder, obs_batch, - state_batches, prev_action_batch, - prev_reward_batch) - return builder.get(fetches) - - @override(Policy) - def compute_gradients(self, postprocessed_batch): - assert self._loss is not None, "Loss not initialized" - builder = TFRunBuilder(self._sess, "compute_gradients") - fetches = self._build_compute_gradients(builder, postprocessed_batch) - return builder.get(fetches) - - @override(Policy) - def apply_gradients(self, gradients): - assert self._loss is not None, "Loss not initialized" - builder = TFRunBuilder(self._sess, "apply_gradients") - fetches = self._build_apply_gradients(builder, gradients) - builder.get(fetches) - - @override(Policy) - def learn_on_batch(self, postprocessed_batch): - assert self._loss is not None, "Loss not initialized" - builder = TFRunBuilder(self._sess, "learn_on_batch") - fetches = self._build_learn_on_batch(builder, postprocessed_batch) - return builder.get(fetches) - - @override(Policy) - def get_weights(self): - return self._variables.get_flat() - - @override(Policy) - def set_weights(self, weights): - return self._variables.set_flat(weights) - - @override(Policy) - def export_model(self, export_dir): - """Export tensorflow graph to export_dir for serving.""" - with self._sess.graph.as_default(): - builder = tf.saved_model.builder.SavedModelBuilder(export_dir) - signature_def_map = self._build_signature_def() - builder.add_meta_graph_and_variables( - self._sess, [tf.saved_model.tag_constants.SERVING], - signature_def_map=signature_def_map) - builder.save() - - @override(Policy) - def export_checkpoint(self, export_dir, filename_prefix="model"): - """Export tensorflow checkpoint to export_dir.""" - try: - os.makedirs(export_dir) - except OSError as e: - # ignore error if export dir already exists - if e.errno != errno.EEXIST: - raise - save_path = os.path.join(export_dir, filename_prefix) - with self._sess.graph.as_default(): - saver = tf.train.Saver() - saver.save(self._sess, save_path) - - @DeveloperAPI - def copy(self, existing_inputs): - """Creates a copy of self using existing input placeholders. - - Optional, only required to work with the multi-GPU optimizer.""" - raise NotImplementedError - - @DeveloperAPI - def extra_compute_action_feed_dict(self): - """Extra dict to pass to the compute actions session run.""" - return {} - - @DeveloperAPI - def extra_compute_action_fetches(self): - """Extra values to fetch and return from compute_actions(). - - By default we only return action probability info (if present). - """ - if self._action_prob is not None: - return {"action_prob": self._action_prob} - else: - return {} - - @DeveloperAPI - def extra_compute_grad_feed_dict(self): - """Extra dict to pass to the compute gradients session run.""" - return {} # e.g, kl_coeff - - @DeveloperAPI - def extra_compute_grad_fetches(self): - """Extra values to fetch and return from compute_gradients().""" - return {LEARNER_STATS_KEY: {}} # e.g, stats, td error, etc. 
- - @DeveloperAPI - def optimizer(self): - """TF optimizer to use for policy optimization.""" - if hasattr(self, "config"): - return tf.train.AdamOptimizer(self.config["lr"]) - else: - return tf.train.AdamOptimizer() - - @DeveloperAPI - def gradients(self, optimizer, loss): - """Override for custom gradient computation.""" - return optimizer.compute_gradients(loss) - - @DeveloperAPI - def build_apply_op(self, optimizer, grads_and_vars): - """Override for custom gradient apply computation.""" - - # specify global_step for TD3 which needs to count the num updates - return optimizer.apply_gradients( - self._grads_and_vars, - global_step=tf.train.get_or_create_global_step()) - - @DeveloperAPI - def _get_is_training_placeholder(self): - """Get the placeholder for _is_training, i.e., for batch norm layers. - - This can be called safely before __init__ has run. - """ - if not hasattr(self, "_is_training"): - self._is_training = tf.placeholder_with_default(False, ()) - return self._is_training - - def _extra_input_signature_def(self): - """Extra input signatures to add when exporting tf model. - Inferred from extra_compute_action_feed_dict() - """ - feed_dict = self.extra_compute_action_feed_dict() - return { - k.name: tf.saved_model.utils.build_tensor_info(k) - for k in feed_dict.keys() - } - - def _extra_output_signature_def(self): - """Extra output signatures to add when exporting tf model. - Inferred from extra_compute_action_fetches() - """ - fetches = self.extra_compute_action_fetches() - return { - k: tf.saved_model.utils.build_tensor_info(fetches[k]) - for k in fetches.keys() - } - - def _build_signature_def(self): - """Build signature def map for tensorflow SavedModelBuilder. - """ - # build input signatures - input_signature = self._extra_input_signature_def() - input_signature["observations"] = \ - tf.saved_model.utils.build_tensor_info(self._obs_input) - - if self._seq_lens is not None: - input_signature["seq_lens"] = \ - tf.saved_model.utils.build_tensor_info(self._seq_lens) - if self._prev_action_input is not None: - input_signature["prev_action"] = \ - tf.saved_model.utils.build_tensor_info(self._prev_action_input) - if self._prev_reward_input is not None: - input_signature["prev_reward"] = \ - tf.saved_model.utils.build_tensor_info(self._prev_reward_input) - input_signature["is_training"] = \ - tf.saved_model.utils.build_tensor_info(self._is_training) - - for state_input in self._state_inputs: - input_signature[state_input.name] = \ - tf.saved_model.utils.build_tensor_info(state_input) - - # build output signatures - output_signature = self._extra_output_signature_def() - output_signature["actions"] = \ - tf.saved_model.utils.build_tensor_info(self._sampler) - for state_output in self._state_outputs: - output_signature[state_output.name] = \ - tf.saved_model.utils.build_tensor_info(state_output) - signature_def = ( - tf.saved_model.signature_def_utils.build_signature_def( - input_signature, output_signature, - tf.saved_model.signature_constants.PREDICT_METHOD_NAME)) - signature_def_key = (tf.saved_model.signature_constants. - DEFAULT_SERVING_SIGNATURE_DEF_KEY) - signature_def_map = {signature_def_key: signature_def} - return signature_def_map - - def _build_compute_actions(self, - builder, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - episodes=None): - state_batches = state_batches or [] - if len(self._state_inputs) != len(state_batches): - raise ValueError( - "Must pass in RNN state batches for placeholders {}, got {}". 
- format(self._state_inputs, state_batches)) - builder.add_feed_dict(self.extra_compute_action_feed_dict()) - builder.add_feed_dict({self._obs_input: obs_batch}) - if state_batches: - builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))}) - if self._prev_action_input is not None and prev_action_batch: - builder.add_feed_dict({self._prev_action_input: prev_action_batch}) - if self._prev_reward_input is not None and prev_reward_batch: - builder.add_feed_dict({self._prev_reward_input: prev_reward_batch}) - builder.add_feed_dict({self._is_training: False}) - builder.add_feed_dict(dict(zip(self._state_inputs, state_batches))) - fetches = builder.add_fetches([self._sampler] + self._state_outputs + - [self.extra_compute_action_fetches()]) - return fetches[0], fetches[1:-1], fetches[-1] - - def _build_compute_gradients(self, builder, postprocessed_batch): - builder.add_feed_dict(self.extra_compute_grad_feed_dict()) - builder.add_feed_dict({self._is_training: True}) - builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch)) - fetches = builder.add_fetches( - [self._grads, self._get_grad_and_stats_fetches()]) - return fetches[0], fetches[1] - - def _build_apply_gradients(self, builder, gradients): - if len(gradients) != len(self._grads): - raise ValueError( - "Unexpected number of gradients to apply, got {} for {}". - format(gradients, self._grads)) - builder.add_feed_dict({self._is_training: True}) - builder.add_feed_dict(dict(zip(self._grads, gradients))) - fetches = builder.add_fetches([self._apply_op]) - return fetches[0] - - def _build_learn_on_batch(self, builder, postprocessed_batch): - builder.add_feed_dict(self.extra_compute_grad_feed_dict()) - builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch)) - builder.add_feed_dict({self._is_training: True}) - fetches = builder.add_fetches([ - self._apply_op, - self._get_grad_and_stats_fetches(), - ]) - return fetches[1] - - def _get_grad_and_stats_fetches(self): - fetches = self.extra_compute_grad_fetches() - if LEARNER_STATS_KEY not in fetches: - raise ValueError( - "Grad fetches should contain 'stats': {...} entry") - if self._stats_fetches: - fetches[LEARNER_STATS_KEY] = dict(self._stats_fetches, - **fetches[LEARNER_STATS_KEY]) - return fetches - - def _get_loss_inputs_dict(self, batch): - feed_dict = {} - if self._batch_divisibility_req > 1: - meets_divisibility_reqs = ( - len(batch[SampleBatch.CUR_OBS]) % - self._batch_divisibility_req == 0 - and max(batch[SampleBatch.AGENT_INDEX]) == 0) # not multiagent - else: - meets_divisibility_reqs = True - - # Simple case: not RNN nor do we need to pad - if not self._state_inputs and meets_divisibility_reqs: - for k, ph in self._loss_inputs: - feed_dict[ph] = batch[k] - return feed_dict - - if self._state_inputs: - max_seq_len = self._max_seq_len - dynamic_max = True - else: - max_seq_len = self._batch_divisibility_req - dynamic_max = False - - # RNN or multi-agent case - feature_keys = [k for k, v in self._loss_inputs] - state_keys = [ - "state_in_{}".format(i) for i in range(len(self._state_inputs)) - ] - feature_sequences, initial_states, seq_lens = chop_into_sequences( - batch[SampleBatch.EPS_ID], - batch[SampleBatch.UNROLL_ID], - batch[SampleBatch.AGENT_INDEX], [batch[k] for k in feature_keys], - [batch[k] for k in state_keys], - max_seq_len, - dynamic_max=dynamic_max) - for k, v in zip(feature_keys, feature_sequences): - feed_dict[self._loss_input_dict[k]] = v - for k, v in zip(state_keys, initial_states): - feed_dict[self._loss_input_dict[k]] = v - 
feed_dict[self._seq_lens] = seq_lens - - if log_once("rnn_feed_dict"): - logger.info("Padded input for RNN:\n\n{}\n".format( - summarize({ - "features": feature_sequences, - "initial_states": initial_states, - "seq_lens": seq_lens, - "max_seq_len": max_seq_len, - }))) - return feed_dict - - -@DeveloperAPI -class LearningRateSchedule(object): - """Mixin for TFPolicy that adds a learning rate schedule.""" - - @DeveloperAPI - def __init__(self, lr, lr_schedule): - self.cur_lr = tf.get_variable("lr", initializer=lr) - if lr_schedule is None: - self.lr_schedule = ConstantSchedule(lr) - else: - self.lr_schedule = PiecewiseSchedule( - lr_schedule, outside_value=lr_schedule[-1][-1]) - - @override(Policy) - def on_global_var_update(self, global_vars): - super(LearningRateSchedule, self).on_global_var_update(global_vars) - self.cur_lr.load( - self.lr_schedule.value(global_vars["timestep"]), - session=self._sess) - - @override(TFPolicy) - def optimizer(self): - return tf.train.AdamOptimizer(self.cur_lr) diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 717c9b32b038c..f9aecfc176cc8 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -27,7 +27,7 @@ def build_tf_policy(name, """Helper function for creating a dynamic tf policy at runtime. Arguments: - name (str): name of the graph (e.g., "PPOPolicy") + name (str): name of the policy (e.g., "PPOPolicy") loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default @@ -76,7 +76,7 @@ class new_base(mixins.pop(), base): base = new_base - class graph_cls(base): + class policy_cls(base): def __init__(self, obs_space, action_space, @@ -141,6 +141,6 @@ def extra_compute_action_fetches(self): TFPolicy.extra_compute_action_fetches(self), **self._extra_action_fetches) - graph_cls.__name__ = name - graph_cls.__qualname__ = name - return graph_cls + policy_cls.__name__ = name + policy_cls.__qualname__ = name + return policy_cls diff --git a/python/ray/rllib/evaluation/torch_policy.py b/python/ray/rllib/evaluation/torch_policy.py deleted file mode 100644 index cd0602935a0a3..0000000000000 --- a/python/ray/rllib/evaluation/torch_policy.py +++ /dev/null @@ -1,174 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import numpy as np -from threading import Lock - -try: - import torch -except ImportError: - pass # soft dep - -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.tracking_dict import UsageTrackingDict - - -class TorchPolicy(Policy): - """Template for a PyTorch policy and loss to use with RLlib. - - This is similar to TFPolicy, but for PyTorch. - - Attributes: - observation_space (gym.Space): observation space of the policy. - action_space (gym.Space): action space of the policy. - lock (Lock): Lock that must be held around PyTorch ops on this graph. - This is necessary when using the async sampler. - """ - - def __init__(self, observation_space, action_space, model, loss, - action_distribution_cls): - """Build a policy from policy and loss torch modules. - - Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES - is set. Only single GPU is supported for now. 
- - Arguments: - observation_space (gym.Space): observation space of the policy. - action_space (gym.Space): action space of the policy. - model (nn.Module): PyTorch policy module. Given observations as - input, this module must return a list of outputs where the - first item is action logits, and the rest can be any value. - loss (func): Function that takes (policy, batch_tensors) - and returns a single scalar loss. - action_distribution_cls (ActionDistribution): Class for action - distribution. - """ - self.observation_space = observation_space - self.action_space = action_space - self.lock = Lock() - self.device = (torch.device("cuda") - if bool(os.environ.get("CUDA_VISIBLE_DEVICES", None)) - else torch.device("cpu")) - self._model = model.to(self.device) - self._loss = loss - self._optimizer = self.optimizer() - self._action_dist_cls = action_distribution_cls - - @override(Policy) - def compute_actions(self, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs): - with self.lock: - with torch.no_grad(): - ob = torch.from_numpy(np.array(obs_batch)) \ - .float().to(self.device) - model_out = self._model({"obs": ob}, state_batches) - logits, _, vf, state = model_out - action_dist = self._action_dist_cls(logits) - actions = action_dist.sample() - return (actions.cpu().numpy(), - [h.cpu().numpy() for h in state], - self.extra_action_out(model_out)) - - @override(Policy) - def learn_on_batch(self, postprocessed_batch): - batch_tensors = self._lazy_tensor_dict(postprocessed_batch) - - with self.lock: - loss_out = self._loss(self, batch_tensors) - self._optimizer.zero_grad() - loss_out.backward() - - grad_process_info = self.extra_grad_process() - self._optimizer.step() - - grad_info = self.extra_grad_info(batch_tensors) - grad_info.update(grad_process_info) - return {LEARNER_STATS_KEY: grad_info} - - @override(Policy) - def compute_gradients(self, postprocessed_batch): - batch_tensors = self._lazy_tensor_dict(postprocessed_batch) - - with self.lock: - loss_out = self._loss(self, batch_tensors) - self._optimizer.zero_grad() - loss_out.backward() - - grad_process_info = self.extra_grad_process() - - # Note that return values are just references; - # calling zero_grad will modify the values - grads = [] - for p in self._model.parameters(): - if p.grad is not None: - grads.append(p.grad.data.cpu().numpy()) - else: - grads.append(None) - - grad_info = self.extra_grad_info(batch_tensors) - grad_info.update(grad_process_info) - return grads, {LEARNER_STATS_KEY: grad_info} - - @override(Policy) - def apply_gradients(self, gradients): - with self.lock: - for g, p in zip(gradients, self._model.parameters()): - if g is not None: - p.grad = torch.from_numpy(g).to(self.device) - self._optimizer.step() - - @override(Policy) - def get_weights(self): - with self.lock: - return {k: v.cpu() for k, v in self._model.state_dict().items()} - - @override(Policy) - def set_weights(self, weights): - with self.lock: - self._model.load_state_dict(weights) - - @override(Policy) - def get_initial_state(self): - return [s.numpy() for s in self._model.state_init()] - - def extra_grad_process(self): - """Allow subclass to do extra processing on gradients and - return processing info.""" - return {} - - def extra_action_out(self, model_out): - """Returns dict of extra info to include in experience batch. 
- - Arguments: - model_out (list): Outputs of the policy model module.""" - return {} - - def extra_grad_info(self, batch_tensors): - """Return dict of extra grad info.""" - - return {} - - def optimizer(self): - """Custom PyTorch optimizer to use.""" - if hasattr(self, "config"): - return torch.optim.Adam( - self._model.parameters(), lr=self.config["lr"]) - else: - return torch.optim.Adam(self._model.parameters()) - - def _lazy_tensor_dict(self, postprocessed_batch): - batch_tensors = UsageTrackingDict(postprocessed_batch) - batch_tensors.set_get_interceptor( - lambda arr: torch.from_numpy(arr).to(self.device)) - return batch_tensors diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py deleted file mode 100644 index cdba19b3cf0ed..0000000000000 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ /dev/null @@ -1,133 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.torch_policy import TorchPolicy -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override, DeveloperAPI - - -@DeveloperAPI -def build_torch_policy(name, - loss_fn, - get_default_config=None, - stats_fn=None, - postprocess_fn=None, - extra_action_out_fn=None, - extra_grad_process_fn=None, - optimizer_fn=None, - before_init=None, - after_init=None, - make_model_and_action_dist=None, - mixins=None): - """Helper function for creating a torch policy at runtime. - - Arguments: - name (str): name of the graph (e.g., "PPOPolicy") - loss_fn (func): function that returns a loss tensor the policy, - and dict of experience tensor placeholders - get_default_config (func): optional function that returns the default - config to merge with any overrides - stats_fn (func): optional function that returns a dict of - values given the policy and batch input tensors - postprocess_fn (func): optional experience postprocessing function - that takes the same args as Policy.postprocess_trajectory() - extra_action_out_fn (func): optional function that returns - a dict of extra values to include in experiences - extra_grad_process_fn (func): optional function that is called after - gradients are computed and returns processing info - optimizer_fn (func): optional function that returns a torch optimizer - given the policy and config - before_init (func): optional function to run at the beginning of - policy init that takes the same arguments as the policy constructor - after_init (func): optional function to run at the end of policy init - that takes the same arguments as the policy constructor - make_model_and_action_dist (func): optional func that takes the same - arguments as policy init and returns a tuple of model instance and - torch action distribution class. If not specified, the default - model and action dist from the catalog will be used - mixins (list): list of any class mixins for the returned policy class. 
- These mixins will be applied in order and will have higher - precedence than the TorchPolicy class - - Returns: - a TorchPolicy instance that uses the specified args - """ - - if not name.endswith("TorchPolicy"): - raise ValueError("Name should match *TorchPolicy", name) - - base = TorchPolicy - while mixins: - - class new_base(mixins.pop(), base): - pass - - base = new_base - - class graph_cls(base): - def __init__(self, obs_space, action_space, config): - if get_default_config: - config = dict(get_default_config(), **config) - self.config = config - - if before_init: - before_init(self, obs_space, action_space, config) - - if make_model_and_action_dist: - self.model, self.dist_class = make_model_and_action_dist( - self, obs_space, action_space, config) - else: - self.dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - self.model = ModelCatalog.get_torch_model( - obs_space, logit_dim, self.config["model"]) - - TorchPolicy.__init__(self, obs_space, action_space, self.model, - loss_fn, self.dist_class) - - if after_init: - after_init(self, obs_space, action_space, config) - - @override(Policy) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - if not postprocess_fn: - return sample_batch - return postprocess_fn(self, sample_batch, other_agent_batches, - episode) - - @override(TorchPolicy) - def extra_grad_process(self): - if extra_grad_process_fn: - return extra_grad_process_fn(self) - else: - return TorchPolicy.extra_grad_process(self) - - @override(TorchPolicy) - def extra_action_out(self, model_out): - if extra_action_out_fn: - return extra_action_out_fn(self, model_out) - else: - return TorchPolicy.extra_action_out(self, model_out) - - @override(TorchPolicy) - def optimizer(self): - if optimizer_fn: - return optimizer_fn(self, self.config) - else: - return TorchPolicy.optimizer(self) - - @override(TorchPolicy) - def extra_grad_info(self, batch_tensors): - if stats_fn: - return stats_fn(self, batch_tensors) - else: - return TorchPolicy.extra_grad_info(self, batch_tensors) - - graph_cls.__name__ = name - graph_cls.__qualname__ = name - return graph_cls diff --git a/python/ray/rllib/offline/off_policy_estimator.py b/python/ray/rllib/offline/off_policy_estimator.py index c738049984bd4..c92cb9015e75c 100644 --- a/python/ray/rllib/offline/off_policy_estimator.py +++ b/python/ray/rllib/offline/off_policy_estimator.py @@ -23,7 +23,7 @@ def __init__(self, policy, gamma): """Creates an off-policy estimator. Arguments: - policy (Policy): Policy graph to evaluate. + policy (Policy): Policy to evaluate. gamma (float): Discount of the MDP. """ self.policy = policy diff --git a/python/ray/rllib/utils/__init__.py b/python/ray/rllib/utils/__init__.py index a16cba22b6116..e537400b7f311 100644 --- a/python/ray/rllib/utils/__init__.py +++ b/python/ray/rllib/utils/__init__.py @@ -10,12 +10,13 @@ logger = logging.getLogger(__name__) -def renamed_class(cls): - """Helper class for renaming Agent => Trainer with a warning.""" +def renamed_class(cls, old_name=None): + """Helper class for renaming classes with a warning.""" class DeprecationWrapper(cls): def __init__(self, config=None, env=None, logger_creator=None): - old_name = cls.__name__.replace("Trainer", "Agent") + if not old_name: + old_name = cls.__name__.replace("Trainer", "Agent") new_name = cls.__name__ logger.warn("DeprecationWarning: {} has been renamed to {}. ". 
format(old_name, new_name) + From 046b992f1898e4e5b83d953055e9cdf69b4cfce9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:11:32 -0700 Subject: [PATCH 05/13] move policy graphs --- python/ray/rllib/evaluation/policy_graph.py | 9 + .../ray/rllib/evaluation/tf_policy_graph.py | 9 + .../rllib/evaluation/torch_policy_graph.py | 9 + python/ray/rllib/keras_policy.py | 65 +++ python/ray/rllib/policy/__init__.py | 18 + python/ray/rllib/policy/dynamic_tf_policy.py | 275 ++++++++++ python/ray/rllib/policy/policy.py | 287 ++++++++++ python/ray/rllib/policy/tf_policy.py | 514 ++++++++++++++++++ python/ray/rllib/policy/tf_policy_template.py | 146 +++++ python/ray/rllib/policy/torch_policy.py | 174 ++++++ .../ray/rllib/policy/torch_policy_template.py | 133 +++++ 11 files changed, 1639 insertions(+) create mode 100644 python/ray/rllib/evaluation/policy_graph.py create mode 100644 python/ray/rllib/evaluation/tf_policy_graph.py create mode 100644 python/ray/rllib/evaluation/torch_policy_graph.py create mode 100644 python/ray/rllib/keras_policy.py create mode 100644 python/ray/rllib/policy/__init__.py create mode 100644 python/ray/rllib/policy/dynamic_tf_policy.py create mode 100644 python/ray/rllib/policy/policy.py create mode 100644 python/ray/rllib/policy/tf_policy.py create mode 100644 python/ray/rllib/policy/tf_policy_template.py create mode 100644 python/ray/rllib/policy/torch_policy.py create mode 100644 python/ray/rllib/policy/torch_policy_template.py diff --git a/python/ray/rllib/evaluation/policy_graph.py b/python/ray/rllib/evaluation/policy_graph.py new file mode 100644 index 0000000000000..1039269e56cbd --- /dev/null +++ b/python/ray/rllib/evaluation/policy_graph.py @@ -0,0 +1,9 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.policy import Policy +from ray.rllib.utils import renamed_class + + +PolicyGraph = renamed_class(Policy, old_name="PolicyGraph") diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py new file mode 100644 index 0000000000000..06a5de9c9e51b --- /dev/null +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -0,0 +1,9 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.policy import TFPolicy +from ray.rllib.utils import renamed_class + + +TFPolicyGraph = renamed_class(TFPolicy, old_name="TFPolicyGraph") diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py new file mode 100644 index 0000000000000..56940dc215e37 --- /dev/null +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -0,0 +1,9 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.policy import TorchPolicy +from ray.rllib.utils import renamed_class + + +TorchPolicyGraph = renamed_class(TorchPolicy, old_name="TorchPolicyGraph") diff --git a/python/ray/rllib/keras_policy.py b/python/ray/rllib/keras_policy.py new file mode 100644 index 0000000000000..e10f6ec1d178f --- /dev/null +++ b/python/ray/rllib/keras_policy.py @@ -0,0 +1,65 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from ray.rllib.evaluation.policy import Policy + + +def _sample(probs): + return [np.random.choice(len(pr), p=pr) for pr in probs] + + +class KerasPolicy(Policy): + 
"""Initialize the Keras Policy. + + This is a Policy used for models with actor and critics. + Note: This class is built for specific usage of Actor-Critic models, + and is less general compared to TFPolicy and TorchPolicies. + + Args: + observation_space (gym.Space): Observation space of the policy. + action_space (gym.Space): Action space of the policy. + config (dict): Policy-specific configuration data. + actor (Model): A model that holds the policy. + critic (Model): A model that holds the value function. + """ + + def __init__(self, + observation_space, + action_space, + config, + actor=None, + critic=None): + Policy.__init__(self, observation_space, action_space, config) + self.actor = actor + self.critic = critic + self.models = [self.actor, self.critic] + + def compute_actions(self, obs, *args, **kwargs): + state = np.array(obs) + policy = self.actor.predict(state) + value = self.critic.predict(state) + return _sample(policy), [], {"vf_preds": value.flatten()} + + def learn_on_batch(self, batch, *args): + self.actor.fit( + batch["obs"], + batch["adv_targets"], + epochs=1, + verbose=0, + steps_per_epoch=20) + self.critic.fit( + batch["obs"], + batch["value_targets"], + epochs=1, + verbose=0, + steps_per_epoch=20) + return {} + + def get_weights(self): + return [model.get_weights() for model in self.models] + + def set_weights(self, weights): + return [model.set_weights(w) for model, w in zip(self.models, weights)] diff --git a/python/ray/rllib/policy/__init__.py b/python/ray/rllib/policy/__init__.py new file mode 100644 index 0000000000000..6d7bafabe442d --- /dev/null +++ b/python/ray/rllib/policy/__init__.py @@ -0,0 +1,18 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.torch_policy import TorchPolicy +from ray.rllib.policy.tf_policy import TFPolicy +from ray.rllib.policy.torch_policy_template import build_torch_policy +from ray.rllib.policy.tf_policy_template import build_tf_policy + + +__all__ = [ + "Policy", + "TFPolicy", + "TorchPolicy", + "build_tf_policy", + "build_torch_policy", +] diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py new file mode 100644 index 0000000000000..a82e751825a8f --- /dev/null +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -0,0 +1,275 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict +import logging +import numpy as np + +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.annotations import override +from ray.rllib.utils import try_import_tf +from ray.rllib.utils.debug import log_once, summarize +from ray.rllib.utils.tracking_dict import UsageTrackingDict + +tf = try_import_tf() + +logger = logging.getLogger(__name__) + + +class DynamicTFPolicy(TFPolicy): + """A TFPolicy that auto-defines placeholders dynamically at runtime. + + Initialization of this class occurs in two phases. + * Phase 1: the model is created and model variables are initialized. + * Phase 2: a fake batch of data is created, sent to the trajectory + postprocessor, and then used to create placeholders for the loss + function. The loss and stats functions are initialized with these + placeholders. 
+ """ + + def __init__(self, + obs_space, + action_space, + config, + loss_fn, + stats_fn=None, + grad_stats_fn=None, + before_loss_init=None, + make_action_sampler=None, + existing_inputs=None, + get_batch_divisibility_req=None): + """Initialize a dynamic TF policy. + + Arguments: + observation_space (gym.Space): Observation space of the policy. + action_space (gym.Space): Action space of the policy. + config (dict): Policy-specific configuration data. + loss_fn (func): function that returns a loss tensor the policy + graph, and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + TF fetches given the policy and batch input tensors + grad_stats_fn (func): optional function that returns a dict of + TF fetches given the policy and loss gradient tensors + before_loss_init (func): optional function to run prior to loss + init that takes the same arguments as __init__ + make_action_sampler (func): optional function that returns a + tuple of action and action prob tensors. The function takes + (policy, input_dict, obs_space, action_space, config) as its + arguments + existing_inputs (OrderedDict): when copying a policy, this + specifies an existing dict of placeholders to use instead of + defining new ones + get_batch_divisibility_req (func): optional function that returns + the divisibility requirement for sample batches + """ + self.config = config + self._loss_fn = loss_fn + self._stats_fn = stats_fn + self._grad_stats_fn = grad_stats_fn + + # Setup standard placeholders + if existing_inputs is not None: + obs = existing_inputs[SampleBatch.CUR_OBS] + prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS] + prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS] + else: + obs = tf.placeholder( + tf.float32, + shape=[None] + list(obs_space.shape), + name="observation") + prev_actions = ModelCatalog.get_action_placeholder(action_space) + prev_rewards = tf.placeholder( + tf.float32, [None], name="prev_reward") + + input_dict = { + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + } + + # Create the model network and action outputs + if make_action_sampler: + assert not existing_inputs, \ + "Cloning not supported with custom action sampler" + self.model = None + self.dist_class = None + self.action_dist = None + action_sampler, action_prob = make_action_sampler( + self, input_dict, obs_space, action_space, config) + else: + self.dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) + if existing_inputs: + existing_state_in = [ + v for k, v in existing_inputs.items() + if k.startswith("state_in_") + ] + if existing_state_in: + existing_seq_lens = existing_inputs["seq_lens"] + else: + existing_seq_lens = None + else: + existing_state_in = [] + existing_seq_lens = None + self.model = ModelCatalog.get_model( + input_dict, + obs_space, + action_space, + logit_dim, + self.config["model"], + state_in=existing_state_in, + seq_lens=existing_seq_lens) + self.action_dist = self.dist_class(self.model.outputs) + action_sampler = self.action_dist.sample() + action_prob = self.action_dist.sampled_action_prob() + + # Phase 1 init + sess = tf.get_default_session() + if get_batch_divisibility_req: + batch_divisibility_req = get_batch_divisibility_req(self) + else: + batch_divisibility_req = 1 + TFPolicy.__init__( + self, + obs_space, + action_space, + sess, + obs_input=obs, + action_sampler=action_sampler, + action_prob=action_prob, + loss=None, 
# dynamically initialized on run + loss_inputs=[], + model=self.model, + state_inputs=self.model and self.model.state_in, + state_outputs=self.model and self.model.state_out, + prev_action_input=prev_actions, + prev_reward_input=prev_rewards, + seq_lens=self.model and self.model.seq_lens, + max_seq_len=config["model"]["max_seq_len"], + batch_divisibility_req=batch_divisibility_req) + + # Phase 2 init + before_loss_init(self, obs_space, action_space, config) + if not existing_inputs: + self._initialize_loss() + + @override(TFPolicy) + def copy(self, existing_inputs): + """Creates a copy of self using existing input placeholders.""" + + # Note that there might be RNN state inputs at the end of the list + if self._state_inputs: + num_state_inputs = len(self._state_inputs) + 1 + else: + num_state_inputs = 0 + if len(self._loss_inputs) + num_state_inputs != len(existing_inputs): + raise ValueError("Tensor list mismatch", self._loss_inputs, + self._state_inputs, existing_inputs) + for i, (k, v) in enumerate(self._loss_inputs): + if v.shape.as_list() != existing_inputs[i].shape.as_list(): + raise ValueError("Tensor shape mismatch", i, k, v.shape, + existing_inputs[i].shape) + # By convention, the loss inputs are followed by state inputs and then + # the seq len tensor + rnn_inputs = [] + for i in range(len(self._state_inputs)): + rnn_inputs.append(("state_in_{}".format(i), + existing_inputs[len(self._loss_inputs) + i])) + if rnn_inputs: + rnn_inputs.append(("seq_lens", existing_inputs[-1])) + input_dict = OrderedDict( + [(k, existing_inputs[i]) + for i, (k, _) in enumerate(self._loss_inputs)] + rnn_inputs) + instance = self.__class__( + self.observation_space, + self.action_space, + self.config, + existing_inputs=input_dict) + loss = instance._loss_fn(instance, input_dict) + if instance._stats_fn: + instance._stats_fetches.update( + instance._stats_fn(instance, input_dict)) + TFPolicy._initialize_loss( + instance, loss, [(k, existing_inputs[i]) + for i, (k, _) in enumerate(self._loss_inputs)]) + if instance._grad_stats_fn: + instance._stats_fetches.update( + instance._grad_stats_fn(instance, instance._grads)) + return instance + + @override(Policy) + def get_initial_state(self): + if self.model: + return self.model.state_init + else: + return [] + + def _initialize_loss(self): + def fake_array(tensor): + shape = tensor.shape.as_list() + shape[0] = 1 + return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype) + + dummy_batch = { + SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), + SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), + SampleBatch.CUR_OBS: fake_array(self._obs_input), + SampleBatch.NEXT_OBS: fake_array(self._obs_input), + SampleBatch.ACTIONS: fake_array(self._prev_action_input), + SampleBatch.REWARDS: np.array([0], dtype=np.float32), + SampleBatch.DONES: np.array([False], dtype=np.bool), + } + state_init = self.get_initial_state() + for i, h in enumerate(state_init): + dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0) + dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0) + if state_init: + dummy_batch["seq_lens"] = np.array([1], dtype=np.int32) + for k, v in self.extra_compute_action_fetches().items(): + dummy_batch[k] = fake_array(v) + + # postprocessing might depend on variable init, so run it first here + self._sess.run(tf.global_variables_initializer()) + postprocessed_batch = self.postprocess_trajectory( + SampleBatch(dummy_batch)) + + batch_tensors = UsageTrackingDict({ + SampleBatch.PREV_ACTIONS: self._prev_action_input, + 
SampleBatch.PREV_REWARDS: self._prev_reward_input, + SampleBatch.CUR_OBS: self._obs_input, + }) + loss_inputs = [ + (SampleBatch.PREV_ACTIONS, self._prev_action_input), + (SampleBatch.PREV_REWARDS, self._prev_reward_input), + (SampleBatch.CUR_OBS, self._obs_input), + ] + + for k, v in postprocessed_batch.items(): + if k in batch_tensors: + continue + elif v.dtype == np.object: + continue # can't handle arbitrary objects in TF + shape = (None, ) + v.shape[1:] + dtype = np.float32 if v.dtype == np.float64 else v.dtype + placeholder = tf.placeholder(dtype, shape=shape, name=k) + batch_tensors[k] = placeholder + + if log_once("loss_init"): + logger.info( + "Initializing loss function with dummy input:\n\n{}\n".format( + summarize(batch_tensors))) + + loss = self._loss_fn(self, batch_tensors) + if self._stats_fn: + self._stats_fetches.update(self._stats_fn(self, batch_tensors)) + for k in sorted(batch_tensors.accessed_keys): + loss_inputs.append((k, batch_tensors[k])) + TFPolicy._initialize_loss(self, loss, loss_inputs) + if self._grad_stats_fn: + self._stats_fetches.update(self._grad_stats_fn(self, self._grads)) + self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/policy/policy.py b/python/ray/rllib/policy/policy.py new file mode 100644 index 0000000000000..72393e7826c53 --- /dev/null +++ b/python/ray/rllib/policy/policy.py @@ -0,0 +1,287 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import gym + +from ray.rllib.utils.annotations import DeveloperAPI + + +@DeveloperAPI +class Policy(object): + """An agent policy and loss, i.e., a TFPolicy or other subclass. + + This object defines how to act in the environment, and also losses used to + improve the policy based on its experiences. Note that both policy and + loss are defined together for convenience, though the policy itself is + logically separate. + + All policies can directly extend Policy, however TensorFlow users may + find TFPolicy simpler to implement. TFPolicy also enables RLlib + to apply TensorFlow-specific optimizations such as fusing multiple policy + graphs and multi-GPU support. + + Attributes: + observation_space (gym.Space): Observation space of the policy. + action_space (gym.Space): Action space of the policy. + """ + + @DeveloperAPI + def __init__(self, observation_space, action_space, config): + """Initialize the graph. + + This is the standard constructor for policies. The policy + class you pass into PolicyEvaluator will be constructed with + these arguments. + + Args: + observation_space (gym.Space): Observation space of the policy. + action_space (gym.Space): Action space of the policy. + config (dict): Policy-specific configuration data. + """ + + self.observation_space = observation_space + self.action_space = action_space + + @DeveloperAPI + def compute_actions(self, + obs_batch, + state_batches, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + """Compute actions for the current policy. + + Arguments: + obs_batch (np.ndarray): batch of observations + state_batches (list): list of RNN state input batches, if any + prev_action_batch (np.ndarray): batch of previous action values + prev_reward_batch (np.ndarray): batch of previous rewards + info_batch (info): batch of info objects + episodes (list): MultiAgentEpisode for each obs in obs_batch. 
+ This provides access to all of the internal episode state, + which may be useful for model-based or multiagent algorithms. + kwargs: forward compatibility placeholder + + Returns: + actions (np.ndarray): batch of output actions, with shape like + [BATCH_SIZE, ACTION_SHAPE]. + state_outs (list): list of RNN state output batches, if any, with + shape like [STATE_SIZE, BATCH_SIZE]. + info (dict): dictionary of extra feature batches, if any, with + shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}. + """ + raise NotImplementedError + + @DeveloperAPI + def compute_single_action(self, + obs, + state, + prev_action=None, + prev_reward=None, + info=None, + episode=None, + clip_actions=False, + **kwargs): + """Unbatched version of compute_actions. + + Arguments: + obs (obj): single observation + state_batches (list): list of RNN state inputs, if any + prev_action (obj): previous action value, if any + prev_reward (int): previous reward, if any + info (dict): info object, if any + episode (MultiAgentEpisode): this provides access to all of the + internal episode state, which may be useful for model-based or + multi-agent algorithms. + clip_actions (bool): should the action be clipped + kwargs: forward compatibility placeholder + + Returns: + actions (obj): single action + state_outs (list): list of RNN state outputs, if any + info (dict): dictionary of extra features, if any + """ + + prev_action_batch = None + prev_reward_batch = None + info_batch = None + episodes = None + if prev_action is not None: + prev_action_batch = [prev_action] + if prev_reward is not None: + prev_reward_batch = [prev_reward] + if info is not None: + info_batch = [info] + if episode is not None: + episodes = [episode] + [action], state_out, info = self.compute_actions( + [obs], [[s] for s in state], + prev_action_batch=prev_action_batch, + prev_reward_batch=prev_reward_batch, + info_batch=info_batch, + episodes=episodes) + if clip_actions: + action = clip_action(action, self.action_space) + return action, [s[0] for s in state_out], \ + {k: v[0] for k, v in info.items()} + + @DeveloperAPI + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + """Implements algorithm-specific trajectory postprocessing. + + This will be called on each trajectory fragment computed during policy + evaluation. Each fragment is guaranteed to be only from one episode. + + Arguments: + sample_batch (SampleBatch): batch of experiences for the policy, + which will contain at most one episode trajectory. + other_agent_batches (dict): In a multi-agent env, this contains a + mapping of agent ids to (policy, agent_batch) tuples + containing the policy and experiences of the other agent. + episode (MultiAgentEpisode): this provides access to all of the + internal episode state, which may be useful for model-based or + multi-agent algorithms. + + Returns: + SampleBatch: postprocessed sample batch. + """ + return sample_batch + + @DeveloperAPI + def learn_on_batch(self, samples): + """Fused compute gradients and apply gradients call. + + Either this or the combination of compute/apply grads must be + implemented by subclasses. + + Returns: + grad_info: dictionary of extra metadata from compute_gradients(). 
+ + Examples: + >>> batch = ev.sample() + >>> ev.learn_on_batch(samples) + """ + + grads, grad_info = self.compute_gradients(samples) + self.apply_gradients(grads) + return grad_info + + @DeveloperAPI + def compute_gradients(self, postprocessed_batch): + """Computes gradients against a batch of experiences. + + Either this or learn_on_batch() must be implemented by subclasses. + + Returns: + grads (list): List of gradient output values + info (dict): Extra policy-specific values + """ + raise NotImplementedError + + @DeveloperAPI + def apply_gradients(self, gradients): + """Applies previously computed gradients. + + Either this or learn_on_batch() must be implemented by subclasses. + """ + raise NotImplementedError + + @DeveloperAPI + def get_weights(self): + """Returns model weights. + + Returns: + weights (obj): Serializable copy or view of model weights + """ + raise NotImplementedError + + @DeveloperAPI + def set_weights(self, weights): + """Sets model weights. + + Arguments: + weights (obj): Serializable copy or view of model weights + """ + raise NotImplementedError + + @DeveloperAPI + def get_initial_state(self): + """Returns initial RNN state for the current policy.""" + return [] + + @DeveloperAPI + def get_state(self): + """Saves all local state. + + Returns: + state (obj): Serialized local state. + """ + return self.get_weights() + + @DeveloperAPI + def set_state(self, state): + """Restores all local state. + + Arguments: + state (obj): Serialized local state. + """ + self.set_weights(state) + + @DeveloperAPI + def on_global_var_update(self, global_vars): + """Called on an update to global vars. + + Arguments: + global_vars (dict): Global variables broadcast from the driver. + """ + pass + + @DeveloperAPI + def export_model(self, export_dir): + """Export Policy to local directory for serving. + + Arguments: + export_dir (str): Local writable directory. + """ + raise NotImplementedError + + @DeveloperAPI + def export_checkpoint(self, export_dir): + """Export Policy checkpoint to local directory. + + Argument: + export_dir (str): Local writable directory. + """ + raise NotImplementedError + + +def clip_action(action, space): + """Called to clip actions to the specified range of this policy. + + Arguments: + action: Single action. + space: Action space the actions should be present in. + + Returns: + Clipped batch of actions. 
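A small usage sketch of the clipping behaviour described here, assuming ``clip_action`` is imported from this module: Box entries are clipped elementwise to the space bounds, Tuple spaces are handled recursively, and all other spaces pass actions through unchanged.

.. code-block:: python

    import gym
    import numpy as np

    from ray.rllib.policy.policy import clip_action

    space = gym.spaces.Tuple([
        gym.spaces.Box(low=-1.0, high=1.0, shape=(1,)),
        gym.spaces.Discrete(3),
    ])
    clip_action((np.array([2.5]), 1), space)
    # -> [array([1.]), 1]  (Box entry clipped, Discrete entry unchanged)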
+ """ + + if isinstance(space, gym.spaces.Box): + return np.clip(action, space.low, space.high) + elif isinstance(space, gym.spaces.Tuple): + if type(action) not in (tuple, list): + raise ValueError("Expected tuple space for actions {}: {}".format( + action, space)) + out = [] + for a, s in zip(action, space.spaces): + out.append(clip_action(a, s)) + return out + else: + return action diff --git a/python/ray/rllib/policy/tf_policy.py b/python/ray/rllib/policy/tf_policy.py new file mode 100644 index 0000000000000..c6cc619af524d --- /dev/null +++ b/python/ray/rllib/policy/tf_policy.py @@ -0,0 +1,514 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import errno +import logging +import numpy as np + +import ray +import ray.experimental.tf_utils +from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY +from ray.rllib.evaluation.policy import Policy +from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.models.lstm import chop_into_sequences +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.debug import log_once, summarize +from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule +from ray.rllib.utils.tf_run_builder import TFRunBuilder +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class TFPolicy(Policy): + """An agent policy and loss implemented in TensorFlow. + + Extending this class enables RLlib to perform TensorFlow specific + optimizations on the policy, e.g., parallelization across gpus or + fusing multiple graphs together in the multi-agent setting. + + Input tensors are typically shaped like [BATCH_SIZE, ...]. + + Attributes: + observation_space (gym.Space): observation space of the policy. + action_space (gym.Space): action space of the policy. + model (rllib.models.Model): RLlib model used for the policy. + + Examples: + >>> policy = TFPolicySubclass( + sess, obs_input, action_sampler, loss, loss_inputs) + + >>> print(policy.compute_actions([1, 0, 2])) + (array([0, 1, 1]), [], {}) + + >>> print(policy.postprocess_trajectory(SampleBatch({...}))) + SampleBatch({"action": ..., "advantages": ..., ...}) + """ + + @DeveloperAPI + def __init__(self, + observation_space, + action_space, + sess, + obs_input, + action_sampler, + loss, + loss_inputs, + model=None, + action_prob=None, + state_inputs=None, + state_outputs=None, + prev_action_input=None, + prev_reward_input=None, + seq_lens=None, + max_seq_len=20, + batch_divisibility_req=1, + update_ops=None): + """Initialize the policy. + + Arguments: + observation_space (gym.Space): Observation space of the env. + action_space (gym.Space): Action space of the env. + sess (Session): TensorFlow session to use. + obs_input (Tensor): input placeholder for observations, of shape + [BATCH_SIZE, obs...]. + action_sampler (Tensor): Tensor for sampling an action, of shape + [BATCH_SIZE, action...] + loss (Tensor): scalar policy loss output tensor. + loss_inputs (list): a (name, placeholder) tuple for each loss + input argument. Each placeholder name must correspond to a + SampleBatch column key returned by postprocess_trajectory(), + and has shape [BATCH_SIZE, data...]. These keys will be read + from postprocessed sample batches and fed into the specified + placeholders during loss computation. + model (rllib.models.Model): used to integrate custom losses and + stats from user-defined RLlib models. 
+ action_prob (Tensor): probability of the sampled action. + state_inputs (list): list of RNN state input Tensors. + state_outputs (list): list of RNN state output Tensors. + prev_action_input (Tensor): placeholder for previous actions + prev_reward_input (Tensor): placeholder for previous rewards + seq_lens (Tensor): placeholder for RNN sequence lengths, of shape + [NUM_SEQUENCES]. Note that NUM_SEQUENCES << BATCH_SIZE. See + models/lstm.py for more information. + max_seq_len (int): max sequence length for LSTM training. + batch_divisibility_req (int): pad all agent experiences batches to + multiples of this value. This only has an effect if not using + a LSTM model. + update_ops (list): override the batchnorm update ops to run when + applying gradients. Otherwise we run all update ops found in + the current variable scope. + """ + + self.observation_space = observation_space + self.action_space = action_space + self.model = model + self._sess = sess + self._obs_input = obs_input + self._prev_action_input = prev_action_input + self._prev_reward_input = prev_reward_input + self._sampler = action_sampler + self._is_training = self._get_is_training_placeholder() + self._action_prob = action_prob + self._state_inputs = state_inputs or [] + self._state_outputs = state_outputs or [] + self._seq_lens = seq_lens + self._max_seq_len = max_seq_len + self._batch_divisibility_req = batch_divisibility_req + self._update_ops = update_ops + self._stats_fetches = {} + + if loss is not None: + self._initialize_loss(loss, loss_inputs) + else: + self._loss = None + + if len(self._state_inputs) != len(self._state_outputs): + raise ValueError( + "Number of state input and output tensors must match, got: " + "{} vs {}".format(self._state_inputs, self._state_outputs)) + if len(self.get_initial_state()) != len(self._state_inputs): + raise ValueError( + "Length of initial state must match number of state inputs, " + "got: {} vs {}".format(self.get_initial_state(), + self._state_inputs)) + if self._state_inputs and self._seq_lens is None: + raise ValueError( + "seq_lens tensor must be given if state inputs are defined") + + def _initialize_loss(self, loss, loss_inputs): + self._loss_inputs = loss_inputs + self._loss_input_dict = dict(self._loss_inputs) + for i, ph in enumerate(self._state_inputs): + self._loss_input_dict["state_in_{}".format(i)] = ph + + if self.model: + self._loss = self.model.custom_loss(loss, self._loss_input_dict) + self._stats_fetches.update({"model": self.model.custom_stats()}) + else: + self._loss = loss + + self._optimizer = self.optimizer() + self._grads_and_vars = [ + (g, v) for (g, v) in self.gradients(self._optimizer, self._loss) + if g is not None + ] + self._grads = [g for (g, v) in self._grads_and_vars] + self._variables = ray.experimental.tf_utils.TensorFlowVariables( + self._loss, self._sess) + + # gather update ops for any batch norm layers + if not self._update_ops: + self._update_ops = tf.get_collection( + tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name) + if self._update_ops: + logger.debug("Update ops to run on apply gradient: {}".format( + self._update_ops)) + with tf.control_dependencies(self._update_ops): + self._apply_op = self.build_apply_op(self._optimizer, + self._grads_and_vars) + + if log_once("loss_used"): + logger.debug( + "These tensors were used in the loss_fn:\n\n{}\n".format( + summarize(self._loss_input_dict))) + + self._sess.run(tf.global_variables_initializer()) + + @override(Policy) + def compute_actions(self, + obs_batch, + state_batches=None, 
+ prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + builder = TFRunBuilder(self._sess, "compute_actions") + fetches = self._build_compute_actions(builder, obs_batch, + state_batches, prev_action_batch, + prev_reward_batch) + return builder.get(fetches) + + @override(Policy) + def compute_gradients(self, postprocessed_batch): + assert self._loss is not None, "Loss not initialized" + builder = TFRunBuilder(self._sess, "compute_gradients") + fetches = self._build_compute_gradients(builder, postprocessed_batch) + return builder.get(fetches) + + @override(Policy) + def apply_gradients(self, gradients): + assert self._loss is not None, "Loss not initialized" + builder = TFRunBuilder(self._sess, "apply_gradients") + fetches = self._build_apply_gradients(builder, gradients) + builder.get(fetches) + + @override(Policy) + def learn_on_batch(self, postprocessed_batch): + assert self._loss is not None, "Loss not initialized" + builder = TFRunBuilder(self._sess, "learn_on_batch") + fetches = self._build_learn_on_batch(builder, postprocessed_batch) + return builder.get(fetches) + + @override(Policy) + def get_weights(self): + return self._variables.get_flat() + + @override(Policy) + def set_weights(self, weights): + return self._variables.set_flat(weights) + + @override(Policy) + def export_model(self, export_dir): + """Export tensorflow graph to export_dir for serving.""" + with self._sess.graph.as_default(): + builder = tf.saved_model.builder.SavedModelBuilder(export_dir) + signature_def_map = self._build_signature_def() + builder.add_meta_graph_and_variables( + self._sess, [tf.saved_model.tag_constants.SERVING], + signature_def_map=signature_def_map) + builder.save() + + @override(Policy) + def export_checkpoint(self, export_dir, filename_prefix="model"): + """Export tensorflow checkpoint to export_dir.""" + try: + os.makedirs(export_dir) + except OSError as e: + # ignore error if export dir already exists + if e.errno != errno.EEXIST: + raise + save_path = os.path.join(export_dir, filename_prefix) + with self._sess.graph.as_default(): + saver = tf.train.Saver() + saver.save(self._sess, save_path) + + @DeveloperAPI + def copy(self, existing_inputs): + """Creates a copy of self using existing input placeholders. + + Optional, only required to work with the multi-GPU optimizer.""" + raise NotImplementedError + + @DeveloperAPI + def extra_compute_action_feed_dict(self): + """Extra dict to pass to the compute actions session run.""" + return {} + + @DeveloperAPI + def extra_compute_action_fetches(self): + """Extra values to fetch and return from compute_actions(). + + By default we only return action probability info (if present). + """ + if self._action_prob is not None: + return {"action_prob": self._action_prob} + else: + return {} + + @DeveloperAPI + def extra_compute_grad_feed_dict(self): + """Extra dict to pass to the compute gradients session run.""" + return {} # e.g, kl_coeff + + @DeveloperAPI + def extra_compute_grad_fetches(self): + """Extra values to fetch and return from compute_gradients().""" + return {LEARNER_STATS_KEY: {}} # e.g, stats, td error, etc. 
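For reference, a hedged sketch of how the two export hooks implemented above might be used once a ``TFPolicy`` instance is in hand; the ``policy`` variable and the output paths are assumptions, not part of this patch:

.. code-block:: python

    # `policy` is assumed to be a constructed TFPolicy (or subclass)
    policy.export_model("/tmp/my_policy_savedmodel")  # SavedModel for serving
    policy.export_checkpoint("/tmp/my_policy_ckpt")   # tf.train.Saver checkpoint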
+ + @DeveloperAPI + def optimizer(self): + """TF optimizer to use for policy optimization.""" + if hasattr(self, "config"): + return tf.train.AdamOptimizer(self.config["lr"]) + else: + return tf.train.AdamOptimizer() + + @DeveloperAPI + def gradients(self, optimizer, loss): + """Override for custom gradient computation.""" + return optimizer.compute_gradients(loss) + + @DeveloperAPI + def build_apply_op(self, optimizer, grads_and_vars): + """Override for custom gradient apply computation.""" + + # specify global_step for TD3 which needs to count the num updates + return optimizer.apply_gradients( + self._grads_and_vars, + global_step=tf.train.get_or_create_global_step()) + + @DeveloperAPI + def _get_is_training_placeholder(self): + """Get the placeholder for _is_training, i.e., for batch norm layers. + + This can be called safely before __init__ has run. + """ + if not hasattr(self, "_is_training"): + self._is_training = tf.placeholder_with_default(False, ()) + return self._is_training + + def _extra_input_signature_def(self): + """Extra input signatures to add when exporting tf model. + Inferred from extra_compute_action_feed_dict() + """ + feed_dict = self.extra_compute_action_feed_dict() + return { + k.name: tf.saved_model.utils.build_tensor_info(k) + for k in feed_dict.keys() + } + + def _extra_output_signature_def(self): + """Extra output signatures to add when exporting tf model. + Inferred from extra_compute_action_fetches() + """ + fetches = self.extra_compute_action_fetches() + return { + k: tf.saved_model.utils.build_tensor_info(fetches[k]) + for k in fetches.keys() + } + + def _build_signature_def(self): + """Build signature def map for tensorflow SavedModelBuilder. + """ + # build input signatures + input_signature = self._extra_input_signature_def() + input_signature["observations"] = \ + tf.saved_model.utils.build_tensor_info(self._obs_input) + + if self._seq_lens is not None: + input_signature["seq_lens"] = \ + tf.saved_model.utils.build_tensor_info(self._seq_lens) + if self._prev_action_input is not None: + input_signature["prev_action"] = \ + tf.saved_model.utils.build_tensor_info(self._prev_action_input) + if self._prev_reward_input is not None: + input_signature["prev_reward"] = \ + tf.saved_model.utils.build_tensor_info(self._prev_reward_input) + input_signature["is_training"] = \ + tf.saved_model.utils.build_tensor_info(self._is_training) + + for state_input in self._state_inputs: + input_signature[state_input.name] = \ + tf.saved_model.utils.build_tensor_info(state_input) + + # build output signatures + output_signature = self._extra_output_signature_def() + output_signature["actions"] = \ + tf.saved_model.utils.build_tensor_info(self._sampler) + for state_output in self._state_outputs: + output_signature[state_output.name] = \ + tf.saved_model.utils.build_tensor_info(state_output) + signature_def = ( + tf.saved_model.signature_def_utils.build_signature_def( + input_signature, output_signature, + tf.saved_model.signature_constants.PREDICT_METHOD_NAME)) + signature_def_key = (tf.saved_model.signature_constants. + DEFAULT_SERVING_SIGNATURE_DEF_KEY) + signature_def_map = {signature_def_key: signature_def} + return signature_def_map + + def _build_compute_actions(self, + builder, + obs_batch, + state_batches=None, + prev_action_batch=None, + prev_reward_batch=None, + episodes=None): + state_batches = state_batches or [] + if len(self._state_inputs) != len(state_batches): + raise ValueError( + "Must pass in RNN state batches for placeholders {}, got {}". 
+ format(self._state_inputs, state_batches)) + builder.add_feed_dict(self.extra_compute_action_feed_dict()) + builder.add_feed_dict({self._obs_input: obs_batch}) + if state_batches: + builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))}) + if self._prev_action_input is not None and prev_action_batch: + builder.add_feed_dict({self._prev_action_input: prev_action_batch}) + if self._prev_reward_input is not None and prev_reward_batch: + builder.add_feed_dict({self._prev_reward_input: prev_reward_batch}) + builder.add_feed_dict({self._is_training: False}) + builder.add_feed_dict(dict(zip(self._state_inputs, state_batches))) + fetches = builder.add_fetches([self._sampler] + self._state_outputs + + [self.extra_compute_action_fetches()]) + return fetches[0], fetches[1:-1], fetches[-1] + + def _build_compute_gradients(self, builder, postprocessed_batch): + builder.add_feed_dict(self.extra_compute_grad_feed_dict()) + builder.add_feed_dict({self._is_training: True}) + builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch)) + fetches = builder.add_fetches( + [self._grads, self._get_grad_and_stats_fetches()]) + return fetches[0], fetches[1] + + def _build_apply_gradients(self, builder, gradients): + if len(gradients) != len(self._grads): + raise ValueError( + "Unexpected number of gradients to apply, got {} for {}". + format(gradients, self._grads)) + builder.add_feed_dict({self._is_training: True}) + builder.add_feed_dict(dict(zip(self._grads, gradients))) + fetches = builder.add_fetches([self._apply_op]) + return fetches[0] + + def _build_learn_on_batch(self, builder, postprocessed_batch): + builder.add_feed_dict(self.extra_compute_grad_feed_dict()) + builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch)) + builder.add_feed_dict({self._is_training: True}) + fetches = builder.add_fetches([ + self._apply_op, + self._get_grad_and_stats_fetches(), + ]) + return fetches[1] + + def _get_grad_and_stats_fetches(self): + fetches = self.extra_compute_grad_fetches() + if LEARNER_STATS_KEY not in fetches: + raise ValueError( + "Grad fetches should contain 'stats': {...} entry") + if self._stats_fetches: + fetches[LEARNER_STATS_KEY] = dict(self._stats_fetches, + **fetches[LEARNER_STATS_KEY]) + return fetches + + def _get_loss_inputs_dict(self, batch): + feed_dict = {} + if self._batch_divisibility_req > 1: + meets_divisibility_reqs = ( + len(batch[SampleBatch.CUR_OBS]) % + self._batch_divisibility_req == 0 + and max(batch[SampleBatch.AGENT_INDEX]) == 0) # not multiagent + else: + meets_divisibility_reqs = True + + # Simple case: not RNN nor do we need to pad + if not self._state_inputs and meets_divisibility_reqs: + for k, ph in self._loss_inputs: + feed_dict[ph] = batch[k] + return feed_dict + + if self._state_inputs: + max_seq_len = self._max_seq_len + dynamic_max = True + else: + max_seq_len = self._batch_divisibility_req + dynamic_max = False + + # RNN or multi-agent case + feature_keys = [k for k, v in self._loss_inputs] + state_keys = [ + "state_in_{}".format(i) for i in range(len(self._state_inputs)) + ] + feature_sequences, initial_states, seq_lens = chop_into_sequences( + batch[SampleBatch.EPS_ID], + batch[SampleBatch.UNROLL_ID], + batch[SampleBatch.AGENT_INDEX], [batch[k] for k in feature_keys], + [batch[k] for k in state_keys], + max_seq_len, + dynamic_max=dynamic_max) + for k, v in zip(feature_keys, feature_sequences): + feed_dict[self._loss_input_dict[k]] = v + for k, v in zip(state_keys, initial_states): + feed_dict[self._loss_input_dict[k]] = v + 
        feed_dict[self._seq_lens] = seq_lens
+
+        if log_once("rnn_feed_dict"):
+            logger.info("Padded input for RNN:\n\n{}\n".format(
+                summarize({
+                    "features": feature_sequences,
+                    "initial_states": initial_states,
+                    "seq_lens": seq_lens,
+                    "max_seq_len": max_seq_len,
+                })))
+        return feed_dict
+
+
+@DeveloperAPI
+class LearningRateSchedule(object):
+    """Mixin for TFPolicy that adds a learning rate schedule."""
+
+    @DeveloperAPI
+    def __init__(self, lr, lr_schedule):
+        self.cur_lr = tf.get_variable("lr", initializer=lr)
+        if lr_schedule is None:
+            self.lr_schedule = ConstantSchedule(lr)
+        else:
+            self.lr_schedule = PiecewiseSchedule(
+                lr_schedule, outside_value=lr_schedule[-1][-1])
+
+    @override(Policy)
+    def on_global_var_update(self, global_vars):
+        super(LearningRateSchedule, self).on_global_var_update(global_vars)
+        self.cur_lr.load(
+            self.lr_schedule.value(global_vars["timestep"]),
+            session=self._sess)
+
+    @override(TFPolicy)
+    def optimizer(self):
+        return tf.train.AdamOptimizer(self.cur_lr)
diff --git a/python/ray/rllib/policy/tf_policy_template.py b/python/ray/rllib/policy/tf_policy_template.py
new file mode 100644
index 0000000000000..f9aecfc176cc8
--- /dev/null
+++ b/python/ray/rllib/policy/tf_policy_template.py
@@ -0,0 +1,146 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ray.rllib.evaluation.dynamic_tf_policy import DynamicTFPolicy
+from ray.rllib.evaluation.policy import Policy
+from ray.rllib.evaluation.tf_policy import TFPolicy
+from ray.rllib.utils.annotations import override, DeveloperAPI
+
+
+@DeveloperAPI
+def build_tf_policy(name,
+                    loss_fn,
+                    get_default_config=None,
+                    stats_fn=None,
+                    grad_stats_fn=None,
+                    extra_action_fetches_fn=None,
+                    postprocess_fn=None,
+                    optimizer_fn=None,
+                    gradients_fn=None,
+                    before_init=None,
+                    before_loss_init=None,
+                    after_init=None,
+                    make_action_sampler=None,
+                    mixins=None,
+                    get_batch_divisibility_req=None):
+    """Helper function for creating a dynamic tf policy at runtime.
+
+    Arguments:
+        name (str): name of the policy (e.g., "PPOTFPolicy")
+        loss_fn (func): function that returns a loss tensor given the policy
+            and a dict of experience tensor placeholders
+        get_default_config (func): optional function that returns the default
+            config to merge with any overrides
+        stats_fn (func): optional function that returns a dict of
+            TF fetches given the policy and batch input tensors
+        grad_stats_fn (func): optional function that returns a dict of
+            TF fetches given the policy and loss gradient tensors
+        extra_action_fetches_fn (func): optional function that returns
+            a dict of TF fetches given the policy object
+        postprocess_fn (func): optional experience postprocessing function
+            that takes the same args as Policy.postprocess_trajectory()
+        optimizer_fn (func): optional function that returns a tf.Optimizer
+            given the policy and config
+        gradients_fn (func): optional function that returns a list of gradients
+            given a tf optimizer and loss tensor.
If not specified, this + defaults to optimizer.compute_gradients(loss) + before_init (func): optional function to run at the beginning of + policy init that takes the same arguments as the policy constructor + before_loss_init (func): optional function to run prior to loss + init that takes the same arguments as the policy constructor + after_init (func): optional function to run at the end of policy init + that takes the same arguments as the policy constructor + make_action_sampler (func): optional function that returns a + tuple of action and action prob tensors. The function takes + (policy, input_dict, obs_space, action_space, config) as its + arguments + mixins (list): list of any class mixins for the returned policy class. + These mixins will be applied in order and will have higher + precedence than the DynamicTFPolicy class + get_batch_divisibility_req (func): optional function that returns + the divisibility requirement for sample batches + + Returns: + a DynamicTFPolicy instance that uses the specified args + """ + + if not name.endswith("TFPolicy"): + raise ValueError("Name should match *TFPolicy", name) + + base = DynamicTFPolicy + while mixins: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + class policy_cls(base): + def __init__(self, + obs_space, + action_space, + config, + existing_inputs=None): + if get_default_config: + config = dict(get_default_config(), **config) + + if before_init: + before_init(self, obs_space, action_space, config) + + def before_loss_init_wrapper(policy, obs_space, action_space, + config): + if before_loss_init: + before_loss_init(policy, obs_space, action_space, config) + if extra_action_fetches_fn is None: + self._extra_action_fetches = {} + else: + self._extra_action_fetches = extra_action_fetches_fn(self) + + DynamicTFPolicy.__init__( + self, + obs_space, + action_space, + config, + loss_fn, + stats_fn=stats_fn, + grad_stats_fn=grad_stats_fn, + before_loss_init=before_loss_init_wrapper, + existing_inputs=existing_inputs) + + if after_init: + after_init(self, obs_space, action_space, config) + + @override(Policy) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not postprocess_fn: + return sample_batch + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) + + @override(TFPolicy) + def optimizer(self): + if optimizer_fn: + return optimizer_fn(self, self.config) + else: + return TFPolicy.optimizer(self) + + @override(TFPolicy) + def gradients(self, optimizer, loss): + if gradients_fn: + return gradients_fn(self, optimizer, loss) + else: + return TFPolicy.gradients(self, optimizer, loss) + + @override(TFPolicy) + def extra_compute_action_fetches(self): + return dict( + TFPolicy.extra_compute_action_fetches(self), + **self._extra_action_fetches) + + policy_cls.__name__ = name + policy_cls.__qualname__ = name + return policy_cls diff --git a/python/ray/rllib/policy/torch_policy.py b/python/ray/rllib/policy/torch_policy.py new file mode 100644 index 0000000000000..cd0602935a0a3 --- /dev/null +++ b/python/ray/rllib/policy/torch_policy.py @@ -0,0 +1,174 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import numpy as np +from threading import Lock + +try: + import torch +except ImportError: + pass # soft dep + +from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY +from ray.rllib.evaluation.policy import Policy +from ray.rllib.utils.annotations import override +from 
ray.rllib.utils.tracking_dict import UsageTrackingDict + + +class TorchPolicy(Policy): + """Template for a PyTorch policy and loss to use with RLlib. + + This is similar to TFPolicy, but for PyTorch. + + Attributes: + observation_space (gym.Space): observation space of the policy. + action_space (gym.Space): action space of the policy. + lock (Lock): Lock that must be held around PyTorch ops on this graph. + This is necessary when using the async sampler. + """ + + def __init__(self, observation_space, action_space, model, loss, + action_distribution_cls): + """Build a policy from policy and loss torch modules. + + Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES + is set. Only single GPU is supported for now. + + Arguments: + observation_space (gym.Space): observation space of the policy. + action_space (gym.Space): action space of the policy. + model (nn.Module): PyTorch policy module. Given observations as + input, this module must return a list of outputs where the + first item is action logits, and the rest can be any value. + loss (func): Function that takes (policy, batch_tensors) + and returns a single scalar loss. + action_distribution_cls (ActionDistribution): Class for action + distribution. + """ + self.observation_space = observation_space + self.action_space = action_space + self.lock = Lock() + self.device = (torch.device("cuda") + if bool(os.environ.get("CUDA_VISIBLE_DEVICES", None)) + else torch.device("cpu")) + self._model = model.to(self.device) + self._loss = loss + self._optimizer = self.optimizer() + self._action_dist_cls = action_distribution_cls + + @override(Policy) + def compute_actions(self, + obs_batch, + state_batches=None, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + with self.lock: + with torch.no_grad(): + ob = torch.from_numpy(np.array(obs_batch)) \ + .float().to(self.device) + model_out = self._model({"obs": ob}, state_batches) + logits, _, vf, state = model_out + action_dist = self._action_dist_cls(logits) + actions = action_dist.sample() + return (actions.cpu().numpy(), + [h.cpu().numpy() for h in state], + self.extra_action_out(model_out)) + + @override(Policy) + def learn_on_batch(self, postprocessed_batch): + batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + + with self.lock: + loss_out = self._loss(self, batch_tensors) + self._optimizer.zero_grad() + loss_out.backward() + + grad_process_info = self.extra_grad_process() + self._optimizer.step() + + grad_info = self.extra_grad_info(batch_tensors) + grad_info.update(grad_process_info) + return {LEARNER_STATS_KEY: grad_info} + + @override(Policy) + def compute_gradients(self, postprocessed_batch): + batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + + with self.lock: + loss_out = self._loss(self, batch_tensors) + self._optimizer.zero_grad() + loss_out.backward() + + grad_process_info = self.extra_grad_process() + + # Note that return values are just references; + # calling zero_grad will modify the values + grads = [] + for p in self._model.parameters(): + if p.grad is not None: + grads.append(p.grad.data.cpu().numpy()) + else: + grads.append(None) + + grad_info = self.extra_grad_info(batch_tensors) + grad_info.update(grad_process_info) + return grads, {LEARNER_STATS_KEY: grad_info} + + @override(Policy) + def apply_gradients(self, gradients): + with self.lock: + for g, p in zip(gradients, self._model.parameters()): + if g is not None: + p.grad = torch.from_numpy(g).to(self.device) + 
            self._optimizer.step()
+
+    @override(Policy)
+    def get_weights(self):
+        with self.lock:
+            return {k: v.cpu() for k, v in self._model.state_dict().items()}
+
+    @override(Policy)
+    def set_weights(self, weights):
+        with self.lock:
+            self._model.load_state_dict(weights)
+
+    @override(Policy)
+    def get_initial_state(self):
+        return [s.numpy() for s in self._model.state_init()]
+
+    def extra_grad_process(self):
+        """Allow subclass to do extra processing on gradients and
+        return processing info."""
+        return {}
+
+    def extra_action_out(self, model_out):
+        """Returns dict of extra info to include in experience batch.
+
+        Arguments:
+            model_out (list): Outputs of the policy model module."""
+        return {}
+
+    def extra_grad_info(self, batch_tensors):
+        """Return dict of extra grad info."""
+
+        return {}
+
+    def optimizer(self):
+        """Custom PyTorch optimizer to use."""
+        if hasattr(self, "config"):
+            return torch.optim.Adam(
+                self._model.parameters(), lr=self.config["lr"])
+        else:
+            return torch.optim.Adam(self._model.parameters())
+
+    def _lazy_tensor_dict(self, postprocessed_batch):
+        batch_tensors = UsageTrackingDict(postprocessed_batch)
+        batch_tensors.set_get_interceptor(
+            lambda arr: torch.from_numpy(arr).to(self.device))
+        return batch_tensors
diff --git a/python/ray/rllib/policy/torch_policy_template.py b/python/ray/rllib/policy/torch_policy_template.py
new file mode 100644
index 0000000000000..76bd3deb75b95
--- /dev/null
+++ b/python/ray/rllib/policy/torch_policy_template.py
@@ -0,0 +1,133 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ray.rllib.evaluation.policy import Policy
+from ray.rllib.evaluation.torch_policy import TorchPolicy
+from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.utils.annotations import override, DeveloperAPI
+
+
+@DeveloperAPI
+def build_torch_policy(name,
+                       loss_fn,
+                       get_default_config=None,
+                       stats_fn=None,
+                       postprocess_fn=None,
+                       extra_action_out_fn=None,
+                       extra_grad_process_fn=None,
+                       optimizer_fn=None,
+                       before_init=None,
+                       after_init=None,
+                       make_model_and_action_dist=None,
+                       mixins=None):
+    """Helper function for creating a torch policy at runtime.
+
+    Arguments:
+        name (str): name of the policy (e.g., "PPOTorchPolicy")
+        loss_fn (func): function that returns a loss tensor given the policy
+            and a dict of experience batch tensors
+        get_default_config (func): optional function that returns the default
+            config to merge with any overrides
+        stats_fn (func): optional function that returns a dict of
+            values given the policy and batch input tensors
+        postprocess_fn (func): optional experience postprocessing function
+            that takes the same args as Policy.postprocess_trajectory()
+        extra_action_out_fn (func): optional function that returns
+            a dict of extra values to include in experiences
+        extra_grad_process_fn (func): optional function that is called after
+            gradients are computed and returns processing info
+        optimizer_fn (func): optional function that returns a torch optimizer
+            given the policy and config
+        before_init (func): optional function to run at the beginning of
+            policy init that takes the same arguments as the policy constructor
+        after_init (func): optional function to run at the end of policy init
+            that takes the same arguments as the policy constructor
+        make_model_and_action_dist (func): optional func that takes the same
+            arguments as policy init and returns a tuple of model instance and
+            torch action distribution class.
If not specified, the default + model and action dist from the catalog will be used + mixins (list): list of any class mixins for the returned policy class. + These mixins will be applied in order and will have higher + precedence than the TorchPolicy class + + Returns: + a TorchPolicy instance that uses the specified args + """ + + if not name.endswith("TorchPolicy"): + raise ValueError("Name should match *TorchPolicy", name) + + base = TorchPolicy + while mixins: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + class graph_cls(base): + def __init__(self, obs_space, action_space, config): + if get_default_config: + config = dict(get_default_config(), **config) + self.config = config + + if before_init: + before_init(self, obs_space, action_space, config) + + if make_model_and_action_dist: + self.model, self.dist_class = make_model_and_action_dist( + self, obs_space, action_space, config) + else: + self.dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], torch=True) + self.model = ModelCatalog.get_torch_model( + obs_space, logit_dim, self.config["model"]) + + TorchPolicy.__init__(self, obs_space, action_space, self.model, + loss_fn, self.dist_class) + + if after_init: + after_init(self, obs_space, action_space, config) + + @override(Policy) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not postprocess_fn: + return sample_batch + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) + + @override(TorchPolicy) + def extra_grad_process(self): + if extra_grad_process_fn: + return extra_grad_process_fn(self) + else: + return TorchPolicy.extra_grad_process(self) + + @override(TorchPolicy) + def extra_action_out(self, model_out): + if extra_action_out_fn: + return extra_action_out_fn(self, model_out) + else: + return TorchPolicy.extra_action_out(self, model_out) + + @override(TorchPolicy) + def optimizer(self): + if optimizer_fn: + return optimizer_fn(self, self.config) + else: + return TorchPolicy.optimizer(self) + + @override(TorchPolicy) + def extra_grad_info(self, batch_tensors): + if stats_fn: + return stats_fn(self, batch_tensors) + else: + return TorchPolicy.extra_grad_info(self, batch_tensors) + + graph_cls.__name__ = name + graph_cls.__qualname__ = name + return graph_cls From 13982f53a433bae274450338624ae88ee291cbd9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:11:50 -0700 Subject: [PATCH 06/13] move imports --- python/ray/rllib/__init__.py | 10 +++++----- python/ray/rllib/agents/a3c/a3c_tf_policy.py | 4 ++-- python/ray/rllib/agents/ddpg/ddpg_policy.py | 4 ++-- python/ray/rllib/agents/dqn/dqn_policy.py | 4 ++-- python/ray/rllib/agents/impala/vtrace_policy.py | 4 ++-- python/ray/rllib/agents/marwil/marwil_policy.py | 4 ++-- python/ray/rllib/agents/pg/pg_policy.py | 2 +- python/ray/rllib/agents/ppo/appo_policy.py | 4 ++-- python/ray/rllib/agents/ppo/ppo_policy.py | 4 ++-- python/ray/rllib/agents/qmix/qmix_policy.py | 2 +- python/ray/rllib/agents/trainer.py | 2 +- python/ray/rllib/evaluation/__init__.py | 10 +++++----- python/ray/rllib/evaluation/policy_evaluator.py | 4 ++-- python/ray/rllib/evaluation/sampler.py | 4 ++-- python/ray/rllib/evaluation/tf_policy_template.py | 6 +++--- python/ray/rllib/keras_policy.py | 2 +- python/ray/rllib/optimizers/multi_gpu_optimizer.py | 2 +- python/ray/rllib/policy/dynamic_tf_policy.py | 4 ++-- python/ray/rllib/policy/tf_policy.py | 2 +- python/ray/rllib/policy/tf_policy_template.py | 6 +++--- 
python/ray/rllib/policy/torch_policy.py | 2 +- python/ray/rllib/policy/torch_policy_template.py | 2 +- python/ray/rllib/tests/test_external_env.py | 2 +- .../ray/rllib/tests/test_external_multi_agent_env.py | 2 +- python/ray/rllib/tests/test_multi_agent_env.py | 4 ++-- python/ray/rllib/tests/test_optimizers.py | 2 +- python/ray/rllib/tests/test_perf.py | 2 +- python/ray/rllib/tests/test_policy_evaluator.py | 4 ++-- 28 files changed, 52 insertions(+), 52 deletions(-) diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index f8d41ff447858..1a5c67d2930e7 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -8,15 +8,15 @@ # This file is imported from the tune module in order to register RLlib agents. from ray.tune.registry import register_trainable -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy import TFPolicy -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_graph import PolicyGraph +from ray.rllib.policy.tf_policy import TFPolicy +from ray.rllib.policy.tf_policy_graph import TFPolicyGraph from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv from ray.rllib.env.external_env import ExternalEnv -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.sample_batch import SampleBatch diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py index 94063dcdf90be..488f8bb8ebc8b 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py @@ -11,10 +11,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.explained_variance import explained_variance -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy import TFPolicy, \ +from ray.rllib.policy.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy.py b/python/ray/rllib/agents/ddpg/ddpg_policy.py index aa8b91f8143b6..b59cf8a3e6590 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy.py @@ -14,8 +14,8 @@ from ray.rllib.models import ModelCatalog from ray.rllib.utils.annotations import override from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils import try_import_tf tf = try_import_tf() diff --git a/python/ray/rllib/agents/dqn/dqn_policy.py b/python/ray/rllib/agents/dqn/dqn_policy.py index 903d4d72be42e..7078ac98124f2 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy.py +++ b/python/ray/rllib/agents/dqn/dqn_policy.py @@ -12,8 +12,8 @@ from ray.rllib.models import ModelCatalog, Categorical from ray.rllib.utils.annotations import override from ray.rllib.utils.error import 
UnsupportedSpaceException -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.tf_policy import TFPolicy, \ +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index ccf8a87f3d04d..4d067087e4a35 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -11,9 +11,9 @@ import numpy as np from ray.rllib.agents.impala import vtrace from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy import TFPolicy, \ +from ray.rllib.policy.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.models.action_dist import MultiCategorical from ray.rllib.models.catalog import ModelCatalog diff --git a/python/ray/rllib/agents/marwil/marwil_policy.py b/python/ray/rllib/agents/marwil/marwil_policy.py index a6b38e4f8667a..47bc82a1627f3 100644 --- a/python/ray/rllib/agents/marwil/marwil_policy.py +++ b/python/ray/rllib/agents/marwil/marwil_policy.py @@ -9,8 +9,8 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.utils.annotations import override -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.agents.dqn.dqn_policy import _scope_vars from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/pg/pg_policy.py b/python/ray/rllib/agents/pg/pg_policy.py index 54fcd041cc729..7e7e4e025d366 100644 --- a/python/ray/rllib/agents/pg/pg_policy.py +++ b/python/ray/rllib/agents/pg/pg_policy.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy_template import build_tf_policy +from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/ppo/appo_policy.py b/python/ray/rllib/agents/ppo/appo_policy.py index a5e5f58245683..78886c20d41b4 100644 --- a/python/ray/rllib/agents/ppo/appo_policy.py +++ b/python/ray/rllib/agents/ppo/appo_policy.py @@ -14,8 +14,8 @@ from ray.rllib.agents.impala import vtrace from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_template import build_tf_policy -from ray.rllib.evaluation.tf_policy import LearningRateSchedule +from ray.rllib.policy.tf_policy_template import build_tf_policy +from ray.rllib.policy.tf_policy import LearningRateSchedule from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index 42e749c83f204..7f1a4bdb56aca 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -8,8 
+8,8 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy import LearningRateSchedule -from ray.rllib.evaluation.tf_policy_template import build_tf_policy +from ray.rllib.policy.tf_policy import LearningRateSchedule +from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/qmix/qmix_policy.py b/python/ray/rllib/agents/qmix/qmix_policy.py index 334e1dddb0a88..88b98ddabeb09 100644 --- a/python/ray/rllib/agents/qmix/qmix_policy.py +++ b/python/ray/rllib/agents/qmix/qmix_policy.py @@ -14,7 +14,7 @@ from ray.rllib.agents.qmix.mixers import VDNMixer, QMixer from ray.rllib.agents.qmix.model import RNNModel, _get_size from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.models.action_dist import TupleActions from ray.rllib.models.catalog import ModelCatalog diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index ea7961ed04d43..9726a0db4f747 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -17,7 +17,7 @@ from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ ShuffledInput from ray.rllib.models import MODEL_DEFAULTS -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator, \ +from ray.rllib.policy.policy_evaluator import PolicyEvaluator, \ _validate_multiagent_config from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID from ray.rllib.evaluation.metrics import collect_metrics diff --git a/python/ray/rllib/evaluation/__init__.py b/python/ray/rllib/evaluation/__init__.py index 95e9aba2180a1..3828444151921 100644 --- a/python/ray/rllib/evaluation/__init__.py +++ b/python/ray/rllib/evaluation/__init__.py @@ -1,10 +1,10 @@ from ray.rllib.evaluation.episode import MultiAgentEpisode -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.interface import EvaluatorInterface -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.tf_policy import TFPolicy -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy_graph import PolicyGraph +from ray.rllib.policy.tf_policy import TFPolicy +from ray.rllib.policy.tf_policy_graph import TFPolicyGraph from ray.rllib.evaluation.torch_policy import TorchPolicy from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 1aaf72236143e..1a5a167fe926e 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -18,8 +18,8 @@ from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \ DEFAULT_POLICY_ID from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler -from ray.rllib.evaluation.policy import Policy -from 
ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py index 368c6f01ce10a..47964c3c561b5 100644 --- a/python/ray/rllib/evaluation/sampler.py +++ b/python/ray/rllib/evaluation/sampler.py @@ -12,7 +12,7 @@ from ray.rllib.evaluation.episode import MultiAgentEpisode, _flatten_action from ray.rllib.evaluation.sample_batch_builder import \ MultiAgentSampleBatchBuilder -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.env.base_env import BaseEnv, ASYNC_RESET_RETURN from ray.rllib.env.atari_wrappers import get_wrapper_by_cls, MonitorEnv from ray.rllib.models.action_dist import TupleActions @@ -20,7 +20,7 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.debug import log_once, summarize from ray.rllib.utils.tf_run_builder import TFRunBuilder -from ray.rllib.evaluation.policy import clip_action +from ray.rllib.policy.policy import clip_action logger = logging.getLogger(__name__) diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index f9aecfc176cc8..75ff0212a7a1c 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -2,9 +2,9 @@ from __future__ import division from __future__ import print_function -from ray.rllib.evaluation.dynamic_tf_policy import DynamicTFPolicy -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.dynamic_tf_policy import DynamicTFPolicy +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils.annotations import override, DeveloperAPI diff --git a/python/ray/rllib/keras_policy.py b/python/ray/rllib/keras_policy.py index e10f6ec1d178f..3008e133c1c6d 100644 --- a/python/ray/rllib/keras_policy.py +++ b/python/ray/rllib/keras_policy.py @@ -4,7 +4,7 @@ import numpy as np -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy def _sample(probs): diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index c11ee1e51ca70..d10428e93132e 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -9,7 +9,7 @@ import ray from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer from ray.rllib.optimizers.rollout import collect_samples, \ diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index a82e751825a8f..fb280bfb9a7a3 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -6,9 +6,9 @@ import logging import numpy as np -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from 
ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/policy/tf_policy.py b/python/ray/rllib/policy/tf_policy.py index c6cc619af524d..06d97aad03d9d 100644 --- a/python/ray/rllib/policy/tf_policy.py +++ b/python/ray/rllib/policy/tf_policy.py @@ -10,7 +10,7 @@ import ray import ray.experimental.tf_utils from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.models.lstm import chop_into_sequences from ray.rllib.utils.annotations import override, DeveloperAPI diff --git a/python/ray/rllib/policy/tf_policy_template.py b/python/ray/rllib/policy/tf_policy_template.py index f9aecfc176cc8..75ff0212a7a1c 100644 --- a/python/ray/rllib/policy/tf_policy_template.py +++ b/python/ray/rllib/policy/tf_policy_template.py @@ -2,9 +2,9 @@ from __future__ import division from __future__ import print_function -from ray.rllib.evaluation.dynamic_tf_policy import DynamicTFPolicy -from ray.rllib.evaluation.policy import Policy -from ray.rllib.evaluation.tf_policy import TFPolicy +from ray.rllib.policy.dynamic_tf_policy import DynamicTFPolicy +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils.annotations import override, DeveloperAPI diff --git a/python/ray/rllib/policy/torch_policy.py b/python/ray/rllib/policy/torch_policy.py index cd0602935a0a3..622dee8777483 100644 --- a/python/ray/rllib/policy/torch_policy.py +++ b/python/ray/rllib/policy/torch_policy.py @@ -13,7 +13,7 @@ pass # soft dep from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import override from ray.rllib.utils.tracking_dict import UsageTrackingDict diff --git a/python/ray/rllib/policy/torch_policy_template.py b/python/ray/rllib/policy/torch_policy_template.py index 76bd3deb75b95..4a1b4b9e09a36 100644 --- a/python/ray/rllib/policy/torch_policy_template.py +++ b/python/ray/rllib/policy/torch_policy_template.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.torch_policy import TorchPolicy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override, DeveloperAPI diff --git a/python/ray/rllib/tests/test_external_env.py b/python/ray/rllib/tests/test_external_env.py index 3b2158959267c..d1d1231045aaf 100644 --- a/python/ray/rllib/tests/test_external_env.py +++ b/python/ray/rllib/tests/test_external_env.py @@ -11,7 +11,7 @@ import ray from ray.rllib.agents.dqn import DQNTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_env import ExternalEnv from ray.rllib.tests.test_policy_evaluator import (BadPolicy, MockPolicy, MockEnv) diff --git a/python/ray/rllib/tests/test_external_multi_agent_env.py b/python/ray/rllib/tests/test_external_multi_agent_env.py index fcb3de634cbea..f6cc8e51d8b02 
100644 --- a/python/ray/rllib/tests/test_external_multi_agent_env.py +++ b/python/ray/rllib/tests/test_external_multi_agent_env.py @@ -10,7 +10,7 @@ import ray from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv from ray.rllib.tests.test_policy_evaluator import MockPolicy from ray.rllib.tests.test_external_env import make_simple_serving diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index 281d797f7f54f..a6195c5f601f9 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -14,8 +14,8 @@ AsyncGradientsOptimizer) from ray.rllib.tests.test_policy_evaluator import (MockEnv, MockEnv2, MockPolicy) -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv diff --git a/python/ray/rllib/tests/test_optimizers.py b/python/ray/rllib/tests/test_optimizers.py index f851cfc33f128..4b6ea9e6d8217 100644 --- a/python/ray/rllib/tests/test_optimizers.py +++ b/python/ray/rllib/tests/test_optimizers.py @@ -11,7 +11,7 @@ from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.evaluation import SampleBatch -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.optimizers import AsyncGradientsOptimizer, AsyncSamplesOptimizer from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator from ray.rllib.tests.mock_evaluator import _MockEvaluator diff --git a/python/ray/rllib/tests/test_perf.py b/python/ray/rllib/tests/test_perf.py index e31530f44ced6..8001b31d65184 100644 --- a/python/ray/rllib/tests/test_perf.py +++ b/python/ray/rllib/tests/test_perf.py @@ -7,7 +7,7 @@ import unittest import ray -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.tests.test_policy_evaluator import MockPolicy diff --git a/python/ray/rllib/tests/test_policy_evaluator.py b/python/ray/rllib/tests/test_policy_evaluator.py index ba66ee94552d3..b7410d9e0ab23 100644 --- a/python/ray/rllib/tests/test_policy_evaluator.py +++ b/python/ray/rllib/tests/test_policy_evaluator.py @@ -12,9 +12,9 @@ import ray from ray.rllib.agents.pg import PGTrainer from ray.rllib.agents.a3c import A2CTrainer -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.policy.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.metrics import collect_metrics -from ray.rllib.evaluation.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.env.vector_env import VectorEnv From 4f5392253073d7308c2b6b291cc17466445b4bf1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:29:34 -0700 Subject: [PATCH 07/13] 
move to policy dir --- python/ray/rllib/__init__.py | 12 +- python/ray/rllib/agents/a3c/a3c_tf_policy.py | 2 +- .../ray/rllib/agents/a3c/a3c_torch_policy.py | 4 +- python/ray/rllib/agents/ars/ars.py | 2 +- python/ray/rllib/agents/ddpg/ddpg_policy.py | 2 +- python/ray/rllib/agents/dqn/dqn.py | 2 +- python/ray/rllib/agents/dqn/dqn_policy.py | 2 +- python/ray/rllib/agents/es/es.py | 2 +- .../ray/rllib/agents/impala/vtrace_policy.py | 2 +- .../ray/rllib/agents/marwil/marwil_policy.py | 2 +- python/ray/rllib/agents/pg/pg_policy.py | 2 +- python/ray/rllib/agents/pg/torch_pg_policy.py | 4 +- python/ray/rllib/agents/ppo/appo_policy.py | 2 +- python/ray/rllib/agents/ppo/ppo_policy.py | 2 +- python/ray/rllib/agents/qmix/qmix_policy.py | 2 +- python/ray/rllib/agents/trainer.py | 4 +- python/ray/rllib/evaluation/__init__.py | 13 +- python/ray/rllib/evaluation/metrics.py | 7 +- .../ray/rllib/evaluation/policy_evaluator.py | 3 +- python/ray/rllib/evaluation/policy_graph.py | 2 +- python/ray/rllib/evaluation/postprocessing.py | 2 +- python/ray/rllib/evaluation/sample_batch.py | 294 +---------------- .../rllib/evaluation/sample_batch_builder.py | 2 +- .../ray/rllib/evaluation/tf_policy_graph.py | 2 +- .../rllib/evaluation/torch_policy_graph.py | 2 +- .../examples/multiagent_custom_policy.py | 2 +- .../policy_evaluator_custom_workflow.py | 3 +- python/ray/rllib/offline/input_reader.py | 2 +- python/ray/rllib/offline/json_reader.py | 2 +- python/ray/rllib/offline/json_writer.py | 2 +- .../ray/rllib/offline/off_policy_estimator.py | 2 +- .../rllib/optimizers/aso_multi_gpu_learner.py | 2 +- .../optimizers/async_replay_optimizer.py | 2 +- .../rllib/optimizers/multi_gpu_optimizer.py | 2 +- python/ray/rllib/optimizers/rollout.py | 2 +- .../optimizers/sync_batch_replay_optimizer.py | 2 +- .../rllib/optimizers/sync_replay_optimizer.py | 2 +- .../optimizers/sync_samples_optimizer.py | 2 +- python/ray/rllib/policy/dynamic_tf_policy.py | 2 +- python/ray/rllib/policy/policy.py | 4 + python/ray/rllib/policy/sample_batch.py | 296 ++++++++++++++++++ python/ray/rllib/policy/tf_policy.py | 5 +- python/ray/rllib/policy/torch_policy.py | 3 +- .../ray/rllib/policy/torch_policy_template.py | 2 +- python/ray/rllib/rollout.py | 2 +- python/ray/rllib/tests/test_external_env.py | 2 +- .../tests/test_external_multi_agent_env.py | 2 +- .../ray/rllib/tests/test_multi_agent_env.py | 2 +- python/ray/rllib/tests/test_optimizers.py | 2 +- python/ray/rllib/tests/test_perf.py | 2 +- .../ray/rllib/tests/test_policy_evaluator.py | 4 +- python/ray/rllib/utils/__init__.py | 13 +- python/ray/rllib/utils/debug.py | 2 +- 53 files changed, 377 insertions(+), 368 deletions(-) create mode 100644 python/ray/rllib/policy/sample_batch.py diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index 1a5c67d2930e7..05f88ac653c4b 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -8,16 +8,16 @@ # This file is imported from the tune module in order to register RLlib agents. 
from ray.tune.registry import register_trainable -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.policy_graph import PolicyGraph -from ray.rllib.policy.tf_policy import TFPolicy -from ray.rllib.policy.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv from ray.rllib.env.external_env import ExternalEnv -from ray.rllib.policy.policy_evaluator import PolicyEvaluator -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy +from ray.rllib.policy.sample_batch import SampleBatch def _setup_logger(): diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py index 488f8bb8ebc8b..c1b1e1b746ba4 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py @@ -8,7 +8,7 @@ import ray from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.policy.policy import Policy diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy.py b/python/ray/rllib/agents/a3c/a3c_torch_policy.py index fa6f857f9eca7..6ccf6c48d35f6 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy.py @@ -9,8 +9,8 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_template import build_torch_policy +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_policy_template import build_torch_policy def actor_critic_loss(policy, batch_tensors): diff --git a/python/ray/rllib/agents/ars/ars.py b/python/ray/rllib/agents/ars/ars.py index 65738a620b304..4330f0d90db0c 100644 --- a/python/ray/rllib/agents/ars/ars.py +++ b/python/ray/rllib/agents/ars/ars.py @@ -17,7 +17,7 @@ from ray.rllib.agents.ars import optimizers from ray.rllib.agents.ars import policies from ray.rllib.agents.ars import utils -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.annotations import override from ray.rllib.utils.memory import ray_get_and_free from ray.rllib.utils import FilterManager diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy.py b/python/ray/rllib/agents/ddpg/ddpg_policy.py index b59cf8a3e6590..45f186351b3be 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy.py @@ -9,7 +9,7 @@ import ray.experimental.tf_utils from ray.rllib.agents.dqn.dqn_policy import (_huber_loss, _minimize_and_clip, _scope_vars, _postprocess_dqn) -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models import ModelCatalog from ray.rllib.utils.annotations import override diff --git 
a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index 9a9cc2821b5dc..442565007dd82 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -10,7 +10,7 @@ from ray.rllib.agents.trainer import Trainer, with_common_config from ray.rllib.agents.dqn.dqn_policy import DQNPolicy from ray.rllib.evaluation.metrics import collect_metrics -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.annotations import override from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule diff --git a/python/ray/rllib/agents/dqn/dqn_policy.py b/python/ray/rllib/agents/dqn/dqn_policy.py index 7078ac98124f2..b7a0af5ab5fa1 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy.py +++ b/python/ray/rllib/agents/dqn/dqn_policy.py @@ -7,7 +7,7 @@ from scipy.stats import entropy import ray -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models import ModelCatalog, Categorical from ray.rllib.utils.annotations import override diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py index 2328b90e9ed09..e167129c6a93b 100644 --- a/python/ray/rllib/agents/es/es.py +++ b/python/ray/rllib/agents/es/es.py @@ -16,7 +16,7 @@ from ray.rllib.agents.es import optimizers from ray.rllib.agents.es import policies from ray.rllib.agents.es import utils -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.annotations import override from ray.rllib.utils.memory import ray_get_and_free from ray.rllib.utils import FilterManager diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index 4d067087e4a35..0af2ddb401d0a 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -12,7 +12,7 @@ from ray.rllib.agents.impala import vtrace from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.policy.policy import Policy -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import TFPolicy, \ LearningRateSchedule from ray.rllib.models.action_dist import MultiCategorical diff --git a/python/ray/rllib/agents/marwil/marwil_policy.py b/python/ray/rllib/agents/marwil/marwil_policy.py index 47bc82a1627f3..add021025c9ca 100644 --- a/python/ray/rllib/agents/marwil/marwil_policy.py +++ b/python/ray/rllib/agents/marwil/marwil_policy.py @@ -6,7 +6,7 @@ from ray.rllib.models import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.utils.annotations import override from ray.rllib.policy.policy import Policy diff --git a/python/ray/rllib/agents/pg/pg_policy.py b/python/ray/rllib/agents/pg/pg_policy.py index 7e7e4e025d366..7cca613928fb9 100644 --- a/python/ray/rllib/agents/pg/pg_policy.py +++ b/python/ray/rllib/agents/pg/pg_policy.py @@ -6,7 +6,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.policy.tf_policy_template import build_tf_policy 
-from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf tf = try_import_tf() diff --git a/python/ray/rllib/agents/pg/torch_pg_policy.py b/python/ray/rllib/agents/pg/torch_pg_policy.py index cda1b6eb50579..d0f1cda71cc70 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy.py @@ -5,8 +5,8 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_template import build_torch_policy +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_policy_template import build_torch_policy def pg_torch_loss(policy, batch_tensors): diff --git a/python/ray/rllib/agents/ppo/appo_policy.py b/python/ray/rllib/agents/ppo/appo_policy.py index 78886c20d41b4..b4041c790d9b7 100644 --- a/python/ray/rllib/agents/ppo/appo_policy.py +++ b/python/ray/rllib/agents/ppo/appo_policy.py @@ -13,7 +13,7 @@ import ray from ray.rllib.agents.impala import vtrace from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.policy.tf_policy import LearningRateSchedule from ray.rllib.utils.explained_variance import explained_variance diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index 7f1a4bdb56aca..5a17d6c6d60cb 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -7,7 +7,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import LearningRateSchedule from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog diff --git a/python/ray/rllib/agents/qmix/qmix_policy.py b/python/ray/rllib/agents/qmix/qmix_policy.py index 88b98ddabeb09..24c42fbcb13c7 100644 --- a/python/ray/rllib/agents/qmix/qmix_policy.py +++ b/python/ray/rllib/agents/qmix/qmix_policy.py @@ -15,7 +15,7 @@ from ray.rllib.agents.qmix.model import RNNModel, _get_size from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.policy.policy import Policy -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.models.action_dist import TupleActions from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.lstm import chop_into_sequences diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index 9726a0db4f747..83b00a896b71b 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -17,9 +17,9 @@ from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ ShuffledInput from ray.rllib.models import MODEL_DEFAULTS -from ray.rllib.policy.policy_evaluator import PolicyEvaluator, \ +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator, \ _validate_multiagent_config -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.evaluation.metrics import 
collect_metrics from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI diff --git a/python/ray/rllib/evaluation/__init__.py b/python/ray/rllib/evaluation/__init__.py index 3828444151921..5e4fcd2a423dd 100644 --- a/python/ray/rllib/evaluation/__init__.py +++ b/python/ray/rllib/evaluation/__init__.py @@ -1,11 +1,8 @@ from ray.rllib.evaluation.episode import MultiAgentEpisode -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.interface import EvaluatorInterface -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.policy_graph import PolicyGraph -from ray.rllib.policy.tf_policy import TFPolicy -from ray.rllib.policy.tf_policy_graph import TFPolicyGraph -from ray.rllib.evaluation.torch_policy import TorchPolicy +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.evaluation.sample_batch_builder import ( @@ -15,8 +12,8 @@ from ray.rllib.evaluation.metrics import collect_metrics __all__ = [ - "EvaluatorInterface", "PolicyEvaluator", "Policy", "TFPolicy", - "TorchPolicy", "PolicyGraph", "TFPolicyGraph", "TorchPolicyGraph", + "EvaluatorInterface", "PolicyEvaluator", + "PolicyGraph", "TFPolicyGraph", "TorchPolicyGraph", "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", "MultiAgentSampleBatchBuilder", "SyncSampler", "AsyncSampler", "compute_advantages", "collect_metrics", "MultiAgentEpisode" diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py index 99de4dd3b2e47..d8b3122fed4b8 100644 --- a/python/ray/rllib/evaluation/metrics.py +++ b/python/ray/rllib/evaluation/metrics.py @@ -7,17 +7,14 @@ import collections import ray -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.offline.off_policy_estimator import OffPolicyEstimate +from ray.rllib.policy.policy import LEARNER_STATS_KEY from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.memory import ray_get_and_free logger = logging.getLogger(__name__) -# By convention, metrics from optimizing the loss can be reported in the -# `grad_info` dict returned by learn_on_batch() / compute_grads() via this key. 
-LEARNER_STATS_KEY = "learner_stats" - @DeveloperAPI def get_learner_stats(grad_info): diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 1a5a167fe926e..40df71006a8cd 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -15,9 +15,8 @@ from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv from ray.rllib.env.vector_env import VectorEnv from ray.rllib.evaluation.interface import EvaluatorInterface -from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \ - DEFAULT_POLICY_ID from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler +from ray.rllib.policy.sample_batch import MultiAgentBatch, DEFAULT_POLICY_ID from ray.rllib.policy.policy import Policy from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader diff --git a/python/ray/rllib/evaluation/policy_graph.py b/python/ray/rllib/evaluation/policy_graph.py index 1039269e56cbd..7cb44db2ef886 100644 --- a/python/ray/rllib/evaluation/policy_graph.py +++ b/python/ray/rllib/evaluation/policy_graph.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.policy import Policy +from ray.rllib.policy.policy import Policy from ray.rllib.utils import renamed_class diff --git a/python/ray/rllib/evaluation/postprocessing.py b/python/ray/rllib/evaluation/postprocessing.py index aa2835f87e044..f236df6ed7638 100644 --- a/python/ray/rllib/evaluation/postprocessing.py +++ b/python/ray/rllib/evaluation/postprocessing.py @@ -4,7 +4,7 @@ import numpy as np import scipy.signal -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import DeveloperAPI diff --git a/python/ray/rllib/evaluation/sample_batch.py b/python/ray/rllib/evaluation/sample_batch.py index c80f22bdbd1a1..906fc8974db1d 100644 --- a/python/ray/rllib/evaluation/sample_batch.py +++ b/python/ray/rllib/evaluation/sample_batch.py @@ -2,295 +2,9 @@ from __future__ import division from __future__ import print_function -import six -import collections -import numpy as np +from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.utils import renamed_class -from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI -from ray.rllib.utils.compression import pack, unpack, is_compressed -from ray.rllib.utils.memory import concat_aligned -# Defaults policy id for single agent environments -DEFAULT_POLICY_ID = "default_policy" - - -@PublicAPI -class MultiAgentBatch(object): - """A batch of experiences from multiple policies in the environment. - - Attributes: - policy_batches (dict): Mapping from policy id to a normal SampleBatch - of experiences. Note that these batches may be of different length. - count (int): The number of timesteps in the environment this batch - contains. This will be less than the number of transitions this - batch contains across all policies in total. 
- """ - - @PublicAPI - def __init__(self, policy_batches, count): - self.policy_batches = policy_batches - self.count = count - - @staticmethod - @PublicAPI - def wrap_as_needed(batches, count): - if len(batches) == 1 and DEFAULT_POLICY_ID in batches: - return batches[DEFAULT_POLICY_ID] - return MultiAgentBatch(batches, count) - - @staticmethod - @PublicAPI - def concat_samples(samples): - policy_batches = collections.defaultdict(list) - total_count = 0 - for s in samples: - assert isinstance(s, MultiAgentBatch) - for policy_id, batch in s.policy_batches.items(): - policy_batches[policy_id].append(batch) - total_count += s.count - out = {} - for policy_id, batches in policy_batches.items(): - out[policy_id] = SampleBatch.concat_samples(batches) - return MultiAgentBatch(out, total_count) - - @PublicAPI - def copy(self): - return MultiAgentBatch( - {k: v.copy() - for (k, v) in self.policy_batches.items()}, self.count) - - @PublicAPI - def total(self): - ct = 0 - for batch in self.policy_batches.values(): - ct += batch.count - return ct - - @DeveloperAPI - def compress(self, bulk=False, columns=frozenset(["obs", "new_obs"])): - for batch in self.policy_batches.values(): - batch.compress(bulk=bulk, columns=columns) - - @DeveloperAPI - def decompress_if_needed(self, columns=frozenset(["obs", "new_obs"])): - for batch in self.policy_batches.values(): - batch.decompress_if_needed(columns) - - def __str__(self): - return "MultiAgentBatch({}, count={})".format( - str(self.policy_batches), self.count) - - def __repr__(self): - return "MultiAgentBatch({}, count={})".format( - str(self.policy_batches), self.count) - - -@PublicAPI -class SampleBatch(object): - """Wrapper around a dictionary with string keys and array-like values. - - For example, {"obs": [1, 2, 3], "reward": [0, -1, 1]} is a batch of three - samples, each with an "obs" and "reward" attribute. - """ - - # Outputs from interacting with the environment - CUR_OBS = "obs" - NEXT_OBS = "new_obs" - ACTIONS = "actions" - REWARDS = "rewards" - PREV_ACTIONS = "prev_actions" - PREV_REWARDS = "prev_rewards" - DONES = "dones" - INFOS = "infos" - - # Uniquely identifies an episode - EPS_ID = "eps_id" - - # Uniquely identifies a sample batch. This is important to distinguish RNN - # sequences from the same episode when multiple sample batches are - # concatenated (fusing sequences across batches can be unsafe). - UNROLL_ID = "unroll_id" - - # Uniquely identifies an agent within an episode - AGENT_INDEX = "agent_index" - - # Value function predictions emitted by the behaviour policy - VF_PREDS = "vf_preds" - - @PublicAPI - def __init__(self, *args, **kwargs): - """Constructs a sample batch (same params as dict constructor).""" - - self.data = dict(*args, **kwargs) - lengths = [] - for k, v in self.data.copy().items(): - assert isinstance(k, six.string_types), self - lengths.append(len(v)) - self.data[k] = np.array(v, copy=False) - if not lengths: - raise ValueError("Empty sample batch") - assert len(set(lengths)) == 1, "data columns must be same length" - self.count = lengths[0] - - @staticmethod - @PublicAPI - def concat_samples(samples): - if isinstance(samples[0], MultiAgentBatch): - return MultiAgentBatch.concat_samples(samples) - out = {} - samples = [s for s in samples if s.count > 0] - for k in samples[0].keys(): - out[k] = concat_aligned([s[k] for s in samples]) - return SampleBatch(out) - - @PublicAPI - def concat(self, other): - """Returns a new SampleBatch with each data column concatenated. 
- - Examples: - >>> b1 = SampleBatch({"a": [1, 2]}) - >>> b2 = SampleBatch({"a": [3, 4, 5]}) - >>> print(b1.concat(b2)) - {"a": [1, 2, 3, 4, 5]} - """ - - assert self.keys() == other.keys(), "must have same columns" - out = {} - for k in self.keys(): - out[k] = concat_aligned([self[k], other[k]]) - return SampleBatch(out) - - @PublicAPI - def copy(self): - return SampleBatch( - {k: np.array(v, copy=True) - for (k, v) in self.data.items()}) - - @PublicAPI - def rows(self): - """Returns an iterator over data rows, i.e. dicts with column values. - - Examples: - >>> batch = SampleBatch({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> for row in batch.rows(): - print(row) - {"a": 1, "b": 4} - {"a": 2, "b": 5} - {"a": 3, "b": 6} - """ - - for i in range(self.count): - row = {} - for k in self.keys(): - row[k] = self[k][i] - yield row - - @PublicAPI - def columns(self, keys): - """Returns a list of just the specified columns. - - Examples: - >>> batch = SampleBatch({"a": [1], "b": [2], "c": [3]}) - >>> print(batch.columns(["a", "b"])) - [[1], [2]] - """ - - out = [] - for k in keys: - out.append(self[k]) - return out - - @PublicAPI - def shuffle(self): - """Shuffles the rows of this batch in-place.""" - - permutation = np.random.permutation(self.count) - for key, val in self.items(): - self[key] = val[permutation] - - @PublicAPI - def split_by_episode(self): - """Splits this batch's data by `eps_id`. - - Returns: - list of SampleBatch, one per distinct episode. - """ - - slices = [] - cur_eps_id = self.data["eps_id"][0] - offset = 0 - for i in range(self.count): - next_eps_id = self.data["eps_id"][i] - if next_eps_id != cur_eps_id: - slices.append(self.slice(offset, i)) - offset = i - cur_eps_id = next_eps_id - slices.append(self.slice(offset, self.count)) - for s in slices: - slen = len(set(s["eps_id"])) - assert slen == 1, (s, slen) - assert sum(s.count for s in slices) == self.count, (slices, self.count) - return slices - - @PublicAPI - def slice(self, start, end): - """Returns a slice of the row data of this batch. - - Arguments: - start (int): Starting index. - end (int): Ending index. - - Returns: - SampleBatch which has a slice of this batch's data. 
- """ - - return SampleBatch({k: v[start:end] for k, v in self.data.items()}) - - @PublicAPI - def keys(self): - return self.data.keys() - - @PublicAPI - def items(self): - return self.data.items() - - @PublicAPI - def __getitem__(self, key): - return self.data[key] - - @PublicAPI - def __setitem__(self, key, item): - self.data[key] = item - - @DeveloperAPI - def compress(self, bulk=False, columns=frozenset(["obs", "new_obs"])): - for key in columns: - if key in self.data: - if bulk: - self.data[key] = pack(self.data[key]) - else: - self.data[key] = np.array( - [pack(o) for o in self.data[key]]) - - @DeveloperAPI - def decompress_if_needed(self, columns=frozenset(["obs", "new_obs"])): - for key in columns: - if key in self.data: - arr = self.data[key] - if is_compressed(arr): - self.data[key] = unpack(arr) - elif len(arr) > 0 and is_compressed(arr[0]): - self.data[key] = np.array( - [unpack(o) for o in self.data[key]]) - - def __str__(self): - return "SampleBatch({})".format(str(self.data)) - - def __repr__(self): - return "SampleBatch({})".format(str(self.data)) - - def __iter__(self): - return self.data.__iter__() - - def __contains__(self, x): - return x in self.data +SampleBatch = renamed_class(SampleBatch, old_name="rllib.evaluation.SampleBatch") +MultiAgentBatch = renamed_class(MultiAgentBatch, old_name="rllib.evaluation.MultiAgentBatch") diff --git a/python/ray/rllib/evaluation/sample_batch_builder.py b/python/ray/rllib/evaluation/sample_batch_builder.py index 675cfc24d1a43..0ead77d528479 100644 --- a/python/ray/rllib/evaluation/sample_batch_builder.py +++ b/python/ray/rllib/evaluation/sample_batch_builder.py @@ -6,7 +6,7 @@ import logging import numpy as np -from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI from ray.rllib.utils.debug import log_once, summarize diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 06a5de9c9e51b..2ac97e5259e34 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.policy import TFPolicy +from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils import renamed_class diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index 56940dc215e37..e9124e7ffb4bc 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.policy import TorchPolicy +from ray.rllib.policy.torch_policy import TorchPolicy from ray.rllib.utils import renamed_class diff --git a/python/ray/rllib/examples/multiagent_custom_policy.py b/python/ray/rllib/examples/multiagent_custom_policy.py index 2ef7378eb3871..d34d678098b62 100644 --- a/python/ray/rllib/examples/multiagent_custom_policy.py +++ b/python/ray/rllib/examples/multiagent_custom_policy.py @@ -22,7 +22,7 @@ import ray from ray import tune -from ray.rllib.evaluation import Policy +from ray.rllib.policy import Policy from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.registry import register_env diff --git a/python/ray/rllib/examples/policy_evaluator_custom_workflow.py 
b/python/ray/rllib/examples/policy_evaluator_custom_workflow.py index ef62227dae192..a8d80da994d29 100644 --- a/python/ray/rllib/examples/policy_evaluator_custom_workflow.py +++ b/python/ray/rllib/examples/policy_evaluator_custom_workflow.py @@ -14,7 +14,8 @@ import ray from ray import tune -from ray.rllib.evaluation import Policy, PolicyEvaluator, SampleBatch +from ray.rllib.policy import Policy +from ray.rllib.evaluation import PolicyEvaluator, SampleBatch from ray.rllib.evaluation.metrics import collect_metrics parser = argparse.ArgumentParser() diff --git a/python/ray/rllib/offline/input_reader.py b/python/ray/rllib/offline/input_reader.py index 5315773fd8395..053c279343a85 100644 --- a/python/ray/rllib/offline/input_reader.py +++ b/python/ray/rllib/offline/input_reader.py @@ -6,7 +6,7 @@ import numpy as np import threading -from ray.rllib.evaluation.sample_batch import MultiAgentBatch +from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/offline/json_reader.py b/python/ray/rllib/offline/json_reader.py index e9568e75c7f4b..55a002fb3ce60 100644 --- a/python/ray/rllib/offline/json_reader.py +++ b/python/ray/rllib/offline/json_reader.py @@ -17,7 +17,7 @@ from ray.rllib.offline.input_reader import InputReader from ray.rllib.offline.io_context import IOContext -from ray.rllib.evaluation.sample_batch import MultiAgentBatch, SampleBatch, \ +from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch, \ DEFAULT_POLICY_ID from ray.rllib.utils.annotations import override, PublicAPI from ray.rllib.utils.compression import unpack_if_needed diff --git a/python/ray/rllib/offline/json_writer.py b/python/ray/rllib/offline/json_writer.py index 5613d1f67dc25..679b00158b9ed 100644 --- a/python/ray/rllib/offline/json_writer.py +++ b/python/ray/rllib/offline/json_writer.py @@ -15,7 +15,7 @@ except ImportError: smart_open = None -from ray.rllib.evaluation.sample_batch import MultiAgentBatch +from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.offline.io_context import IOContext from ray.rllib.offline.output_writer import OutputWriter from ray.rllib.utils.annotations import override, PublicAPI diff --git a/python/ray/rllib/offline/off_policy_estimator.py b/python/ray/rllib/offline/off_policy_estimator.py index c92cb9015e75c..7534e667f0bfe 100644 --- a/python/ray/rllib/offline/off_policy_estimator.py +++ b/python/ray/rllib/offline/off_policy_estimator.py @@ -5,7 +5,7 @@ from collections import namedtuple import logging -from ray.rllib.evaluation.sample_batch import MultiAgentBatch +from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import DeveloperAPI logger = logging.getLogger(__name__) diff --git a/python/ray/rllib/optimizers/aso_multi_gpu_learner.py b/python/ray/rllib/optimizers/aso_multi_gpu_learner.py index 328fee67d548b..b5040e45584ca 100644 --- a/python/ray/rllib/optimizers/aso_multi_gpu_learner.py +++ b/python/ray/rllib/optimizers/aso_multi_gpu_learner.py @@ -11,7 +11,7 @@ from six.moves import queue from ray.rllib.evaluation.metrics import get_learner_stats -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.optimizers.aso_learner import LearnerThread from ray.rllib.optimizers.aso_minibatch_buffer import MinibatchBuffer from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer diff --git 
a/python/ray/rllib/optimizers/async_replay_optimizer.py b/python/ray/rllib/optimizers/async_replay_optimizer.py index b040c8e8a99f9..d66f942ae5325 100644 --- a/python/ray/rllib/optimizers/async_replay_optimizer.py +++ b/python/ray/rllib/optimizers/async_replay_optimizer.py @@ -17,7 +17,7 @@ import ray from ray.rllib.evaluation.metrics import get_learner_stats -from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ +from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ MultiAgentBatch from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.optimizers.replay_buffer import PrioritizedReplayBuffer diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index d10428e93132e..a25553c40111e 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -16,7 +16,7 @@ collect_samples_straggler_mitigation from ray.rllib.utils.annotations import override from ray.rllib.utils.timer import TimerStat -from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ +from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ MultiAgentBatch from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/optimizers/rollout.py b/python/ray/rllib/optimizers/rollout.py index 063c2ff8999dc..fa1c03f6081ec 100644 --- a/python/ray/rllib/optimizers/rollout.py +++ b/python/ray/rllib/optimizers/rollout.py @@ -5,7 +5,7 @@ import logging import ray -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.memory import ray_get_and_free logger = logging.getLogger(__name__) diff --git a/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py b/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py index 0a334e84ef799..e13d71c6e4cd2 100644 --- a/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py +++ b/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py @@ -7,7 +7,7 @@ import ray from ray.rllib.evaluation.metrics import get_learner_stats from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer -from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ +from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.timer import TimerStat diff --git a/python/ray/rllib/optimizers/sync_replay_optimizer.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py index 2e765f2d86417..27858f3527c1e 100644 --- a/python/ray/rllib/optimizers/sync_replay_optimizer.py +++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py @@ -11,7 +11,7 @@ PrioritizedReplayBuffer from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.evaluation.metrics import get_learner_stats -from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ +from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.compression import pack_if_needed diff --git a/python/ray/rllib/optimizers/sync_samples_optimizer.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py index a08f0345eb2bc..f5807ae343ef7 100644 --- a/python/ray/rllib/optimizers/sync_samples_optimizer.py +++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py @@ -6,7 +6,7 @@ import logging from 
ray.rllib.evaluation.metrics import get_learner_stats from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.filter import RunningStat from ray.rllib.utils.timer import TimerStat diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index fb280bfb9a7a3..691fc11862729 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -7,7 +7,7 @@ import numpy as np from ray.rllib.policy.policy import Policy -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override diff --git a/python/ray/rllib/policy/policy.py b/python/ray/rllib/policy/policy.py index 72393e7826c53..6f456e608007c 100644 --- a/python/ray/rllib/policy/policy.py +++ b/python/ray/rllib/policy/policy.py @@ -7,6 +7,10 @@ from ray.rllib.utils.annotations import DeveloperAPI +# By convention, metrics from optimizing the loss can be reported in the +# `grad_info` dict returned by learn_on_batch() / compute_grads() via this key. +LEARNER_STATS_KEY = "learner_stats" + @DeveloperAPI class Policy(object): diff --git a/python/ray/rllib/policy/sample_batch.py b/python/ray/rllib/policy/sample_batch.py new file mode 100644 index 0000000000000..c80f22bdbd1a1 --- /dev/null +++ b/python/ray/rllib/policy/sample_batch.py @@ -0,0 +1,296 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +import collections +import numpy as np + +from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI +from ray.rllib.utils.compression import pack, unpack, is_compressed +from ray.rllib.utils.memory import concat_aligned + +# Defaults policy id for single agent environments +DEFAULT_POLICY_ID = "default_policy" + + +@PublicAPI +class MultiAgentBatch(object): + """A batch of experiences from multiple policies in the environment. + + Attributes: + policy_batches (dict): Mapping from policy id to a normal SampleBatch + of experiences. Note that these batches may be of different length. + count (int): The number of timesteps in the environment this batch + contains. This will be less than the number of transitions this + batch contains across all policies in total. 
+ """ + + @PublicAPI + def __init__(self, policy_batches, count): + self.policy_batches = policy_batches + self.count = count + + @staticmethod + @PublicAPI + def wrap_as_needed(batches, count): + if len(batches) == 1 and DEFAULT_POLICY_ID in batches: + return batches[DEFAULT_POLICY_ID] + return MultiAgentBatch(batches, count) + + @staticmethod + @PublicAPI + def concat_samples(samples): + policy_batches = collections.defaultdict(list) + total_count = 0 + for s in samples: + assert isinstance(s, MultiAgentBatch) + for policy_id, batch in s.policy_batches.items(): + policy_batches[policy_id].append(batch) + total_count += s.count + out = {} + for policy_id, batches in policy_batches.items(): + out[policy_id] = SampleBatch.concat_samples(batches) + return MultiAgentBatch(out, total_count) + + @PublicAPI + def copy(self): + return MultiAgentBatch( + {k: v.copy() + for (k, v) in self.policy_batches.items()}, self.count) + + @PublicAPI + def total(self): + ct = 0 + for batch in self.policy_batches.values(): + ct += batch.count + return ct + + @DeveloperAPI + def compress(self, bulk=False, columns=frozenset(["obs", "new_obs"])): + for batch in self.policy_batches.values(): + batch.compress(bulk=bulk, columns=columns) + + @DeveloperAPI + def decompress_if_needed(self, columns=frozenset(["obs", "new_obs"])): + for batch in self.policy_batches.values(): + batch.decompress_if_needed(columns) + + def __str__(self): + return "MultiAgentBatch({}, count={})".format( + str(self.policy_batches), self.count) + + def __repr__(self): + return "MultiAgentBatch({}, count={})".format( + str(self.policy_batches), self.count) + + +@PublicAPI +class SampleBatch(object): + """Wrapper around a dictionary with string keys and array-like values. + + For example, {"obs": [1, 2, 3], "reward": [0, -1, 1]} is a batch of three + samples, each with an "obs" and "reward" attribute. + """ + + # Outputs from interacting with the environment + CUR_OBS = "obs" + NEXT_OBS = "new_obs" + ACTIONS = "actions" + REWARDS = "rewards" + PREV_ACTIONS = "prev_actions" + PREV_REWARDS = "prev_rewards" + DONES = "dones" + INFOS = "infos" + + # Uniquely identifies an episode + EPS_ID = "eps_id" + + # Uniquely identifies a sample batch. This is important to distinguish RNN + # sequences from the same episode when multiple sample batches are + # concatenated (fusing sequences across batches can be unsafe). + UNROLL_ID = "unroll_id" + + # Uniquely identifies an agent within an episode + AGENT_INDEX = "agent_index" + + # Value function predictions emitted by the behaviour policy + VF_PREDS = "vf_preds" + + @PublicAPI + def __init__(self, *args, **kwargs): + """Constructs a sample batch (same params as dict constructor).""" + + self.data = dict(*args, **kwargs) + lengths = [] + for k, v in self.data.copy().items(): + assert isinstance(k, six.string_types), self + lengths.append(len(v)) + self.data[k] = np.array(v, copy=False) + if not lengths: + raise ValueError("Empty sample batch") + assert len(set(lengths)) == 1, "data columns must be same length" + self.count = lengths[0] + + @staticmethod + @PublicAPI + def concat_samples(samples): + if isinstance(samples[0], MultiAgentBatch): + return MultiAgentBatch.concat_samples(samples) + out = {} + samples = [s for s in samples if s.count > 0] + for k in samples[0].keys(): + out[k] = concat_aligned([s[k] for s in samples]) + return SampleBatch(out) + + @PublicAPI + def concat(self, other): + """Returns a new SampleBatch with each data column concatenated. 
+ + Examples: + >>> b1 = SampleBatch({"a": [1, 2]}) + >>> b2 = SampleBatch({"a": [3, 4, 5]}) + >>> print(b1.concat(b2)) + {"a": [1, 2, 3, 4, 5]} + """ + + assert self.keys() == other.keys(), "must have same columns" + out = {} + for k in self.keys(): + out[k] = concat_aligned([self[k], other[k]]) + return SampleBatch(out) + + @PublicAPI + def copy(self): + return SampleBatch( + {k: np.array(v, copy=True) + for (k, v) in self.data.items()}) + + @PublicAPI + def rows(self): + """Returns an iterator over data rows, i.e. dicts with column values. + + Examples: + >>> batch = SampleBatch({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> for row in batch.rows(): + print(row) + {"a": 1, "b": 4} + {"a": 2, "b": 5} + {"a": 3, "b": 6} + """ + + for i in range(self.count): + row = {} + for k in self.keys(): + row[k] = self[k][i] + yield row + + @PublicAPI + def columns(self, keys): + """Returns a list of just the specified columns. + + Examples: + >>> batch = SampleBatch({"a": [1], "b": [2], "c": [3]}) + >>> print(batch.columns(["a", "b"])) + [[1], [2]] + """ + + out = [] + for k in keys: + out.append(self[k]) + return out + + @PublicAPI + def shuffle(self): + """Shuffles the rows of this batch in-place.""" + + permutation = np.random.permutation(self.count) + for key, val in self.items(): + self[key] = val[permutation] + + @PublicAPI + def split_by_episode(self): + """Splits this batch's data by `eps_id`. + + Returns: + list of SampleBatch, one per distinct episode. + """ + + slices = [] + cur_eps_id = self.data["eps_id"][0] + offset = 0 + for i in range(self.count): + next_eps_id = self.data["eps_id"][i] + if next_eps_id != cur_eps_id: + slices.append(self.slice(offset, i)) + offset = i + cur_eps_id = next_eps_id + slices.append(self.slice(offset, self.count)) + for s in slices: + slen = len(set(s["eps_id"])) + assert slen == 1, (s, slen) + assert sum(s.count for s in slices) == self.count, (slices, self.count) + return slices + + @PublicAPI + def slice(self, start, end): + """Returns a slice of the row data of this batch. + + Arguments: + start (int): Starting index. + end (int): Ending index. + + Returns: + SampleBatch which has a slice of this batch's data. 
+ """ + + return SampleBatch({k: v[start:end] for k, v in self.data.items()}) + + @PublicAPI + def keys(self): + return self.data.keys() + + @PublicAPI + def items(self): + return self.data.items() + + @PublicAPI + def __getitem__(self, key): + return self.data[key] + + @PublicAPI + def __setitem__(self, key, item): + self.data[key] = item + + @DeveloperAPI + def compress(self, bulk=False, columns=frozenset(["obs", "new_obs"])): + for key in columns: + if key in self.data: + if bulk: + self.data[key] = pack(self.data[key]) + else: + self.data[key] = np.array( + [pack(o) for o in self.data[key]]) + + @DeveloperAPI + def decompress_if_needed(self, columns=frozenset(["obs", "new_obs"])): + for key in columns: + if key in self.data: + arr = self.data[key] + if is_compressed(arr): + self.data[key] = unpack(arr) + elif len(arr) > 0 and is_compressed(arr[0]): + self.data[key] = np.array( + [unpack(o) for o in self.data[key]]) + + def __str__(self): + return "SampleBatch({})".format(str(self.data)) + + def __repr__(self): + return "SampleBatch({})".format(str(self.data)) + + def __iter__(self): + return self.data.__iter__() + + def __contains__(self, x): + return x in self.data diff --git a/python/ray/rllib/policy/tf_policy.py b/python/ray/rllib/policy/tf_policy.py index 06d97aad03d9d..bbb5795e52ab0 100644 --- a/python/ray/rllib/policy/tf_policy.py +++ b/python/ray/rllib/policy/tf_policy.py @@ -9,9 +9,8 @@ import ray import ray.experimental.tf_utils -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.policy.policy import Policy -from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.models.lstm import chop_into_sequences from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils.debug import log_once, summarize diff --git a/python/ray/rllib/policy/torch_policy.py b/python/ray/rllib/policy/torch_policy.py index 622dee8777483..633e438c5ad7f 100644 --- a/python/ray/rllib/policy/torch_policy.py +++ b/python/ray/rllib/policy/torch_policy.py @@ -12,8 +12,7 @@ except ImportError: pass # soft dep -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.policy.policy import Policy +from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY from ray.rllib.utils.annotations import override from ray.rllib.utils.tracking_dict import UsageTrackingDict diff --git a/python/ray/rllib/policy/torch_policy_template.py b/python/ray/rllib/policy/torch_policy_template.py index 4a1b4b9e09a36..ced3cd7c989c2 100644 --- a/python/ray/rllib/policy/torch_policy_template.py +++ b/python/ray/rllib/policy/torch_policy_template.py @@ -3,7 +3,7 @@ from __future__ import print_function from ray.rllib.policy.policy import Policy -from ray.rllib.evaluation.torch_policy import TorchPolicy +from ray.rllib.policy.torch_policy import TorchPolicy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override, DeveloperAPI diff --git a/python/ray/rllib/rollout.py b/python/ray/rllib/rollout.py index 2bb25f5c40afb..efa5743c0a54d 100755 --- a/python/ray/rllib/rollout.py +++ b/python/ray/rllib/rollout.py @@ -15,7 +15,7 @@ from ray.rllib.agents.registry import get_agent_class from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import _DUMMY_AGENT_ID -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.tune.util 
import merge_dicts EXAMPLE_USAGE = """ diff --git a/python/ray/rllib/tests/test_external_env.py b/python/ray/rllib/tests/test_external_env.py index d1d1231045aaf..3b2158959267c 100644 --- a/python/ray/rllib/tests/test_external_env.py +++ b/python/ray/rllib/tests/test_external_env.py @@ -11,7 +11,7 @@ import ray from ray.rllib.agents.dqn import DQNTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_env import ExternalEnv from ray.rllib.tests.test_policy_evaluator import (BadPolicy, MockPolicy, MockEnv) diff --git a/python/ray/rllib/tests/test_external_multi_agent_env.py b/python/ray/rllib/tests/test_external_multi_agent_env.py index f6cc8e51d8b02..fcb3de634cbea 100644 --- a/python/ray/rllib/tests/test_external_multi_agent_env.py +++ b/python/ray/rllib/tests/test_external_multi_agent_env.py @@ -10,7 +10,7 @@ import ray from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv from ray.rllib.tests.test_policy_evaluator import MockPolicy from ray.rllib.tests.test_external_env import make_simple_serving diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index a6195c5f601f9..9e9af78a3cce0 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -14,7 +14,7 @@ AsyncGradientsOptimizer) from ray.rllib.tests.test_policy_evaluator import (MockEnv, MockEnv2, MockPolicy) -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv diff --git a/python/ray/rllib/tests/test_optimizers.py b/python/ray/rllib/tests/test_optimizers.py index 4b6ea9e6d8217..f851cfc33f128 100644 --- a/python/ray/rllib/tests/test_optimizers.py +++ b/python/ray/rllib/tests/test_optimizers.py @@ -11,7 +11,7 @@ from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.evaluation import SampleBatch -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.optimizers import AsyncGradientsOptimizer, AsyncSamplesOptimizer from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator from ray.rllib.tests.mock_evaluator import _MockEvaluator diff --git a/python/ray/rllib/tests/test_perf.py b/python/ray/rllib/tests/test_perf.py index 8001b31d65184..e31530f44ced6 100644 --- a/python/ray/rllib/tests/test_perf.py +++ b/python/ray/rllib/tests/test_perf.py @@ -7,7 +7,7 @@ import unittest import ray -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.tests.test_policy_evaluator import MockPolicy diff --git a/python/ray/rllib/tests/test_policy_evaluator.py b/python/ray/rllib/tests/test_policy_evaluator.py index b7410d9e0ab23..dc0dcaff6782d 100644 --- a/python/ray/rllib/tests/test_policy_evaluator.py +++ b/python/ray/rllib/tests/test_policy_evaluator.py @@ -12,11 +12,11 
@@ import ray from ray.rllib.agents.pg import PGTrainer from ray.rllib.agents.a3c import A2CTrainer -from ray.rllib.policy.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.postprocessing import compute_advantages -from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.env.vector_env import VectorEnv from ray.tune.registry import register_env diff --git a/python/ray/rllib/utils/__init__.py b/python/ray/rllib/utils/__init__.py index e537400b7f311..28296278257fa 100644 --- a/python/ray/rllib/utils/__init__.py +++ b/python/ray/rllib/utils/__init__.py @@ -14,14 +14,17 @@ def renamed_class(cls, old_name=None): """Helper class for renaming classes with a warning.""" class DeprecationWrapper(cls): - def __init__(self, config=None, env=None, logger_creator=None): + def __init__(self, *args, **kw): if not old_name: - old_name = cls.__name__.replace("Trainer", "Agent") - new_name = cls.__name__ + # special case shorthand for the agent rename + prev = cls.__name__.replace("Trainer", "Agent") + else: + prev = old_name + new_name = cls.__module__ + "." + cls.__name__ logger.warn("DeprecationWarning: {} has been renamed to {}. ". - format(old_name, new_name) + + format(prev, new_name) + "This will raise an error in the future.") - cls.__init__(self, config, env, logger_creator) + cls.__init__(self, *args, **kw) DeprecationWrapper.__name__ = cls.__name__ diff --git a/python/ray/rllib/utils/debug.py b/python/ray/rllib/utils/debug.py index ce86326f27a00..0f636b0f00ef2 100644 --- a/python/ray/rllib/utils/debug.py +++ b/python/ray/rllib/utils/debug.py @@ -6,7 +6,7 @@ import pprint import time -from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch _logged = set() _disabled = False From f3abef57d8a9125e8bab34da8cad17dd67b0fcef Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:31:47 -0700 Subject: [PATCH 08/13] fix docs --- doc/source/rllib-concepts.rst | 8 ++++---- doc/source/rllib-models.rst | 2 +- doc/source/rllib-offline.rst | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 8ddcd06943a4f..e3e7948c864f4 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -4,11 +4,11 @@ RLlib Concepts This page describes the internal concepts used to implement algorithms in RLlib. You might find this useful if modifying or adding new algorithms to RLlib. Policies -------------- +-------- -Policy graph classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. +Policy classes encapsulate the core numerical components of RL algorithms. This typically includes the policy model that determines actions to take, a trajectory postprocessor for experiences, and a loss function to improve the policy given postprocessed experiences. For a simple example, see the policy gradients `graph definition `__. 
-Most interaction with deep learning frameworks is isolated to the `Policy interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policies, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: +Most interaction with deep learning frameworks is isolated to the `Policy interface `__, allowing RLlib to support multiple frameworks. To simplify the definition of policies, RLlib includes `Tensorflow `__ and `PyTorch-specific `__ templates. You can also write your own from scratch. Here is an example: .. code-block:: python @@ -48,7 +48,7 @@ Most interaction with deep learning frameworks is isolated to the `Policy interf Policy Evaluation ----------------- -Given an environment and policy, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithms. +Given an environment and policy, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithms. You can use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvaluator.as_remote()``). diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 0978d15729643..ae2ab83910f7f 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -175,7 +175,7 @@ Instead of using the ``use_lstm: True`` option, it can be preferable use a custo Batch Normalization ~~~~~~~~~~~~~~~~~~~ -You can use ``tf.layers.batch_normalization(x, training=input_dict["is_training"])`` to add batch norm layers to your custom model: `code example `__. RLlib will automatically run the update ops for the batch norm layers during optimization (see `tf_policy.py `__ and `multi_gpu_impl.py `__ for the exact handling of these updates). +You can use ``tf.layers.batch_normalization(x, training=input_dict["is_training"])`` to add batch norm layers to your custom model: `code example `__. RLlib will automatically run the update ops for the batch norm layers during optimization (see `tf_policy.py `__ and `multi_gpu_impl.py `__ for the exact handling of these updates). Custom Models (PyTorch) ----------------------- diff --git a/doc/source/rllib-offline.rst b/doc/source/rllib-offline.rst index 8aee4123c6f76..825038af3d539 100644 --- a/doc/source/rllib-offline.rst +++ b/doc/source/rllib-offline.rst @@ -6,7 +6,7 @@ Working with Offline Datasets RLlib's offline dataset APIs enable working with experiences read from offline storage (e.g., disk, cloud storage, streaming systems, HDFS). For example, you might want to read experiences saved from previous training runs, or gathered from policies deployed in `web applications `__. You can also log new agent experiences produced during online training for future use. 
-RLlib represents trajectory sequences (i.e., ``(s, a, r, s', ...)`` tuples) with `SampleBatch `__ objects. Using a batch format enables efficient encoding and compression of experiences. During online training, RLlib uses `policy evaluation `__ actors to generate batches of experiences in parallel using the current policy. RLlib also uses this same batch format for reading and writing experiences to offline storage. +RLlib represents trajectory sequences (i.e., ``(s, a, r, s', ...)`` tuples) with `SampleBatch `__ objects. Using a batch format enables efficient encoding and compression of experiences. During online training, RLlib uses `policy evaluation `__ actors to generate batches of experiences in parallel using the current policy. RLlib also uses this same batch format for reading and writing experiences to offline storage. Example: Training on previously saved experiences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -99,7 +99,7 @@ This `runnable example `__. This isn't typically critical for off-policy algorithms (e.g., DQN's `post-processing `__ is only needed if ``n_step > 1`` or ``worker_side_prioritization: True``). For off-policy algorithms, you can also safely set the ``postprocess_inputs: True`` config to auto-postprocess data. +RLlib assumes that input batches are of `postprocessed experiences `__. This isn't typically critical for off-policy algorithms (e.g., DQN's `post-processing `__ is only needed if ``n_step > 1`` or ``worker_side_prioritization: True``). For off-policy algorithms, you can also safely set the ``postprocess_inputs: True`` config to auto-postprocess data. However, for on-policy algorithms like PPO, you'll need to pass in the extra values added during policy evaluation and postprocessing to ``batch_builder.add_values()``, e.g., ``logits``, ``vf_preds``, ``value_target``, and ``advantages`` for PPO. This is needed since the calculation of these values depends on the parameters of the *behaviour* policy, which RLlib does not have access to in the offline setting (in online training, these values are automatically added during policy evaluation). 
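As a rough sketch of what this looks like (the output path and the exact extra columns below are illustrative assumptions; the columns actually required depend on the behaviour policy), such batches can be assembled by hand with ``SampleBatchBuilder`` and written out with ``JsonWriter``:

.. code-block:: python

    import numpy as np

    from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
    from ray.rllib.offline.json_writer import JsonWriter

    batch_builder = SampleBatchBuilder()
    writer = JsonWriter("/tmp/demo-out")  # assumed output directory

    # Toy 5-step episode with dummy observations/actions.
    for t in range(5):
        batch_builder.add_values(
            t=t,
            eps_id=0,
            agent_index=0,
            obs=np.zeros(4),
            actions=0,
            rewards=1.0,
            dones=t == 4,
            new_obs=np.zeros(4),
            # Extra columns for on-policy algorithms; the names and values
            # must match what the behaviour policy emitted during evaluation.
            vf_preds=0.0,
            advantages=0.0,
        )
    writer.write(batch_builder.build_and_reset())

The same pattern extends to any additional columns an algorithm expects; ``build_and_reset()`` returns a ``SampleBatch`` that the JSON input reader can later feed back into training.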
From 23628e32fa7ab02eed7c03a9586c02510221ec37 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:33:19 -0700 Subject: [PATCH 09/13] lint --- python/ray/rllib/evaluation/__init__.py | 5 ++--- python/ray/rllib/evaluation/policy_graph.py | 1 - python/ray/rllib/evaluation/sample_batch.py | 7 ++++--- python/ray/rllib/evaluation/tf_policy_graph.py | 1 - python/ray/rllib/evaluation/torch_policy_graph.py | 1 - python/ray/rllib/policy/__init__.py | 1 - 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/python/ray/rllib/evaluation/__init__.py b/python/ray/rllib/evaluation/__init__.py index 5e4fcd2a423dd..7e56bb7479a04 100644 --- a/python/ray/rllib/evaluation/__init__.py +++ b/python/ray/rllib/evaluation/__init__.py @@ -12,9 +12,8 @@ from ray.rllib.evaluation.metrics import collect_metrics __all__ = [ - "EvaluatorInterface", "PolicyEvaluator", - "PolicyGraph", "TFPolicyGraph", "TorchPolicyGraph", - "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", + "EvaluatorInterface", "PolicyEvaluator", "PolicyGraph", "TFPolicyGraph", + "TorchPolicyGraph", "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", "MultiAgentSampleBatchBuilder", "SyncSampler", "AsyncSampler", "compute_advantages", "collect_metrics", "MultiAgentEpisode" ] diff --git a/python/ray/rllib/evaluation/policy_graph.py b/python/ray/rllib/evaluation/policy_graph.py index 7cb44db2ef886..5d0fdf2a4e570 100644 --- a/python/ray/rllib/evaluation/policy_graph.py +++ b/python/ray/rllib/evaluation/policy_graph.py @@ -5,5 +5,4 @@ from ray.rllib.policy.policy import Policy from ray.rllib.utils import renamed_class - PolicyGraph = renamed_class(Policy, old_name="PolicyGraph") diff --git a/python/ray/rllib/evaluation/sample_batch.py b/python/ray/rllib/evaluation/sample_batch.py index 906fc8974db1d..2c0f119a94b2c 100644 --- a/python/ray/rllib/evaluation/sample_batch.py +++ b/python/ray/rllib/evaluation/sample_batch.py @@ -5,6 +5,7 @@ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils import renamed_class - -SampleBatch = renamed_class(SampleBatch, old_name="rllib.evaluation.SampleBatch") -MultiAgentBatch = renamed_class(MultiAgentBatch, old_name="rllib.evaluation.MultiAgentBatch") +SampleBatch = renamed_class( + SampleBatch, old_name="rllib.evaluation.SampleBatch") +MultiAgentBatch = renamed_class( + MultiAgentBatch, old_name="rllib.evaluation.MultiAgentBatch") diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 2ac97e5259e34..2c4955a17ff10 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -5,5 +5,4 @@ from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils import renamed_class - TFPolicyGraph = renamed_class(TFPolicy, old_name="TFPolicyGraph") diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index e9124e7ffb4bc..08cc29fed7460 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -5,5 +5,4 @@ from ray.rllib.policy.torch_policy import TorchPolicy from ray.rllib.utils import renamed_class - TorchPolicyGraph = renamed_class(TorchPolicy, old_name="TorchPolicyGraph") diff --git a/python/ray/rllib/policy/__init__.py b/python/ray/rllib/policy/__init__.py index 6d7bafabe442d..0f172dcd566d4 100644 --- a/python/ray/rllib/policy/__init__.py +++ b/python/ray/rllib/policy/__init__.py @@ -8,7 +8,6 @@ from 
ray.rllib.policy.torch_policy_template import build_torch_policy from ray.rllib.policy.tf_policy_template import build_tf_policy - __all__ = [ "Policy", "TFPolicy", From a503b002a91f465ac1fafbdb438ea35cad5f41d1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:34:51 -0700 Subject: [PATCH 10/13] fix test lint --- python/ray/rllib/tests/test_multi_agent_env.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index 9e9af78a3cce0..f2e56a4d2cb32 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -614,9 +614,12 @@ def policy_mapper(agent_id): optimizer.step() result = collect_metrics(ev, remote_evs) if i % 20 == 0: - ev.foreach_policy( - lambda p, _: p.update_target() if isinstance(p, DQNPolicy) else None - ) + + def do_update(p): + if isinstance(p, DQNPolicy): + p.update_target() + + ev.foreach_policy(lambda p, _: do_update(p)) print("Iter {}, rew {}".format(i, result["policy_reward_mean"])) print("Total reward", result["episode_reward_mean"]) From 7d389252464ae8a44f9d766f3b7ad209ada25a75 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:40:03 -0700 Subject: [PATCH 11/13] rename policys to TFPolicy --- python/ray/rllib/agents/a3c/a3c.py | 4 ++-- python/ray/rllib/agents/a3c/a3c_tf_policy.py | 4 ++-- python/ray/rllib/agents/ddpg/ddpg.py | 4 ++-- python/ray/rllib/agents/ddpg/ddpg_policy.py | 2 +- python/ray/rllib/agents/dqn/dqn.py | 4 ++-- python/ray/rllib/agents/dqn/dqn_policy.py | 2 +- python/ray/rllib/agents/impala/impala.py | 8 ++++---- python/ray/rllib/agents/impala/vtrace_policy.py | 8 ++++---- python/ray/rllib/agents/ppo/appo_policy.py | 4 ++-- python/ray/rllib/agents/qmix/qmix.py | 4 ++-- python/ray/rllib/agents/qmix/qmix_policy.py | 2 +- python/ray/rllib/evaluation/tf_policy_template.py | 2 +- python/ray/rllib/examples/multiagent_two_trainers.py | 4 ++-- python/ray/rllib/policy/tf_policy_template.py | 2 +- python/ray/rllib/policy/torch_policy_template.py | 2 +- python/ray/rllib/tests/test_multi_agent_env.py | 12 ++++++------ 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index ad3544306e103..56d7a09daa0fe 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -4,7 +4,7 @@ import time -from ray.rllib.agents.a3c.a3c_tf_policy import A3CPolicy +from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy from ray.rllib.agents.trainer import Trainer, with_common_config from ray.rllib.optimizers import AsyncGradientsOptimizer from ray.rllib.utils.annotations import override @@ -43,7 +43,7 @@ class A3CTrainer(Trainer): _name = "A3C" _default_config = DEFAULT_CONFIG - _policy = A3CPolicy + _policy = A3CTFPolicy @override(Trainer) def _init(self, config, env_creator): diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py index c1b1e1b746ba4..eb5becceaa714 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py @@ -1,4 +1,4 @@ -"""Note: Keep in sync with changes to VTracePolicy.""" +"""Note: Keep in sync with changes to VTraceTFPolicy.""" from __future__ import absolute_import from __future__ import division @@ -73,7 +73,7 @@ def postprocess_trajectory(self, self.config["lambda"]) -class A3CPolicy(LearningRateSchedule, A3CPostprocessing, TFPolicy): 
+class A3CTFPolicy(LearningRateSchedule, A3CPostprocessing, TFPolicy): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index 69672c986b07a..66d3810e5e93d 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.ddpg.ddpg_policy import DDPGPolicy +from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy from ray.rllib.utils.annotations import override from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule @@ -163,7 +163,7 @@ class DDPGTrainer(DQNTrainer): """DDPG implementation in TensorFlow.""" _name = "DDPG" _default_config = DEFAULT_CONFIG - _policy = DDPGPolicy + _policy = DDPGTFPolicy @override(DQNTrainer) def _train(self): diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy.py b/python/ray/rllib/agents/ddpg/ddpg_policy.py index 45f186351b3be..b80cfce4cdaa5 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy.py @@ -68,7 +68,7 @@ def postprocess_trajectory(self, return _postprocess_dqn(self, sample_batch) -class DDPGPolicy(DDPGPostprocessing, TFPolicy): +class DDPGTFPolicy(DDPGPostprocessing, TFPolicy): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index 442565007dd82..7fdb6f66b433a 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -8,7 +8,7 @@ from ray import tune from ray.rllib import optimizers from ray.rllib.agents.trainer import Trainer, with_common_config -from ray.rllib.agents.dqn.dqn_policy import DQNPolicy +from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.annotations import override @@ -133,7 +133,7 @@ class DQNTrainer(Trainer): _name = "DQN" _default_config = DEFAULT_CONFIG - _policy = DQNPolicy + _policy = DQNTFPolicy _optimizer_shared_configs = OPTIMIZER_SHARED_CONFIGS @override(Trainer) diff --git a/python/ray/rllib/agents/dqn/dqn_policy.py b/python/ray/rllib/agents/dqn/dqn_policy.py index b7a0af5ab5fa1..a1affa947a430 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy.py +++ b/python/ray/rllib/agents/dqn/dqn_policy.py @@ -345,7 +345,7 @@ def __init__(self, q_values, observations, num_actions, stochastic, eps, self.action_prob = None -class DQNPolicy(LearningRateSchedule, DQNPostprocessing, TFPolicy): +class DQNTFPolicy(LearningRateSchedule, DQNPostprocessing, TFPolicy): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) if not isinstance(action_space, Discrete): diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py index 77b47c4dedf9c..838f2975ce677 100644 --- a/python/ray/rllib/agents/impala/impala.py +++ b/python/ray/rllib/agents/impala/impala.py @@ -4,8 +4,8 @@ import time -from ray.rllib.agents.a3c.a3c_tf_policy import A3CPolicy -from ray.rllib.agents.impala.vtrace_policy import VTracePolicy +from ray.rllib.agents.a3c.a3c_tf_policy import 
A3CTFPolicy +from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy from ray.rllib.agents.trainer import Trainer, with_common_config from ray.rllib.optimizers import AsyncSamplesOptimizer from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator @@ -105,7 +105,7 @@ class ImpalaTrainer(Trainer): _name = "IMPALA" _default_config = DEFAULT_CONFIG - _policy = VTracePolicy + _policy = VTraceTFPolicy @override(Trainer) def _init(self, config, env_creator): @@ -162,5 +162,5 @@ def _get_policy(self): if self.config["vtrace"]: policy_cls = self._policy else: - policy_cls = A3CPolicy + policy_cls = A3CTFPolicy return policy_cls diff --git a/python/ray/rllib/agents/impala/vtrace_policy.py b/python/ray/rllib/agents/impala/vtrace_policy.py index 0af2ddb401d0a..9b7c57b9355eb 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy.py +++ b/python/ray/rllib/agents/impala/vtrace_policy.py @@ -1,6 +1,6 @@ -"""Adapted from A3CPolicy to add V-trace. +"""Adapted from A3CTFPolicy to add V-trace. -Keep in sync with changes to A3CPolicy and VtraceSurrogatePolicy.""" +Keep in sync with changes to A3CTFPolicy and VtraceSurrogatePolicy.""" from __future__ import absolute_import from __future__ import division @@ -126,7 +126,7 @@ def postprocess_trajectory(self, return sample_batch -class VTracePolicy(LearningRateSchedule, VTracePostprocessing, TFPolicy): +class VTraceTFPolicy(LearningRateSchedule, VTracePostprocessing, TFPolicy): def __init__(self, observation_space, action_space, @@ -333,7 +333,7 @@ def make_time_major(tensor, drop_last=False): @override(TFPolicy) def copy(self, existing_inputs): - return VTracePolicy( + return VTraceTFPolicy( self.observation_space, self.action_space, self.config, diff --git a/python/ray/rllib/agents/ppo/appo_policy.py b/python/ray/rllib/agents/ppo/appo_policy.py index b4041c790d9b7..b740d6d814302 100644 --- a/python/ray/rllib/agents/ppo/appo_policy.py +++ b/python/ray/rllib/agents/ppo/appo_policy.py @@ -1,6 +1,6 @@ -"""Adapted from VTracePolicy to use the PPO surrogate loss. +"""Adapted from VTraceTFPolicy to use the PPO surrogate loss. -Keep in sync with changes to VTracePolicy.""" +Keep in sync with changes to VTraceTFPolicy.""" from __future__ import absolute_import from __future__ import division diff --git a/python/ray/rllib/agents/qmix/qmix.py b/python/ray/rllib/agents/qmix/qmix.py index 885094f75c203..2ad6a3e56f952 100644 --- a/python/ray/rllib/agents/qmix/qmix.py +++ b/python/ray/rllib/agents/qmix/qmix.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.qmix.qmix_policy import QMixPolicy +from ray.rllib.agents.qmix.qmix_policy import QMixTorchPolicy # yapf: disable # __sphinx_doc_begin__ @@ -95,7 +95,7 @@ class QMixTrainer(DQNTrainer): _name = "QMIX" _default_config = DEFAULT_CONFIG - _policy = QMixPolicy + _policy = QMixTorchPolicy _optimizer_shared_configs = [ "learning_starts", "buffer_size", "train_batch_size" ] diff --git a/python/ray/rllib/agents/qmix/qmix_policy.py b/python/ray/rllib/agents/qmix/qmix_policy.py index 24c42fbcb13c7..26ec387de0041 100644 --- a/python/ray/rllib/agents/qmix/qmix_policy.py +++ b/python/ray/rllib/agents/qmix/qmix_policy.py @@ -130,7 +130,7 @@ def forward(self, rewards, actions, terminated, mask, obs, next_obs, return loss, mask, masked_td_error, chosen_action_qvals, targets -class QMixPolicy(Policy): +class QMixTorchPolicy(Policy): """QMix impl. Assumes homogeneous agents for now. 
You must use MultiAgentEnv.with_agent_groups() to group agents diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 75ff0212a7a1c..36f482f18bf8b 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -27,7 +27,7 @@ def build_tf_policy(name, """Helper function for creating a dynamic tf policy at runtime. Arguments: - name (str): name of the policy (e.g., "PPOPolicy") + name (str): name of the policy (e.g., "PPOTFPolicy") loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default diff --git a/python/ray/rllib/examples/multiagent_two_trainers.py b/python/ray/rllib/examples/multiagent_two_trainers.py index 7c78e1dd625e0..68c0e742e8578 100644 --- a/python/ray/rllib/examples/multiagent_two_trainers.py +++ b/python/ray/rllib/examples/multiagent_two_trainers.py @@ -16,7 +16,7 @@ import ray from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.dqn.dqn_policy import DQNPolicy +from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy from ray.rllib.agents.ppo.ppo import PPOTrainer from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.tests.test_multi_agent_env import MultiCartpole @@ -40,7 +40,7 @@ # show one each for PPO and DQN. policies = { "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}), - "dqn_policy": (DQNPolicy, obs_space, act_space, {}), + "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}), } def policy_mapping_fn(agent_id): diff --git a/python/ray/rllib/policy/tf_policy_template.py b/python/ray/rllib/policy/tf_policy_template.py index 75ff0212a7a1c..36f482f18bf8b 100644 --- a/python/ray/rllib/policy/tf_policy_template.py +++ b/python/ray/rllib/policy/tf_policy_template.py @@ -27,7 +27,7 @@ def build_tf_policy(name, """Helper function for creating a dynamic tf policy at runtime. Arguments: - name (str): name of the policy (e.g., "PPOPolicy") + name (str): name of the policy (e.g., "PPOTFPolicy") loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default diff --git a/python/ray/rllib/policy/torch_policy_template.py b/python/ray/rllib/policy/torch_policy_template.py index ced3cd7c989c2..049591c04671a 100644 --- a/python/ray/rllib/policy/torch_policy_template.py +++ b/python/ray/rllib/policy/torch_policy_template.py @@ -24,7 +24,7 @@ def build_torch_policy(name, """Helper function for creating a torch policy at runtime. 
Arguments: - name (str): name of the policy (e.g., "PPOPolicy") + name (str): name of the policy (e.g., "PPOTFPolicy") loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index f2e56a4d2cb32..be4bfcd3428ff 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -9,7 +9,7 @@ import ray from ray.rllib.agents.pg import PGTrainer from ray.rllib.agents.pg.pg_policy import PGTFPolicy -from ray.rllib.agents.dqn.dqn_policy import DQNPolicy +from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) from ray.rllib.tests.test_policy_evaluator import (MockEnv, MockEnv2, @@ -579,13 +579,13 @@ def _testWithOptimizer(self, optimizer_cls): # happen since the replay buffer doesn't encode extra fields like # "advantages" that PG uses. policies = { - "p1": (DQNPolicy, obs_space, act_space, dqn_config), - "p2": (DQNPolicy, obs_space, act_space, dqn_config), + "p1": (DQNTFPolicy, obs_space, act_space, dqn_config), + "p2": (DQNTFPolicy, obs_space, act_space, dqn_config), } else: policies = { "p1": (PGTFPolicy, obs_space, act_space, {}), - "p2": (DQNPolicy, obs_space, act_space, dqn_config), + "p2": (DQNTFPolicy, obs_space, act_space, dqn_config), } ev = PolicyEvaluator( env_creator=lambda _: MultiCartpole(n), @@ -610,13 +610,13 @@ def policy_mapper(agent_id): for i in range(200): ev.foreach_policy(lambda p, _: p.set_epsilon( max(0.02, 1 - i * .02)) - if isinstance(p, DQNPolicy) else None) + if isinstance(p, DQNTFPolicy) else None) optimizer.step() result = collect_metrics(ev, remote_evs) if i % 20 == 0: def do_update(p): - if isinstance(p, DQNPolicy): + if isinstance(p, DQNTFPolicy): p.update_target() ev.foreach_policy(lambda p, _: do_update(p)) From 09e18a5e6636260edd05291ea19fa01d57ff9697 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 15:40:55 -0700 Subject: [PATCH 12/13] update doc --- doc/source/rllib-models.rst | 8 ++++---- python/ray/rllib/policy/sample_batch.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index ae2ab83910f7f..cdf42ea228c70 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -320,7 +320,7 @@ For deeper customization of algorithms, you can modify the policies of the train .. code-block:: python from ray.rllib.models import ModelCatalog - from ray.rllib.agents.ddpg.ddpg_policy import DDPGPolicy as BaseDDPGPolicy + from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy as BaseDDPGTFPolicy class CustomPNetwork(object): def __init__(self, dim_actions, hiddens, activation): @@ -336,7 +336,7 @@ For deeper customization of algorithms, you can modify the policies of the train self.value = layers.fully_connected( q_out, num_outputs=1, activation_fn=None) - class CustomDDPGPolicy(BaseDDPGPolicy): + class CustomDDPGTFPolicy(BaseDDPGTFPolicy): def _build_p_network(self, obs): return CustomPNetwork( self.dim_actions, @@ -354,9 +354,9 @@ Then, you can create an trainer with your custom policy by: .. 
code-block:: python from ray.rllib.agents.ddpg.ddpg import DDPGTrainer - from custom_policy import CustomDDPGPolicy + from custom_policy import CustomDDPGTFPolicy - DDPGTrainer._policy = CustomDDPGPolicy + DDPGTrainer._policy = CustomDDPGTFPolicy trainer = DDPGTrainer(...) In this example we overrode existing methods of the existing DDPG policy, i.e., `_build_q_network`, `_build_p_network`, `_build_action_network`, `_build_actor_critic_loss`, but you can also replace the entire graph class entirely. diff --git a/python/ray/rllib/policy/sample_batch.py b/python/ray/rllib/policy/sample_batch.py index c80f22bdbd1a1..a9515eeeac5aa 100644 --- a/python/ray/rllib/policy/sample_batch.py +++ b/python/ray/rllib/policy/sample_batch.py @@ -10,7 +10,7 @@ from ray.rllib.utils.compression import pack, unpack, is_compressed from ray.rllib.utils.memory import concat_aligned -# Defaults policy id for single agent environments +# Default policy id for single agent environments DEFAULT_POLICY_ID = "default_policy" From edb1df4abd5a969478201d7b14d248276bc881d3 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 18 May 2019 16:59:40 -0700 Subject: [PATCH 13/13] fix renamed class wrapper for agent --- python/ray/rllib/agents/a3c/__init__.py | 6 ++--- python/ray/rllib/agents/agent.py | 4 ++-- python/ray/rllib/agents/ars/__init__.py | 4 ++-- python/ray/rllib/agents/ddpg/__init__.py | 6 ++--- python/ray/rllib/agents/dqn/__init__.py | 6 ++--- python/ray/rllib/agents/es/__init__.py | 4 ++-- python/ray/rllib/agents/impala/__init__.py | 4 ++-- python/ray/rllib/agents/pg/__init__.py | 4 ++-- python/ray/rllib/agents/ppo/__init__.py | 4 ++-- python/ray/rllib/utils/__init__.py | 27 ++++++++++++++++------ 10 files changed, 41 insertions(+), 28 deletions(-) diff --git a/python/ray/rllib/agents/a3c/__init__.py b/python/ray/rllib/agents/a3c/__init__.py index 9c8205389ea2b..4a8480eab6956 100644 --- a/python/ray/rllib/agents/a3c/__init__.py +++ b/python/ray/rllib/agents/a3c/__init__.py @@ -1,9 +1,9 @@ from ray.rllib.agents.a3c.a3c import A3CTrainer, DEFAULT_CONFIG from ray.rllib.agents.a3c.a2c import A2CTrainer -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -A2CAgent = renamed_class(A2CTrainer) -A3CAgent = renamed_class(A3CTrainer) +A2CAgent = renamed_agent(A2CTrainer) +A3CAgent = renamed_agent(A3CTrainer) __all__ = [ "A2CAgent", "A3CAgent", "A2CTrainer", "A3CTrainer", "DEFAULT_CONFIG" diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index 5b0ecf268fe78..17da952ddedf6 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -3,6 +3,6 @@ from __future__ import print_function from ray.rllib.agents.trainer import Trainer -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -Agent = renamed_class(Trainer) +Agent = renamed_agent(Trainer) diff --git a/python/ray/rllib/agents/ars/__init__.py b/python/ray/rllib/agents/ars/__init__.py index a1120ff8ce313..0681efe7ab375 100644 --- a/python/ray/rllib/agents/ars/__init__.py +++ b/python/ray/rllib/agents/ars/__init__.py @@ -1,6 +1,6 @@ from ray.rllib.agents.ars.ars import (ARSTrainer, DEFAULT_CONFIG) -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -ARSAgent = renamed_class(ARSTrainer) +ARSAgent = renamed_agent(ARSTrainer) __all__ = ["ARSAgent", "ARSTrainer", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/agents/ddpg/__init__.py b/python/ray/rllib/agents/ddpg/__init__.py index 9b90ca842ae53..3d681b8356c90 100644 --- 
a/python/ray/rllib/agents/ddpg/__init__.py +++ b/python/ray/rllib/agents/ddpg/__init__.py @@ -5,10 +5,10 @@ from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG from ray.rllib.agents.ddpg.td3 import TD3Trainer -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -ApexDDPGAgent = renamed_class(ApexDDPGTrainer) -DDPGAgent = renamed_class(DDPGTrainer) +ApexDDPGAgent = renamed_agent(ApexDDPGTrainer) +DDPGAgent = renamed_agent(DDPGTrainer) __all__ = [ "DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer", diff --git a/python/ray/rllib/agents/dqn/__init__.py b/python/ray/rllib/agents/dqn/__init__.py index 415ceae6c1de2..d3de8cb802cc3 100644 --- a/python/ray/rllib/agents/dqn/__init__.py +++ b/python/ray/rllib/agents/dqn/__init__.py @@ -4,10 +4,10 @@ from ray.rllib.agents.dqn.apex import ApexTrainer from ray.rllib.agents.dqn.dqn import DQNTrainer, DEFAULT_CONFIG -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -DQNAgent = renamed_class(DQNTrainer) -ApexAgent = renamed_class(ApexTrainer) +DQNAgent = renamed_agent(DQNTrainer) +ApexAgent = renamed_agent(ApexTrainer) __all__ = [ "DQNAgent", "ApexAgent", "ApexTrainer", "DQNTrainer", "DEFAULT_CONFIG" diff --git a/python/ray/rllib/agents/es/__init__.py b/python/ray/rllib/agents/es/__init__.py index d7bec2a9e0025..38b2b772ec575 100644 --- a/python/ray/rllib/agents/es/__init__.py +++ b/python/ray/rllib/agents/es/__init__.py @@ -1,6 +1,6 @@ from ray.rllib.agents.es.es import (ESTrainer, DEFAULT_CONFIG) -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -ESAgent = renamed_class(ESTrainer) +ESAgent = renamed_agent(ESTrainer) __all__ = ["ESAgent", "ESTrainer", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/agents/impala/__init__.py b/python/ray/rllib/agents/impala/__init__.py index 81c64e8891ab9..d7bdd7210fdd2 100644 --- a/python/ray/rllib/agents/impala/__init__.py +++ b/python/ray/rllib/agents/impala/__init__.py @@ -1,6 +1,6 @@ from ray.rllib.agents.impala.impala import ImpalaTrainer, DEFAULT_CONFIG -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -ImpalaAgent = renamed_class(ImpalaTrainer) +ImpalaAgent = renamed_agent(ImpalaTrainer) __all__ = ["ImpalaAgent", "ImpalaTrainer", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/agents/pg/__init__.py b/python/ray/rllib/agents/pg/__init__.py index 2203188a7ca6c..eb11c99bf625f 100644 --- a/python/ray/rllib/agents/pg/__init__.py +++ b/python/ray/rllib/agents/pg/__init__.py @@ -1,6 +1,6 @@ from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -PGAgent = renamed_class(PGTrainer) +PGAgent = renamed_agent(PGTrainer) __all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/agents/ppo/__init__.py b/python/ray/rllib/agents/ppo/__init__.py index a02cbc23c6846..a3d492baf24aa 100644 --- a/python/ray/rllib/agents/ppo/__init__.py +++ b/python/ray/rllib/agents/ppo/__init__.py @@ -1,7 +1,7 @@ from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG from ray.rllib.agents.ppo.appo import APPOTrainer -from ray.rllib.utils import renamed_class +from ray.rllib.utils import renamed_agent -PPOAgent = renamed_class(PPOTrainer) +PPOAgent = renamed_agent(PPOTrainer) __all__ = ["PPOAgent", "APPOTrainer", "PPOTrainer", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/utils/__init__.py 
b/python/ray/rllib/utils/__init__.py index 28296278257fa..aad5590fd0977 100644 --- a/python/ray/rllib/utils/__init__.py +++ b/python/ray/rllib/utils/__init__.py @@ -10,19 +10,15 @@ logger = logging.getLogger(__name__) -def renamed_class(cls, old_name=None): +def renamed_class(cls, old_name): """Helper class for renaming classes with a warning.""" class DeprecationWrapper(cls): + # note: **kw not supported for ray.remote classes def __init__(self, *args, **kw): - if not old_name: - # special case shorthand for the agent rename - prev = cls.__name__.replace("Trainer", "Agent") - else: - prev = old_name new_name = cls.__module__ + "." + cls.__name__ logger.warn("DeprecationWarning: {} has been renamed to {}. ". - format(prev, new_name) + + format(old_name, new_name) + "This will raise an error in the future.") cls.__init__(self, *args, **kw) @@ -31,6 +27,23 @@ def __init__(self, *args, **kw): return DeprecationWrapper +def renamed_agent(cls): + """Helper class for renaming Agent => Trainer with a warning.""" + + class DeprecationWrapper(cls): + def __init__(self, config=None, env=None, logger_creator=None): + old_name = cls.__name__.replace("Trainer", "Agent") + new_name = cls.__module__ + "." + cls.__name__ + logger.warn("DeprecationWarning: {} has been renamed to {}. ". + format(old_name, new_name) + + "This will raise an error in the future.") + cls.__init__(self, config, env, logger_creator) + + DeprecationWrapper.__name__ = cls.__name__ + + return DeprecationWrapper + + def try_import_tf(): if "RLLIB_TEST_NO_TF_IMPORT" in os.environ: logger.warning("Not importing TensorFlow for test purposes")
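
To make the intended behavior of the new ``renamed_agent`` helper concrete, here is a minimal, self-contained sketch of the same deprecation-wrapper pattern. The ``PPOTrainer`` class below is a stand-in (not the real ``ray.rllib.agents.ppo.PPOTrainer``), the config/env values are illustrative only, and ``logger.warning`` is used in place of the patch's ``logger.warn``; the wrapper logic itself mirrors the hunk above.

.. code-block:: python

    import logging

    logging.basicConfig()
    logger = logging.getLogger("renamed_agent_demo")


    class PPOTrainer(object):
        """Stand-in for the real trainer class (illustration only)."""

        def __init__(self, config=None, env=None, logger_creator=None):
            self.config = config or {}
            self.env = env


    def renamed_agent(cls):
        """Same wrapper pattern as the helper added to rllib/utils/__init__.py."""

        class DeprecationWrapper(cls):
            def __init__(self, config=None, env=None, logger_creator=None):
                old_name = cls.__name__.replace("Trainer", "Agent")
                new_name = cls.__module__ + "." + cls.__name__
                logger.warning(
                    "DeprecationWarning: {} has been renamed to {}. "
                    "This will raise an error in the future.".format(
                        old_name, new_name))
                cls.__init__(self, config, env, logger_creator)

        DeprecationWrapper.__name__ = cls.__name__
        return DeprecationWrapper


    # Old-style call sites keep working, but now log a deprecation message
    # before delegating to the Trainer constructor.
    PPOAgent = renamed_agent(PPOTrainer)
    agent = PPOAgent(config={"num_workers": 0}, env="CartPole-v0")
    assert type(agent).__name__ == "PPOTrainer"

Keeping the wrapper's ``__init__`` restricted to ``(config, env, logger_creator)`` rather than ``*args, **kw`` matches the note in the patch that ``**kw`` is not supported for ``ray.remote`` classes.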