From d0fec3ecb9cf7752d03a59e3d08bd998b7636e28 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 16:39:08 -0700 Subject: [PATCH 01/39] dynamic graph --- python/ray/rllib/agents/pg/pg.py | 13 +- python/ray/rllib/agents/pg/pg_policy_graph.py | 101 ++------------ .../evaluation/dynamic_tf_policy_graph.py | 128 ++++++++++++++++++ .../ray/rllib/evaluation/tf_policy_graph.py | 51 ++++--- 4 files changed, 183 insertions(+), 110 deletions(-) create mode 100644 python/ray/rllib/evaluation/dynamic_tf_policy_graph.py diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index e70fdcc8b2c6..8ca36647dcb6 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -3,8 +3,9 @@ from __future__ import print_function from ray.rllib.agents.trainer import Trainer, with_common_config -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph - +from ray.rllib.agents.pg.pg_policy_graph import postprocess_advantages, \ + policy_gradient_loss, make_optimizer +from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils.annotations import override @@ -22,6 +23,14 @@ # yapf: enable +PGPolicyGraph = build_tf_graph( + name="PG", + default_config=DEFAULT_CONFIG, + postprocess_fn=postprocess_advantages, + loss_fn=policy_gradient_loss, + make_optimizer=make_optimizer) + + class PGTrainer(Trainer): """Simple policy gradient agent. diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index a55af79b1e61..f3bef7c6c296 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -3,102 +3,29 @@ from __future__ import print_function import ray -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph -from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf tf = try_import_tf() -class PGLoss(object): - """The basic policy gradient loss.""" +# The basic policy gradients loss +def policy_gradient_loss(postprocessed_batch, action_dist): + actions = postprocessed_batch[SampleBatch.ACTIONS] + advantages = postprocessed_batch[Postprocessing.ADVANTAGES] + return -tf.reduce_mean(action_dist.logp(actions) * advantages) - def __init__(self, action_dist, actions, advantages): - self.loss = -tf.reduce_mean(action_dist.logp(actions) * advantages) +# This adds the "advantages" column to the sample batch. 
+def postprocess_advantages(graph, + sample_batch, + other_agent_batches=None, + episode=None): + return compute_advantages( + sample_batch, 0.0, graph.config["gamma"], use_gae=False) -class PGPostprocessing(object): - """Adds the advantages field to the trajectory.""" - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - # This adds the "advantages" column to the sample batch - return compute_advantages( - sample_batch, 0.0, self.config["gamma"], use_gae=False) - - -class PGPolicyGraph(PGPostprocessing, TFPolicyGraph): - """Simple policy gradient example of defining a policy graph.""" - - def __init__(self, obs_space, action_space, config): - config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config) - self.config = config - - # Setup placeholders - obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) - dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") - - # Create the model network and action outputs - self.model = ModelCatalog.get_model({ - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, obs_space, action_space, self.logit_dim, self.config["model"]) - action_dist = dist_class(self.model.outputs) # logit for each action - - # Setup policy loss - actions = ModelCatalog.get_action_placeholder(action_space) - advantages = tf.placeholder(tf.float32, [None], name="adv") - loss = PGLoss(action_dist, actions, advantages).loss - - # Mapping from sample batch keys to placeholders. These keys will be - # read from postprocessed sample batches and fed into the specified - # placeholders during loss computation. 
- loss_in = [ - (SampleBatch.CUR_OBS, obs), - (SampleBatch.ACTIONS, actions), - (SampleBatch.PREV_ACTIONS, prev_actions), - (SampleBatch.PREV_REWARDS, prev_rewards), - (Postprocessing.ADVANTAGES, advantages), - ] - - # Initialize TFPolicyGraph - sess = tf.get_default_session() - TFPolicyGraph.__init__( - self, - obs_space, - action_space, - sess, - obs_input=obs, - action_sampler=action_dist.sample(), - action_prob=action_dist.sampled_action_prob(), - loss=loss, - loss_inputs=loss_in, - model=self.model, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, - seq_lens=self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"]) - sess.run(tf.global_variables_initializer()) - - @override(PolicyGraph) - def get_initial_state(self): - return self.model.state_init - - @override(TFPolicyGraph) - def optimizer(self): - return tf.train.AdamOptimizer(learning_rate=self.config["lr"]) +def make_optimizer(graph): + return tf.train.AdamOptimizer(learning_rate=graph.config["lr"]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py new file mode 100644 index 000000000000..2185ca8785eb --- /dev/null +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -0,0 +1,128 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.annotations import override +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + + +def build_tf_graph( + name, default_config, postprocess_fn, loss_fn, make_optimizer=None): + + class graph_cls(DynamicTFPolicyGraph): + def __init__(self, obs_space, action_space, config): + config = dict(default_config, **config) + DynamicTFPolicyGraph.__init__( + self, obs_space, action_space, config, loss_fn) + + @override(PolicyGraph) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + return postprocess_fn( + self, sample_batch, other_agent_batches, episode) + + @override(TFPolicyGraph) + def optimizer(self): + if make_optimizer: + return make_optimizer(self) + else: + return TFPolicyGraph.optimizer(self) + + graph_cls.__name__ = name + return graph_cls + + +class DynamicTFPolicyGraph(TFPolicyGraph): + def __init__(self, obs_space, action_space, config, loss_fn): + self.config = config + self._build_loss = loss_fn + + # Setup standard placeholders + obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) + dist_class, self.logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) + prev_actions = ModelCatalog.get_action_placeholder(action_space) + prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") + + # Create the model network and action outputs + self.model = ModelCatalog.get_model({ + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + }, obs_space, action_space, self.logit_dim, self.config["model"]) + self.action_dist = dist_class(self.model.outputs) + + sess = tf.get_default_session() + TFPolicyGraph.__init__( + self, + obs_space, + action_space, + sess, + obs_input=obs, + 
action_sampler=self.action_dist.sample(), + action_prob=self.action_dist.sampled_action_prob(), + loss=None, # dynamically initialized on run + loss_inputs=[], + model=self.model, + state_inputs=self.model.state_in, + state_outputs=self.model.state_out, + prev_action_input=prev_actions, + prev_reward_input=prev_rewards, + seq_lens=self.model.seq_lens, + max_seq_len=config["model"]["max_seq_len"]) + sess.run(tf.global_variables_initializer()) + + @override(PolicyGraph) + def get_initial_state(self): + return self.model.state_init + + def _initialize_loss_if_needed(self, postprocessed_batch): + if self._loss is not None: + return # already created + + with self._sess.graph.as_default(): + unroll_tensors = { + SampleBatch.PREV_ACTIONS: self._prev_action_input, + SampleBatch.PREV_REWARDS: self._prev_reward_input, + SampleBatch.CUR_OBS: self._obs_input, + } + loss_inputs = [ + (SampleBatch.PREV_ACTIONS, self._prev_action_input), + (SampleBatch.PREV_REWARDS, self._prev_reward_input), + (SampleBatch.CUR_OBS, self._obs_input), + ] + + for k, v in postprocessed_batch.items(): + if k in unroll_tensors: + continue + elif v.dtype == np.object: + continue # can't handle arbitrary objects in TF + shape = (None,) + v.shape[1:] + placeholder = tf.placeholder(v.dtype, shape=shape, name=k) + unroll_tensors[k] = placeholder + loss_inputs.append((k, placeholder)) # TODO: prune to used only + + loss = self._build_loss(unroll_tensors, self.action_dist) + TFPolicyGraph._initialize_loss(self, loss, loss_inputs) + self._sess.run(tf.global_variables_initializer()) + + @override(PolicyGraph) + def compute_gradients(self, postprocessed_batch): + self._initialize_loss_if_needed(postprocessed_batch) + return TFPolicyGraph.compute_gradients(self, postprocessed_batch) + + @override(PolicyGraph) + def learn_on_batch(self, postprocessed_batch): + self._initialize_loss_if_needed(postprocessed_batch) + return TFPolicyGraph.learn_on_batch(self, postprocessed_batch) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 2b1eca9e8d5b..e5cf697ac32a 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -112,17 +112,38 @@ def __init__(self, self._prev_action_input = prev_action_input self._prev_reward_input = prev_reward_input self._sampler = action_sampler - self._loss_inputs = loss_inputs - self._loss_input_dict = dict(self._loss_inputs) self._is_training = self._get_is_training_placeholder() self._action_prob = action_prob self._state_inputs = state_inputs or [] self._state_outputs = state_outputs or [] - for i, ph in enumerate(self._state_inputs): - self._loss_input_dict["state_in_{}".format(i)] = ph self._seq_lens = seq_lens self._max_seq_len = max_seq_len self._batch_divisibility_req = batch_divisibility_req + self._update_ops = update_ops + + if loss is not None: + self._initialize_loss(loss, loss_inputs) + else: + self._loss = None + + if len(self._state_inputs) != len(self._state_outputs): + raise ValueError( + "Number of state input and output tensors must match, got: " + "{} vs {}".format(self._state_inputs, self._state_outputs)) + if len(self.get_initial_state()) != len(self._state_inputs): + raise ValueError( + "Length of initial state must match number of state inputs, " + "got: {} vs {}".format(self.get_initial_state(), + self._state_inputs)) + if self._state_inputs and self._seq_lens is None: + raise ValueError( + "seq_lens tensor must be given if state inputs are defined") + + def 
_initialize_loss(self, loss, loss_inputs): + self._loss_inputs = loss_inputs + self._loss_input_dict = dict(self._loss_inputs) + for i, ph in enumerate(self._state_inputs): + self._loss_input_dict["state_in_{}".format(i)] = ph if self.model: self._loss = self.model.custom_loss(loss, self._loss_input_dict) @@ -141,9 +162,7 @@ def __init__(self, self._loss, self._sess) # gather update ops for any batch norm layers - if update_ops: - self._update_ops = update_ops - else: + if not self._update_ops: self._update_ops = tf.get_collection( tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name) if self._update_ops: @@ -153,20 +172,7 @@ def __init__(self, self._apply_op = self.build_apply_op(self._optimizer, self._grads_and_vars) - if len(self._state_inputs) != len(self._state_outputs): - raise ValueError( - "Number of state input and output tensors must match, got: " - "{} vs {}".format(self._state_inputs, self._state_outputs)) - if len(self.get_initial_state()) != len(self._state_inputs): - raise ValueError( - "Length of initial state must match number of state inputs, " - "got: {} vs {}".format(self.get_initial_state(), - self._state_inputs)) - if self._state_inputs and self._seq_lens is None: - raise ValueError( - "seq_lens tensor must be given if state inputs are defined") - - logger.debug("Created {} with loss inputs: {}".format( + logger.debug("Initialized {} with loss inputs: {}".format( self, self._loss_input_dict)) @override(PolicyGraph) @@ -186,18 +192,21 @@ def compute_actions(self, @override(PolicyGraph) def compute_gradients(self, postprocessed_batch): + assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "compute_gradients") fetches = self._build_compute_gradients(builder, postprocessed_batch) return builder.get(fetches) @override(PolicyGraph) def apply_gradients(self, gradients): + assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "apply_gradients") fetches = self._build_apply_gradients(builder, gradients) builder.get(fetches) @override(PolicyGraph) def learn_on_batch(self, postprocessed_batch): + assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "learn_on_batch") fetches = self._build_learn_on_batch(builder, postprocessed_batch) return builder.get(fetches) From b742efec0ec06947bbbcf60d5e520ceabd0886cc Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 16:51:20 -0700 Subject: [PATCH 02/39] wip --- python/ray/rllib/agents/pg/pg.py | 57 +++++++------------ python/ray/rllib/agents/pg/pg_policy_graph.py | 1 - python/ray/rllib/agents/trainer_template.py | 57 +++++++++++++++++++ .../evaluation/dynamic_tf_policy_graph.py | 21 ++++--- 4 files changed, 89 insertions(+), 47 deletions(-) create mode 100644 python/ray/rllib/agents/trainer_template.py diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 8ca36647dcb6..9223fd8e0ceb 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -2,12 +2,12 @@ from __future__ import division from __future__ import print_function -from ray.rllib.agents.trainer import Trainer, with_common_config +from ray.rllib.agents.trainer import with_common_config +from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.agents.pg.pg_policy_graph import postprocess_advantages, \ policy_gradient_loss, make_optimizer from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph from ray.rllib.optimizers import SyncSamplesOptimizer -from 
ray.rllib.utils.annotations import override # yapf: disable # __sphinx_doc_begin__ @@ -22,49 +22,32 @@ # __sphinx_doc_end__ # yapf: enable - PGPolicyGraph = build_tf_graph( - name="PG", + name="PGPolicyGraph", default_config=DEFAULT_CONFIG, postprocess_fn=postprocess_advantages, loss_fn=policy_gradient_loss, make_optimizer=make_optimizer) -class PGTrainer(Trainer): - """Simple policy gradient agent. +def make_policy_optimizer(local_ev, remote_evs, config): + optimizer_config = dict(config["optimizer"], + **{"train_batch_size": config["train_batch_size"]}) + return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) - This is an example agent to show how to implement algorithms in RLlib. - In most cases, you will probably want to use the PPO agent instead. - """ - _name = "PG" - _default_config = DEFAULT_CONFIG - _policy_graph = PGPolicyGraph +def make_policy_graph(config): + if config["use_pytorch"]: + from ray.rllib.agents.pg.torch_pg_policy_graph import \ + PGTorchPolicyGraph + return PGTorchPolicyGraph + else: + return PGPolicyGraph - @override(Trainer) - def _init(self, config, env_creator): - if config["use_pytorch"]: - from ray.rllib.agents.pg.torch_pg_policy_graph import \ - PGTorchPolicyGraph - policy_cls = PGTorchPolicyGraph - else: - policy_cls = self._policy_graph - self.local_evaluator = self.make_local_evaluator( - env_creator, policy_cls) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, policy_cls, config["num_workers"]) - optimizer_config = dict( - config["optimizer"], - **{"train_batch_size": config["train_batch_size"]}) - self.optimizer = SyncSamplesOptimizer( - self.local_evaluator, self.remote_evaluators, **optimizer_config) - @override(Trainer) - def _train(self): - prev_steps = self.optimizer.num_steps_sampled - self.optimizer.step() - result = self.collect_metrics() - result.update(timesteps_this_iter=self.optimizer.num_steps_sampled - - prev_steps) - return result +PGTrainer = build_trainer( + "PG", + default_config=DEFAULT_CONFIG, + default_policy_graph=PGPolicyGraph, + make_policy_graph=make_policy_graph, + make_policy_optimizer=make_policy_optimizer) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index f3bef7c6c296..3058bcf1a412 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function -import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py new file mode 100644 index 000000000000..9ac15ec5fdc2 --- /dev/null +++ b/python/ray/rllib/agents/trainer_template.py @@ -0,0 +1,57 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.agents.trainer import Trainer +from ray.rllib.utils.annotations import override, DeveloperAPI + + +@DeveloperAPI +def build_trainer(name, + default_config, + default_policy_graph, + make_policy_optimizer, + validate_config=None, + make_policy_graph=None, + before_train_step=None, + after_optimizer_step=None, + after_train_result=None): + class trainer_cls(Trainer): + _name = name + _default_config = default_config + _policy_graph = default_policy_graph + + def _init(self, config, env_creator): + if validate_config: + validate_config(config) + 
if make_policy_graph is None: + policy_graph = default_policy_graph + else: + policy_graph = make_policy_graph(config) + self.local_evaluator = self.make_local_evaluator( + env_creator, policy_graph) + self.remote_evaluators = self.make_remote_evaluators( + env_creator, policy_graph, config["num_workers"]) + if make_policy_optimizer: + self.optimizer = make_policy_optimizer( + self.local_evaluator, self.remote_evaluators, config) + + @override(Trainer) + def _train(self): + if before_train_step: + before_train_step(self) + prev_steps = self.optimizer.num_steps_sampled + fetches = self.optimizer.step() + if after_optimizer_step: + after_optimizer_step(self, fetches) + res = self.collect_metrics() + res.update( + timesteps_this_iter=self.optimizer.num_steps_sampled - + prev_steps, + info=res.get("info", {})) + if after_train_result: + after_train_result(self, res) + return res + + trainer_cls.__name__ = name + return trainer_cls diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 2185ca8785eb..151fa3121955 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -14,22 +14,24 @@ tf = try_import_tf() -def build_tf_graph( - name, default_config, postprocess_fn, loss_fn, make_optimizer=None): - +def build_tf_graph(name, + default_config, + postprocess_fn, + loss_fn, + make_optimizer=None): class graph_cls(DynamicTFPolicyGraph): def __init__(self, obs_space, action_space, config): config = dict(default_config, **config) - DynamicTFPolicyGraph.__init__( - self, obs_space, action_space, config, loss_fn) + DynamicTFPolicyGraph.__init__(self, obs_space, action_space, + config, loss_fn) @override(PolicyGraph) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None): - return postprocess_fn( - self, sample_batch, other_agent_batches, episode) + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) @override(TFPolicyGraph) def optimizer(self): @@ -108,10 +110,11 @@ def _initialize_loss_if_needed(self, postprocessed_batch): continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF - shape = (None,) + v.shape[1:] + shape = (None, ) + v.shape[1:] placeholder = tf.placeholder(v.dtype, shape=shape, name=k) unroll_tensors[k] = placeholder - loss_inputs.append((k, placeholder)) # TODO: prune to used only + loss_inputs.append((k, + placeholder)) # TODO: prune to used only loss = self._build_loss(unroll_tensors, self.action_dist) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) From d8a722a48b95fb8f82f3441d5542107afd8d7cd3 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 17:20:24 -0700 Subject: [PATCH 03/39] clean up --- python/ray/rllib/agents/pg/pg.py | 19 +--- python/ray/rllib/agents/pg/pg_policy_graph.py | 24 +++-- .../evaluation/dynamic_tf_policy_graph.py | 99 +++++++++++++++---- .../ray/rllib/evaluation/tf_policy_graph.py | 6 +- python/ray/rllib/utils/tracking_dict.py | 19 ++++ 5 files changed, 124 insertions(+), 43 deletions(-) create mode 100644 python/ray/rllib/utils/tracking_dict.py diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 9223fd8e0ceb..347d404cc42f 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -4,9 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.agents.pg.pg_policy_graph 
import postprocess_advantages, \ - policy_gradient_loss, make_optimizer -from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph +from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph from ray.rllib.optimizers import SyncSamplesOptimizer # yapf: disable @@ -22,21 +20,14 @@ # __sphinx_doc_end__ # yapf: enable -PGPolicyGraph = build_tf_graph( - name="PGPolicyGraph", - default_config=DEFAULT_CONFIG, - postprocess_fn=postprocess_advantages, - loss_fn=policy_gradient_loss, - make_optimizer=make_optimizer) - -def make_policy_optimizer(local_ev, remote_evs, config): +def _make_policy_optimizer(local_ev, remote_evs, config): optimizer_config = dict(config["optimizer"], **{"train_batch_size": config["train_batch_size"]}) return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def make_policy_graph(config): +def _make_policy_graph(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import \ PGTorchPolicyGraph @@ -49,5 +40,5 @@ def make_policy_graph(config): "PG", default_config=DEFAULT_CONFIG, default_policy_graph=PGPolicyGraph, - make_policy_graph=make_policy_graph, - make_policy_optimizer=make_policy_optimizer) + make_policy_graph=_make_policy_graph, + make_policy_optimizer=_make_policy_optimizer) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 3058bcf1a412..c045cbfe2274 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -2,8 +2,10 @@ from __future__ import division from __future__ import print_function +import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing +from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf @@ -11,20 +13,28 @@ # The basic policy gradients loss -def policy_gradient_loss(postprocessed_batch, action_dist): +def _policy_gradient_loss(graph, postprocessed_batch): actions = postprocessed_batch[SampleBatch.ACTIONS] advantages = postprocessed_batch[Postprocessing.ADVANTAGES] - return -tf.reduce_mean(action_dist.logp(actions) * advantages) + return -tf.reduce_mean(graph.action_dist.logp(actions) * advantages) # This adds the "advantages" column to the sample batch. 
-def postprocess_advantages(graph, - sample_batch, - other_agent_batches=None, - episode=None): +def _postprocess_advantages(graph, + sample_batch, + other_agent_batches=None, + episode=None): return compute_advantages( sample_batch, 0.0, graph.config["gamma"], use_gae=False) -def make_optimizer(graph): +def _make_optimizer(graph): return tf.train.AdamOptimizer(learning_rate=graph.config["lr"]) + + +PGPolicyGraph = build_tf_graph( + name="PGPolicyGraph", + get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, + postprocess_fn=_postprocess_advantages, + loss_fn=_policy_gradient_loss, + make_optimizer=_make_optimizer) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 151fa3121955..b2d07288efff 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -2,26 +2,49 @@ from __future__ import division from __future__ import print_function +import logging import numpy as np from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override +from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils import try_import_tf +from ray.rllib.utils.debug import log_once, summarize +from ray.rllib.utils.tracking_dict import UsageTrackingDict tf = try_import_tf() +logger = logging.getLogger(__name__) + +@DeveloperAPI def build_tf_graph(name, - default_config, - postprocess_fn, + get_default_config, loss_fn, + postprocess_fn=None, make_optimizer=None): + """Helper function for creating a dynamic tf policy graph at runtime. + + Arguments: + name (str): name of the graph (e.g., "PGPolicyGraph") + get_default_config (func): function that returns the default config + to merge with any overrides + loss_fn (func): function that returns a loss tensor the policy graph, + and dict of experience tensor placeholders + postprocess_fn (func): optional experience postprocessing function + that takes the same args as PolicyGraph.postprocess_trajectory() + make_optimizer (func): optional function that returns a tf.Optimizer + given the policy graph object + + Returns: + a DynamicTFPolicyGraph instance that uses the specified args + """ + class graph_cls(DynamicTFPolicyGraph): def __init__(self, obs_space, action_space, config): - config = dict(default_config, **config) + config = dict(get_default_config(), **config) DynamicTFPolicyGraph.__init__(self, obs_space, action_space, config, loss_fn) @@ -30,6 +53,8 @@ def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None): + if not postprocess_fn: + return sample_batch return postprocess_fn(self, sample_batch, other_agent_batches, episode) @@ -45,7 +70,22 @@ def optimizer(self): class DynamicTFPolicyGraph(TFPolicyGraph): - def __init__(self, obs_space, action_space, config, loss_fn): + """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. + + The loss function of this class is not initialized until the first batch + of experiences is collected from the environment. At that point we + dynamically generate TF placeholders based on the batch keys and values. + which are passed into the user-defined loss function. 
+ """ + + def __init__(self, + obs_space, + action_space, + config, + loss_fn, + autosetup_model=True, + action_sampler=None, + action_prob=None): self.config = config self._build_loss = loss_fn @@ -57,13 +97,23 @@ def __init__(self, obs_space, action_space, config, loss_fn): prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") # Create the model network and action outputs - self.model = ModelCatalog.get_model({ - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, obs_space, action_space, self.logit_dim, self.config["model"]) - self.action_dist = dist_class(self.model.outputs) + if autosetup_model: + self.model = ModelCatalog.get_model({ + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + }, obs_space, action_space, self.logit_dim, self.config["model"]) + self.action_dist = dist_class(self.model.outputs) + action_sampler = self.action_dist.sample() + action_prob = self.action_dist.sampled_action_prob() + else: + self.model = None + self.action_dist = None + if not action_sampler: + raise ValueError( + "When autosetup_model=False, action_sampler must be " + "passed in to the constructor.") sess = tf.get_default_session() TFPolicyGraph.__init__( @@ -72,8 +122,8 @@ def __init__(self, obs_space, action_space, config, loss_fn): action_space, sess, obs_input=obs, - action_sampler=self.action_dist.sample(), - action_prob=self.action_dist.sampled_action_prob(), + action_sampler=action_sampler, + action_prob=action_prob, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, @@ -87,18 +137,21 @@ def __init__(self, obs_space, action_space, config, loss_fn): @override(PolicyGraph) def get_initial_state(self): - return self.model.state_init + if self.model: + return self.model.state_init + else: + return [] def _initialize_loss_if_needed(self, postprocessed_batch): if self._loss is not None: return # already created with self._sess.graph.as_default(): - unroll_tensors = { + unroll_tensors = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, - } + }) loss_inputs = [ (SampleBatch.PREV_ACTIONS, self._prev_action_input), (SampleBatch.PREV_REWARDS, self._prev_reward_input), @@ -113,10 +166,16 @@ def _initialize_loss_if_needed(self, postprocessed_batch): shape = (None, ) + v.shape[1:] placeholder = tf.placeholder(v.dtype, shape=shape, name=k) unroll_tensors[k] = placeholder - loss_inputs.append((k, - placeholder)) # TODO: prune to used only - loss = self._build_loss(unroll_tensors, self.action_dist) + if log_once("loss_init"): + logger.info( + "Initializing loss function with inputs:\n\n{}\n".format( + summarize(unroll_tensors))) + + loss = self._build_loss(self, unroll_tensors) + for k in unroll_tensors.accessed_keys: + loss_inputs.append((k, unroll_tensors[k])) + TFPolicyGraph._initialize_loss(self, loss, loss_inputs) self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index e5cf697ac32a..8e07e61284b1 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -172,8 +172,10 @@ def _initialize_loss(self, loss, loss_inputs): self._apply_op = self.build_apply_op(self._optimizer, self._grads_and_vars) - logger.debug("Initialized {} with 
loss inputs: {}".format( - self, self._loss_input_dict)) + if log_once("loss_used"): + logger.info( + "These tensors were used in the loss_fn:\n\n{}\n".format( + summarize(self._loss_input_dict))) @override(PolicyGraph) def compute_actions(self, diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py new file mode 100644 index 000000000000..8b65a4708c2a --- /dev/null +++ b/python/ray/rllib/utils/tracking_dict.py @@ -0,0 +1,19 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class UsageTrackingDict(dict): + """Dict that tracks which keys have been accessed. + + We make the simplifying assumption only __getitem__ is used to access + values. + """ + + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + self.accessed_keys = set() + + def __getitem__(self, key): + self.accessed_keys.add(key) + return dict.__getitem__(self, key) From 169493c2d4605cca5de0bca651340efdbb07895b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 17:27:49 -0700 Subject: [PATCH 04/39] fix --- python/ray/rllib/utils/tracking_dict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py index 8b65a4708c2a..d0a04c4d059e 100644 --- a/python/ray/rllib/utils/tracking_dict.py +++ b/python/ray/rllib/utils/tracking_dict.py @@ -5,7 +5,7 @@ class UsageTrackingDict(dict): """Dict that tracks which keys have been accessed. - + We make the simplifying assumption only __getitem__ is used to access values. """ From fee8ec53443ff439ec5c62b174c2ccb8d84d48de Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 18:02:23 -0700 Subject: [PATCH 05/39] document trainer --- python/ray/rllib/agents/pg/pg.py | 4 +-- python/ray/rllib/agents/trainer_template.py | 31 +++++++++++++++++-- .../evaluation/dynamic_tf_policy_graph.py | 2 +- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 347d404cc42f..d9acebd91e80 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -27,7 +27,7 @@ def _make_policy_optimizer(local_ev, remote_evs, config): return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def _make_policy_graph(config): +def _get_policy_graph(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import \ PGTorchPolicyGraph @@ -40,5 +40,5 @@ def _make_policy_graph(config): "PG", default_config=DEFAULT_CONFIG, default_policy_graph=PGPolicyGraph, - make_policy_graph=_make_policy_graph, + get_policy_graph=_get_policy_graph, make_policy_optimizer=_make_policy_optimizer) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 9ac15ec5fdc2..6c99eb6d4acc 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -12,10 +12,35 @@ def build_trainer(name, default_policy_graph, make_policy_optimizer, validate_config=None, - make_policy_graph=None, + get_policy_graph=None, before_train_step=None, after_optimizer_step=None, after_train_result=None): + """Helper function for defining a custom trainer. 
+ + Arguments: + name (str): name of the trainer (e.g., "PPO") + default_config (dict): the default config dict of the algorithm + default_policy_graph (cls): the default PolicyGraph class to use + make_policy_optimizer (func): function that returns a PolicyOptimizer + instance given (local_evaluator, remote_evaluators, config) + validate_config (func): optional callback that checks a given config + for correctness. It may mutate the config as needed. + get_policy_graph (func): optional callback that takes a config and + returns the policy graph class to override the default with + before_train_step (func): optional callback to run before each train() + call. It takes the trainer instance as an argument. + after_optimizer_step (func): optional callback to run after each + step() call to the policy optimizer. It takes the trainer instance + and the policy gradient fetches as arguments. + after_train_result (func): optional callback to run at the end of each + train() call. It takes the trainer instance and result dict as + arguments, and may mutate the result dict as needed. + + Returns: + a Trainer instance that uses the specified args. + """ + class trainer_cls(Trainer): _name = name _default_config = default_config @@ -24,10 +49,10 @@ class trainer_cls(Trainer): def _init(self, config, env_creator): if validate_config: validate_config(config) - if make_policy_graph is None: + if get_policy_graph is None: policy_graph = default_policy_graph else: - policy_graph = make_policy_graph(config) + policy_graph = get_policy_graph(config) self.local_evaluator = self.make_local_evaluator( env_creator, policy_graph) self.remote_evaluators = self.make_remote_evaluators( diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index b2d07288efff..71e06d7f54c4 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -28,7 +28,7 @@ def build_tf_graph(name, """Helper function for creating a dynamic tf policy graph at runtime. 
Arguments: - name (str): name of the graph (e.g., "PGPolicyGraph") + name (str): name of the graph (e.g., "PPOPolicyGraph") get_default_config (func): function that returns the default config to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, From 03b602166b701a6cb8f9f4fe80f4aad7777190e9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 23:07:38 -0700 Subject: [PATCH 06/39] wip --- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 211 ++++++------------ .../evaluation/dynamic_tf_policy_graph.py | 64 +++++- .../ray/rllib/evaluation/tf_policy_graph.py | 5 + .../rllib/optimizers/multi_gpu_optimizer.py | 18 +- 4 files changed, 142 insertions(+), 156 deletions(-) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 61aced1db740..9a7c46e2854f 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -7,6 +7,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing +from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch @@ -107,6 +108,50 @@ def reduce_mean_valid(t): self.loss = loss +def loss_fn(graph, postprocessed_batch): + if graph.model.state_in: + max_seq_len = tf.reduce_max(graph.model.seq_lens) + mask = tf.sequence_mask(graph.model.seq_lens, max_seq_len) + mask = tf.reshape(mask, [-1]) + else: + mask = tf.ones_like( + postprocessed_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) + + loss_obj = PPOLoss( + graph.action_space, + postprocessed_batch[Postprocessing.VALUE_TARGETS], + postprocessed_batch[Postprocessing.ADVANTAGES], + postprocessed_batch[SampleBatch.ACTIONS], + postprocessed_batch[BEHAVIOUR_LOGITS], + postprocessed_batch[SampleBatch.VF_PREDS], + graph.action_dist, + graph.value_function, + graph.kl_coeff, + mask, + entropy_coeff=graph.config["entropy_coeff"], + clip_param=graph.config["clip_param"], + vf_clip_param=graph.config["vf_clip_param"], + vf_loss_coeff=graph.config["vf_loss_coeff"], + use_gae=graph.config["use_gae"]) + + graph.explained_variance = explained_variance( + postprocessed_batch[Postprocessing.VALUE_TARGETS], + graph.value_function) + + graph.stats_fetches = { + "cur_kl_coeff": graph.kl_coeff, + "cur_lr": tf.cast(graph.cur_lr, tf.float64), + "total_loss": loss_obj.loss, + "policy_loss": loss_obj.mean_policy_loss, + "vf_loss": loss_obj.mean_vf_loss, + "vf_explained_var": graph.explained_variance, + "kl": loss_obj.mean_kl, + "entropy": loss_obj.mean_entropy, + } + + return loss_obj.loss + + class PPOPostprocessing(object): """Adds the policy logits, VF preds, and advantages to the trajectory.""" @@ -115,7 +160,7 @@ def extra_compute_action_fetches(self): return dict( TFPolicyGraph.extra_compute_action_fetches(self), **{ SampleBatch.VF_PREDS: self.value_function, - BEHAVIOUR_LOGITS: self.logits + BEHAVIOUR_LOGITS: self.model.outputs, }) @override(PolicyGraph) @@ -143,83 +188,19 @@ def postprocess_trajectory(self, return batch -class PPOPolicyGraph(LearningRateSchedule, PPOPostprocessing, TFPolicyGraph): +class PPOPolicyGraph( + LearningRateSchedule, PPOPostprocessing, DynamicTFPolicyGraph): + def __init__(self, observation_space, action_space, config, existing_inputs=None): - """ - Arguments: - observation_space: Environment observation space 
specification. - action_space: Environment action space specification. - config (dict): Configuration values for PPO graph. - existing_inputs (list): Optional list of tuples that specify the - placeholders upon which the graph should be built upon. - """ config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) - self.sess = tf.get_default_session() - self.action_space = action_space - self.config = config - self.kl_coeff_val = self.config["kl_coeff"] - self.kl_target = self.config["kl_target"] - dist_cls, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - - if existing_inputs: - obs_ph, value_targets_ph, adv_ph, act_ph, \ - logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \ - existing_inputs[:8] - existing_state_in = existing_inputs[8:-1] - existing_seq_lens = existing_inputs[-1] - else: - obs_ph = tf.placeholder( - tf.float32, - name="obs", - shape=(None, ) + observation_space.shape) - adv_ph = tf.placeholder( - tf.float32, name="advantages", shape=(None, )) - act_ph = ModelCatalog.get_action_placeholder(action_space) - logits_ph = tf.placeholder( - tf.float32, name="logits", shape=(None, logit_dim)) - vf_preds_ph = tf.placeholder( - tf.float32, name="vf_preds", shape=(None, )) - value_targets_ph = tf.placeholder( - tf.float32, name="value_targets", shape=(None, )) - prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) - prev_rewards_ph = tf.placeholder( - tf.float32, [None], name="prev_reward") - existing_state_in = None - existing_seq_lens = None - self.observations = obs_ph - self.prev_actions = prev_actions_ph - self.prev_rewards = prev_rewards_ph - - self.loss_in = [ - (SampleBatch.CUR_OBS, obs_ph), - (Postprocessing.VALUE_TARGETS, value_targets_ph), - (Postprocessing.ADVANTAGES, adv_ph), - (SampleBatch.ACTIONS, act_ph), - (BEHAVIOUR_LOGITS, logits_ph), - (SampleBatch.VF_PREDS, vf_preds_ph), - (SampleBatch.PREV_ACTIONS, prev_actions_ph), - (SampleBatch.PREV_REWARDS, prev_rewards_ph), - ] - self.model = ModelCatalog.get_model( - { - "obs": obs_ph, - "prev_actions": prev_actions_ph, - "prev_rewards": prev_rewards_ph, - "is_training": self._get_is_training_placeholder(), - }, - observation_space, - action_space, - logit_dim, - self.config["model"], - state_in=existing_state_in, - seq_lens=existing_seq_lens) # KL Coefficient + self.kl_coeff_val = config["kl_coeff"] + self.kl_target = config["kl_target"] self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), name="kl_coeff", @@ -227,9 +208,10 @@ def __init__(self, trainable=False, dtype=tf.float32) - self.logits = self.model.outputs - curr_action_dist = dist_cls(self.logits) - self.sampler = curr_action_dist.sample() + DynamicTFPolicyGraph.__init__( + self, observation_space, action_space, config, loss_fn, + existing_inputs=existing_inputs) + if self.config["use_gae"]: if self.config["vf_share_layers"]: self.value_function = self.model.value_function() @@ -249,81 +231,18 @@ def __init__(self, "value_function() method.") with tf.variable_scope("value_function"): self.value_function = ModelCatalog.get_model({ - "obs": obs_ph, - "prev_actions": prev_actions_ph, - "prev_rewards": prev_rewards_ph, + "obs": self._obs_input, + "prev_actions": self._prev_action_input, + "prev_rewards": self._prev_reward_input, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: - self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) - - if 
self.model.state_in: - max_seq_len = tf.reduce_max(self.model.seq_lens) - mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) - mask = tf.reshape(mask, [-1]) - else: - mask = tf.ones_like(adv_ph, dtype=tf.bool) - - self.loss_obj = PPOLoss( - action_space, - value_targets_ph, - adv_ph, - act_ph, - logits_ph, - vf_preds_ph, - curr_action_dist, - self.value_function, - self.kl_coeff, - mask, - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"], - vf_clip_param=self.config["vf_clip_param"], - vf_loss_coeff=self.config["vf_loss_coeff"], - use_gae=self.config["use_gae"]) + self.value_function = tf.zeros(shape=tf.shape(self._obs_input)[:1]) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) - TFPolicyGraph.__init__( - self, - observation_space, - action_space, - self.sess, - obs_input=obs_ph, - action_sampler=self.sampler, - action_prob=curr_action_dist.sampled_action_prob(), - loss=self.loss_obj.loss, - model=self.model, - loss_inputs=self.loss_in, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, - prev_action_input=prev_actions_ph, - prev_reward_input=prev_rewards_ph, - seq_lens=self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"]) - - self.sess.run(tf.global_variables_initializer()) - self.explained_variance = explained_variance(value_targets_ph, - self.value_function) - self.stats_fetches = { - "cur_kl_coeff": self.kl_coeff, - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "total_loss": self.loss_obj.loss, - "policy_loss": self.loss_obj.mean_policy_loss, - "vf_loss": self.loss_obj.mean_vf_loss, - "vf_explained_var": self.explained_variance, - "kl": self.loss_obj.mean_kl, - "entropy": self.loss_obj.mean_entropy - } - - @override(TFPolicyGraph) - def copy(self, existing_inputs): - """Creates a copy of self using existing input placeholders.""" - return PPOPolicyGraph( - self.observation_space, - self.action_space, - self.config, - existing_inputs=existing_inputs) + self._sess.run(tf.global_variables_initializer()) @override(TFPolicyGraph) def gradients(self, optimizer, loss): @@ -352,19 +271,19 @@ def update_kl(self, sampled_kl): self.kl_coeff_val *= 1.5 elif sampled_kl < 0.5 * self.kl_target: self.kl_coeff_val *= 0.5 - self.kl_coeff.load(self.kl_coeff_val, session=self.sess) + self.kl_coeff.load(self.kl_coeff_val, session=self._sess) return self.kl_coeff_val def _value(self, ob, prev_action, prev_reward, *args): feed_dict = { - self.observations: [ob], - self.prev_actions: [prev_action], - self.prev_rewards: [prev_reward], + self._obs_input: [ob], + self._prev_action_input: [prev_action], + self._prev_reward_input: [prev_reward], self.model.seq_lens: [1] } assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): feed_dict[k] = v - vf = self.sess.run(self.value_function, feed_dict) + vf = self._sess.run(self.value_function, feed_dict) return vf[0] diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 71e06d7f54c4..9141c51e7681 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -24,7 +24,8 @@ def build_tf_graph(name, get_default_config, loss_fn, postprocess_fn=None, - make_optimizer=None): + make_optimizer=None, + extra_action_fetches=None): """Helper function for creating a dynamic tf policy graph at runtime. 
Arguments: @@ -37,6 +38,8 @@ def build_tf_graph(name, that takes the same args as PolicyGraph.postprocess_trajectory() make_optimizer (func): optional function that returns a tf.Optimizer given the policy graph object + extra_action_fetches (func): optional function that returns + a dict of TF fetches given the policy graph object Returns: a DynamicTFPolicyGraph instance that uses the specified args @@ -47,6 +50,11 @@ def __init__(self, obs_space, action_space, config): config = dict(get_default_config(), **config) DynamicTFPolicyGraph.__init__(self, obs_space, action_space, config, loss_fn) + if build_extra_action_fetches is None: + self._extra_action_fetches = {} + else: + self._extra_action_fetches = ( + build_extra_action_fetches(self)) @override(PolicyGraph) def postprocess_trajectory(self, @@ -65,6 +73,12 @@ def optimizer(self): else: return TFPolicyGraph.optimizer(self) + @override(TFPolicyGraph) + def extra_compute_action_fetches(self): + return dict( + TFPolicyGraph.extra_compute_action_fetches(self), + **self._extra_action_fetches) + graph_cls.__name__ = name return graph_cls @@ -85,19 +99,28 @@ def __init__(self, loss_fn, autosetup_model=True, action_sampler=None, - action_prob=None): + action_prob=None, + existing_inputs=None): self.config = config + self.autosetup_model = autosetup_model self._build_loss = loss_fn # Setup standard placeholders - obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) - dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") + if existing_inputs is not None: + obs = existing_inputs[SampleBatch.CUR_OBS] + prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS] + prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS] + else: + obs = tf.placeholder( + tf.float32, shape=[None] + list(obs_space.shape), + name="observation") + prev_actions = ModelCatalog.get_action_placeholder(action_space) + prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") # Create the model network and action outputs if autosetup_model: + dist_class, self.logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) self.model = ModelCatalog.get_model({ "obs": obs, "prev_actions": prev_actions, @@ -108,6 +131,7 @@ def __init__(self, action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() else: + self.logit_dim = None self.model = None self.action_dist = None if not action_sampler: @@ -135,6 +159,31 @@ def __init__(self, max_seq_len=config["model"]["max_seq_len"]) sess.run(tf.global_variables_initializer()) + @override(TFPolicyGraph) + def copy(self, existing_inputs): + """Creates a copy of self using existing input placeholders.""" + assert self._loss is not None, "Cannot copy graph before loss init" + if len(self._loss_inputs) != len(existing_inputs): + raise ValueError( + "Tensor list mismatch", self._loss_inputs, + existing_inputs) + for i, (k, v) in enumerate(self._loss_inputs): + if v.shape.as_list() != existing_inputs[i].shape.as_list(): + raise ValueError( + "Tensor shape mismatch", i, k, + v.shape, existing_inputs[i].shape) + input_dict = { + k: existing_inputs[i] for i, (k, _) in enumerate(self._loss_inputs) + } + instance = self.__class__( + self.observation_space, self.action_space, self.config, + existing_inputs=input_dict) + loss = instance._build_loss(instance, input_dict) + 
TFPolicyGraph._initialize_loss( + instance, loss, + [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) + return instance + @override(PolicyGraph) def get_initial_state(self): if self.model: @@ -177,7 +226,6 @@ def _initialize_loss_if_needed(self, postprocessed_batch): loss_inputs.append((k, unroll_tensors[k])) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) - self._sess.run(tf.global_variables_initializer()) @override(PolicyGraph) def compute_gradients(self, postprocessed_batch): diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 8e07e61284b1..d988a4be7e4f 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -125,6 +125,9 @@ def __init__(self, self._initialize_loss(loss, loss_inputs) else: self._loss = None + # TODO(ekl) what is the right way to handle pre-init vars? + self._variables = ray.experimental.tf_utils.TensorFlowVariables( + self._sampler, self._sess) if len(self._state_inputs) != len(self._state_outputs): raise ValueError( @@ -177,6 +180,8 @@ def _initialize_loss(self, loss, loss_inputs): "These tensors were used in the loss_fn:\n\n{}\n".format( summarize(self._loss_input_dict))) + self._sess.run(tf.global_variables_initializer()) + @override(PolicyGraph) def compute_actions(self, obs_batch, diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 45df865e43ff..79340769f284 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -8,6 +8,7 @@ from collections import defaultdict import ray +from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer @@ -92,9 +93,18 @@ def __init__(self, # reuse is set to AUTO_REUSE because Adam nodes are created after # all of the device copies are created. 
self.optimizers = {} + + def _initialize_optimizers_as_needed(self, samples): with self.local_evaluator.tf_sess.graph.as_default(): with self.local_evaluator.tf_sess.as_default(): - for policy_id, policy in self.policies.items(): + for policy_id, sample_batch in samples.policy_batches.items(): + if policy_id in self.optimizers: + continue # already initialized + + policy = self.policies[policy_id] + if isinstance(policy, DynamicTFPolicyGraph): + policy._initialize_loss_if_needed(sample_batch) + with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE): if policy._state_inputs: rnn_inputs = policy._state_inputs + [ @@ -110,7 +120,9 @@ def __init__(self, self.per_device_batch_size, policy.copy)) self.sess = self.local_evaluator.tf_sess - self.sess.run(tf.global_variables_initializer()) + self.sess.run(tf.global_variables_initializer()) # TODO(ekl) how to deal with this + + self.optimizers_initialized = True @override(PolicyOptimizer) def step(self): @@ -148,6 +160,8 @@ def step(self): DEFAULT_POLICY_ID: samples }, samples.count) + self._initialize_optimizers_as_needed(samples) + for policy_id, policy in self.policies.items(): if policy_id not in samples.policy_batches: continue From 48bdcf4a2f2f389327937ad250ce0dd1332c7d83 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 23:24:36 -0700 Subject: [PATCH 07/39] initialize the graph using a fake batch --- .../evaluation/dynamic_tf_policy_graph.py | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 71e06d7f54c4..2dab7c7f8c2b 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -72,10 +72,11 @@ def optimizer(self): class DynamicTFPolicyGraph(TFPolicyGraph): """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. - The loss function of this class is not initialized until the first batch - of experiences is collected from the environment. At that point we - dynamically generate TF placeholders based on the batch keys and values. - which are passed into the user-defined loss function. + Initialization of this class occurs in two phases. + * Phase 1: the model is created and model variables are initialized. + * Phase 2: a fake batch of data is created, sent to the trajectory + postprocessor, and then used to create placeholders for the loss + function. The loss function is initialiezd with these placeholders. 
""" def __init__(self, @@ -133,6 +134,7 @@ def __init__(self, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) + self._initialize_loss_if_needed() sess.run(tf.global_variables_initializer()) @override(PolicyGraph) @@ -142,10 +144,28 @@ def get_initial_state(self): else: return [] - def _initialize_loss_if_needed(self, postprocessed_batch): + def _initialize_loss_if_needed(self): if self._loss is not None: return # already created + def fake_array(tensor): + shape = tensor.shape.as_list() + shape[0] = 1 + return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype) + + fake_batch = { + SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), + SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), + SampleBatch.CUR_OBS: fake_array(self._obs_input), + SampleBatch.ACTIONS: fake_array(self._sampler), + SampleBatch.REWARDS: np.array([0], dtype=np.int32), + } + for k, v in self.extra_compute_action_fetches().items(): + fake_batch[k] = fake_array(v) + + postprocessed_batch = self.postprocess_trajectory( + SampleBatch(fake_batch)) + with self._sess.graph.as_default(): unroll_tensors = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, @@ -177,14 +197,3 @@ def _initialize_loss_if_needed(self, postprocessed_batch): loss_inputs.append((k, unroll_tensors[k])) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) - self._sess.run(tf.global_variables_initializer()) - - @override(PolicyGraph) - def compute_gradients(self, postprocessed_batch): - self._initialize_loss_if_needed(postprocessed_batch) - return TFPolicyGraph.compute_gradients(self, postprocessed_batch) - - @override(PolicyGraph) - def learn_on_batch(self, postprocessed_batch): - self._initialize_loss_if_needed(postprocessed_batch) - return TFPolicyGraph.learn_on_batch(self, postprocessed_batch) From 18b290bff8aa6d02aea89b629ff0089a0292315b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 23:30:01 -0700 Subject: [PATCH 08/39] clean up dynamic init --- .../ray/rllib/evaluation/dynamic_tf_policy_graph.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 2dab7c7f8c2b..14964348a2e8 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -116,6 +116,7 @@ def __init__(self, "When autosetup_model=False, action_sampler must be " "passed in to the constructor.") + # Phase 1 init sess = tf.get_default_session() TFPolicyGraph.__init__( self, @@ -134,7 +135,9 @@ def __init__(self, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) - self._initialize_loss_if_needed() + + # Phase 2 init + self._initialize_loss_dynamic() sess.run(tf.global_variables_initializer()) @override(PolicyGraph) @@ -144,10 +147,7 @@ def get_initial_state(self): else: return [] - def _initialize_loss_if_needed(self): - if self._loss is not None: - return # already created - + def _initialize_loss_dynamic(self): def fake_array(tensor): shape = tensor.shape.as_list() shape[0] = 1 @@ -157,8 +157,10 @@ def fake_array(tensor): SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), SampleBatch.CUR_OBS: fake_array(self._obs_input), + SampleBatch.NEXT_OBS: fake_array(self._obs_input), SampleBatch.ACTIONS: fake_array(self._sampler), 
SampleBatch.REWARDS: np.array([0], dtype=np.int32), + SampleBatch.DONES: np.array([False], dtype=np.bool), } for k, v in self.extra_compute_action_fetches().items(): fake_batch[k] = fake_array(v) From 26367032e64652a905c5a3aa4b4111d80a43c8a8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 00:32:55 -0700 Subject: [PATCH 09/39] wip --- python/ray/rllib/optimizers/multi_gpu_optimizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 23ee1833b9f0..45df865e43ff 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -6,7 +6,6 @@ import math import numpy as np from collections import defaultdict -import tensorflow as tf import ray from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY @@ -19,6 +18,9 @@ from ray.rllib.utils.timer import TimerStat from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ MultiAgentBatch +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() logger = logging.getLogger(__name__) From d239a79efb0a49ec97ecef8c69297a990f600b34 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 00:34:29 -0700 Subject: [PATCH 10/39] spelling --- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 7f298d3abb09..f678df9cd34c 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -90,7 +90,7 @@ class DynamicTFPolicyGraph(TFPolicyGraph): * Phase 1: the model is created and model variables are initialized. * Phase 2: a fake batch of data is created, sent to the trajectory postprocessor, and then used to create placeholders for the loss - function. The loss function is initialiezd with these placeholders. + function. The loss function is initialized with these placeholders. 
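The placeholder-tracing idea behind phase 2 can be sketched outside of RLlib in a few lines of TF 1.x. Everything below (TrackingDict, make_loss_inputs, the toy loss) is illustrative and not part of this patch; the point is simply that one placeholder is created per batch column, and only the columns the loss function actually reads become loss inputs.

import numpy as np
import tensorflow as tf  # TF 1.x API


class TrackingDict(dict):
    """Toy stand-in for UsageTrackingDict: records which keys were read."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed_keys = set()

    def __getitem__(self, key):
        self.accessed_keys.add(key)
        return dict.__getitem__(self, key)


def make_loss_inputs(fake_batch, loss_fn):
    # One placeholder per column of the (postprocessed) fake batch.
    tensors = TrackingDict({
        k: tf.placeholder(tf.as_dtype(v.dtype), (None, ) + v.shape[1:], name=k)
        for k, v in fake_batch.items()
    })
    loss = loss_fn(tensors)
    # Only the columns the loss actually touched need to be fed at train time.
    return loss, [(k, tensors[k]) for k in sorted(tensors.accessed_keys)]


fake_batch = {
    "obs": np.zeros((1, 4), dtype=np.float32),
    "actions": np.zeros((1, ), dtype=np.int64),
    "advantages": np.zeros((1, ), dtype=np.float32),
}
loss, loss_inputs = make_loss_inputs(
    fake_batch, lambda t: -tf.reduce_mean(t["advantages"]))
print([k for k, _ in loss_inputs])  # ['advantages'] -- 'obs' and 'actions' were never read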
""" def __init__(self, From 3ff0d0883b8ccdb9a2363e0323f348a2acccdfde Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 12:17:18 -0700 Subject: [PATCH 11/39] use builder for ppo pol graph --- python/ray/rllib/agents/pg/pg_policy_graph.py | 20 +- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 252 +++++++++--------- .../evaluation/dynamic_tf_policy_graph.py | 95 ++----- .../ray/rllib/evaluation/tf_policy_graph.py | 4 +- .../evaluation/tf_policy_graph_template.py | 120 +++++++++ python/ray/rllib/optimizers/multi_gpu_impl.py | 2 +- .../rllib/optimizers/multi_gpu_optimizer.py | 2 +- 7 files changed, 276 insertions(+), 219 deletions(-) create mode 100644 python/ray/rllib/evaluation/tf_policy_graph_template.py diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index c045cbfe2274..4858d4a5e87f 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf @@ -13,23 +13,23 @@ # The basic policy gradients loss -def _policy_gradient_loss(graph, postprocessed_batch): - actions = postprocessed_batch[SampleBatch.ACTIONS] - advantages = postprocessed_batch[Postprocessing.ADVANTAGES] - return -tf.reduce_mean(graph.action_dist.logp(actions) * advantages) +def _policy_gradient_loss(policy, batch_tensors): + actions = batch_tensors[SampleBatch.ACTIONS] + advantages = batch_tensors[Postprocessing.ADVANTAGES] + return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) # This adds the "advantages" column to the sample batch. 
-def _postprocess_advantages(graph, +def _postprocess_advantages(policy, sample_batch, other_agent_batches=None, episode=None): return compute_advantages( - sample_batch, 0.0, graph.config["gamma"], use_gae=False) + sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def _make_optimizer(graph): - return tf.train.AdamOptimizer(learning_rate=graph.config["lr"]) +def _make_optimizer(policy): + return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) PGPolicyGraph = build_tf_graph( @@ -37,4 +37,4 @@ def _make_optimizer(graph): get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=_postprocess_advantages, loss_fn=_policy_gradient_loss, - make_optimizer=_make_optimizer) + optimizer_fn=_make_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index b34ebc45482a..b322dba122b0 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -7,14 +7,10 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ - LearningRateSchedule +from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -108,95 +104,103 @@ def reduce_mean_valid(t): self.loss = loss -def _build_ppo_loss(graph, postprocessed_batch): - if graph.model.state_in: - max_seq_len = tf.reduce_max(graph.model.seq_lens) - mask = tf.sequence_mask(graph.model.seq_lens, max_seq_len) +def _build_ppo_loss(policy, batch_tensors): + if policy.model.state_in: + max_seq_len = tf.reduce_max(policy.model.seq_lens) + mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like( - postprocessed_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) - - loss_obj = PPOLoss( - graph.action_space, - postprocessed_batch[Postprocessing.VALUE_TARGETS], - postprocessed_batch[Postprocessing.ADVANTAGES], - postprocessed_batch[SampleBatch.ACTIONS], - postprocessed_batch[BEHAVIOUR_LOGITS], - postprocessed_batch[SampleBatch.VF_PREDS], - graph.action_dist, - graph.value_function, - graph.kl_coeff, + batch_tensors[Postprocessing.ADVANTAGES], dtype=tf.bool) + + policy.loss_obj = PPOLoss( + policy.action_space, + batch_tensors[Postprocessing.VALUE_TARGETS], + batch_tensors[Postprocessing.ADVANTAGES], + batch_tensors[SampleBatch.ACTIONS], + batch_tensors[BEHAVIOUR_LOGITS], + batch_tensors[SampleBatch.VF_PREDS], + policy.action_dist, + policy.value_function, + policy.kl_coeff, mask, - entropy_coeff=graph.config["entropy_coeff"], - clip_param=graph.config["clip_param"], - vf_clip_param=graph.config["vf_clip_param"], - vf_loss_coeff=graph.config["vf_loss_coeff"], - use_gae=graph.config["use_gae"]) - - graph.explained_variance = explained_variance( - postprocessed_batch[Postprocessing.VALUE_TARGETS], - graph.value_function) - - graph.stats_fetches = { - "cur_kl_coeff": graph.kl_coeff, - "cur_lr": 
tf.cast(graph.cur_lr, tf.float64), - "total_loss": loss_obj.loss, - "policy_loss": loss_obj.mean_policy_loss, - "vf_loss": loss_obj.mean_vf_loss, - "vf_explained_var": graph.explained_variance, - "kl": loss_obj.mean_kl, - "entropy": loss_obj.mean_entropy, + entropy_coeff=policy.config["entropy_coeff"], + clip_param=policy.config["clip_param"], + vf_clip_param=policy.config["vf_clip_param"], + vf_loss_coeff=policy.config["vf_loss_coeff"], + use_gae=policy.config["use_gae"]) + + return policy.loss_obj.loss + + +def _build_ppo_stats(policy, batch_tensors): + policy.explained_variance = explained_variance( + batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) + + stats_fetches = { + "cur_kl_coeff": policy.kl_coeff, + "cur_lr": tf.cast(policy.cur_lr, tf.float64), + "total_loss": policy.loss_obj.loss, + "policy_loss": policy.loss_obj.mean_policy_loss, + "vf_loss": policy.loss_obj.mean_vf_loss, + "vf_explained_var": policy.explained_variance, + "kl": policy.loss_obj.mean_kl, + "entropy": policy.loss_obj.mean_entropy, } - return loss_obj.loss + return stats_fetches -class PPOPostprocessing(object): +def _build_ppo_action_fetches(policy): + """Adds value function and logits outputs to experience batches.""" + return { + SampleBatch.VF_PREDS: policy.value_function, + BEHAVIOUR_LOGITS: policy.model.outputs, + } + + +def _postprocess_ppo_gae(policy, + sample_batch, + other_agent_batches=None, + episode=None): """Adds the policy logits, VF preds, and advantages to the trajectory.""" - @override(TFPolicyGraph) - def extra_compute_action_fetches(self): - return dict( - TFPolicyGraph.extra_compute_action_fetches(self), **{ - SampleBatch.VF_PREDS: self.value_function, - BEHAVIOUR_LOGITS: self.model.outputs, - }) - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - completed = sample_batch["dones"][-1] - if completed: - last_r = 0.0 - else: - next_state = [] - for i in range(len(self.model.state_in)): - next_state.append([sample_batch["state_out_{}".format(i)][-1]]) - last_r = self._value(sample_batch[SampleBatch.NEXT_OBS][-1], - sample_batch[SampleBatch.ACTIONS][-1], - sample_batch[SampleBatch.REWARDS][-1], - *next_state) - batch = compute_advantages( - sample_batch, - last_r, - self.config["gamma"], - self.config["lambda"], - use_gae=self.config["use_gae"]) - return batch - - -class PPOPolicyGraph(LearningRateSchedule, PPOPostprocessing, - DynamicTFPolicyGraph): - def __init__(self, - observation_space, - action_space, - config, - existing_inputs=None): - config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + next_state = [] + for i in range(len(policy.model.state_in)): + next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1], + sample_batch[SampleBatch.ACTIONS][-1], + sample_batch[SampleBatch.REWARDS][-1], + *next_state) + batch = compute_advantages( + sample_batch, + last_r, + policy.config["gamma"], + policy.config["lambda"], + use_gae=policy.config["use_gae"]) + return batch + + +def _build_ppo_gradients(policy, optimizer, loss): + if policy.config["grad_clip"] is not None: + policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + tf.get_variable_scope().name) + grads = tf.gradients(loss, policy.var_list) + policy.grads, _ = tf.clip_by_global_norm(grads, + policy.config["grad_clip"]) + clipped_grads = list(zip(policy.grads, 
policy.var_list)) + return clipped_grads + else: + return optimizer.compute_gradients( + loss, colocate_gradients_with_ops=True) + +class KLCoeffMixin(object): + def __init__(self, config): # KL Coefficient self.kl_coeff_val = config["kl_coeff"] self.kl_target = config["kl_target"] @@ -207,20 +211,22 @@ def __init__(self, trainable=False, dtype=tf.float32) - DynamicTFPolicyGraph.__init__( - self, - observation_space, - action_space, - config, - _build_ppo_loss, - existing_inputs=existing_inputs, - autoinit_loss=False) - - if self.config["use_gae"]: - if self.config["vf_share_layers"]: + def update_kl(self, sampled_kl): + if sampled_kl > 2.0 * self.kl_target: + self.kl_coeff_val *= 1.5 + elif sampled_kl < 0.5 * self.kl_target: + self.kl_coeff_val *= 0.5 + self.kl_coeff.load(self.kl_coeff_val, session=self._sess) + return self.kl_coeff_val + + +class ValueNetworkMixin(object): + def __init__(self, obs_space, action_space, config): + if config["use_gae"]: + if config["vf_share_layers"]: self.value_function = self.model.value_function() else: - vf_config = self.config["model"].copy() + vf_config = config["model"].copy() # Do not split the last layer of the value function into # mean parameters and standard deviation parameters and # do not make the standard deviations free variables. @@ -239,45 +245,11 @@ def __init__(self, "prev_actions": self._prev_action_input, "prev_rewards": self._prev_reward_input, "is_training": self._get_is_training_placeholder(), - }, observation_space, action_space, 1, vf_config).outputs + }, obs_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.zeros(shape=tf.shape(self._obs_input)[:1]) - LearningRateSchedule.__init__(self, self.config["lr"], - self.config["lr_schedule"]) - self._initialize_loss() - - @override(TFPolicyGraph) - def gradients(self, optimizer, loss): - if self.config["grad_clip"] is not None: - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - grads = tf.gradients(loss, self.var_list) - self.grads, _ = tf.clip_by_global_norm(grads, - self.config["grad_clip"]) - clipped_grads = list(zip(self.grads, self.var_list)) - return clipped_grads - else: - return optimizer.compute_gradients( - loss, colocate_gradients_with_ops=True) - - @override(PolicyGraph) - def get_initial_state(self): - return self.model.state_init - - @override(TFPolicyGraph) - def extra_compute_grad_fetches(self): - return {LEARNER_STATS_KEY: self.stats_fetches} - - def update_kl(self, sampled_kl): - if sampled_kl > 2.0 * self.kl_target: - self.kl_coeff_val *= 1.5 - elif sampled_kl < 0.5 * self.kl_target: - self.kl_coeff_val *= 0.5 - self.kl_coeff.load(self.kl_coeff_val, session=self._sess) - return self.kl_coeff_val - def _value(self, ob, prev_action, prev_reward, *args): feed_dict = { self._obs_input: [ob], @@ -291,3 +263,21 @@ def _value(self, ob, prev_action, prev_reward, *args): feed_dict[k] = v vf = self._sess.run(self.value_function, feed_dict) return vf[0] + + +def _setup_mixins(policy, obs_space, action_space, config): + ValueNetworkMixin.__init__(policy, obs_space, action_space, config) + KLCoeffMixin.__init__(policy, config) + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + + +PPOPolicyGraph = build_tf_graph( + name="PPOPolicyGraph", + get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, + loss_fn=_build_ppo_loss, + stats_fn=_build_ppo_stats, + extra_action_fetches_fn=_build_ppo_action_fetches, + 
postprocess_fn=_postprocess_ppo_gae, + gradients_fn=_build_ppo_gradients, + pre_loss_init_fn=_setup_mixins, + mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index f678df9cd34c..49da52a25636 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -21,68 +21,6 @@ @DeveloperAPI -def build_tf_graph(name, - get_default_config, - loss_fn, - postprocess_fn=None, - make_optimizer=None, - make_extra_action_fetches=None): - """Helper function for creating a dynamic tf policy graph at runtime. - - Arguments: - name (str): name of the graph (e.g., "PPOPolicyGraph") - get_default_config (func): function that returns the default config - to merge with any overrides - loss_fn (func): function that returns a loss tensor the policy graph, - and dict of experience tensor placeholders - postprocess_fn (func): optional experience postprocessing function - that takes the same args as PolicyGraph.postprocess_trajectory() - make_optimizer (func): optional function that returns a tf.Optimizer - given the policy graph object - make_extra_action_fetches (func): optional function that returns - a dict of TF fetches given the policy graph object - - Returns: - a DynamicTFPolicyGraph instance that uses the specified args - """ - - class graph_cls(DynamicTFPolicyGraph): - def __init__(self, obs_space, action_space, config): - config = dict(get_default_config(), **config) - if make_extra_action_fetches is None: - self._extra_action_fetches = {} - else: - self._extra_action_fetches = make_extra_action_fetches(self) - DynamicTFPolicyGraph.__init__(self, obs_space, action_space, - config, loss_fn) - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - if not postprocess_fn: - return sample_batch - return postprocess_fn(self, sample_batch, other_agent_batches, - episode) - - @override(TFPolicyGraph) - def optimizer(self): - if make_optimizer: - return make_optimizer(self) - else: - return TFPolicyGraph.optimizer(self) - - @override(TFPolicyGraph) - def extra_compute_action_fetches(self): - return dict( - TFPolicyGraph.extra_compute_action_fetches(self), - **self._extra_action_fetches) - - graph_cls.__name__ = name - return graph_cls - - class DynamicTFPolicyGraph(TFPolicyGraph): """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. 
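A note on the mixins=[...] argument used in the PPO conversion above: the template file added later in this patch builds the generated class as class graph_cls(*mixins, DynamicTFPolicyGraph), so the listed mixins sit ahead of the dynamic graph in the MRO and their method overrides take precedence. A tiny pure-Python sketch of that ordering (all names below are made up for illustration):

class BasePolicyGraph(object):
    def optimizer(self):
        return "plain Adam"


class LRScheduleMixin(object):
    def optimizer(self):
        return "Adam with schedule"


# Mirrors `class graph_cls(*mixins, DynamicTFPolicyGraph)` in the template.
GeneratedGraph = type("GeneratedPolicyGraph",
                      (LRScheduleMixin, BasePolicyGraph), {})

assert GeneratedGraph().optimizer() == "Adam with schedule"
assert [c.__name__ for c in GeneratedGraph.__mro__] == [
    "GeneratedPolicyGraph", "LRScheduleMixin", "BasePolicyGraph", "object"]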
@@ -98,14 +36,16 @@ def __init__(self, action_space, config, loss_fn, + stats_fn=None, autosetup_model=True, - autoinit_loss=True, + pre_loss_init_fn=None, action_sampler=None, action_prob=None, existing_inputs=None): self.config = config self.autosetup_model = autosetup_model - self._build_loss = loss_fn + self._loss_fn = loss_fn + self._stats_fn = stats_fn # Setup standard placeholders if existing_inputs is not None: @@ -182,7 +122,9 @@ def __init__(self, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) - if autoinit_loss: + # Phase 2 init + pre_loss_init_fn(self, obs_space, action_space, config) + if not existing_inputs: self._initialize_loss() @override(TFPolicyGraph) @@ -201,7 +143,7 @@ def copy(self, existing_inputs): if v.shape.as_list() != existing_inputs[i].shape.as_list(): raise ValueError("Tensor shape mismatch", i, k, v.shape, existing_inputs[i].shape) - # by convention, the loss inputs are followed by state inputs and then + # By convention, the loss inputs are followed by state inputs and then # the seq len tensor rnn_inputs = [] for i in range(len(self._state_inputs)): @@ -217,7 +159,10 @@ def copy(self, existing_inputs): self.action_space, self.config, existing_inputs=input_dict) - loss = instance._build_loss(instance, input_dict) + loss = instance._loss_fn(instance, input_dict) + if instance._stats_fn: + instance._stats_fetches.update( + instance._stats_fn(instance, input_dict)) TFPolicyGraph._initialize_loss( instance, loss, [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) @@ -259,7 +204,7 @@ def fake_array(tensor): postprocessed_batch = self.postprocess_trajectory( SampleBatch(dummy_batch)) - loss_input = UsageTrackingDict({ + batch_tensors = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, @@ -271,22 +216,24 @@ def fake_array(tensor): ] for k, v in postprocessed_batch.items(): - if k in loss_input: + if k in batch_tensors: continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF shape = (None, ) + v.shape[1:] placeholder = tf.placeholder(v.dtype, shape=shape, name=k) - loss_input[k] = placeholder + batch_tensors[k] = placeholder if log_once("loss_init"): logger.info( "Initializing loss function with dummy input:\n\n{}\n".format( - summarize(loss_input))) + summarize(batch_tensors))) - loss = self._build_loss(self, loss_input) - for k in sorted(loss_input.accessed_keys): - loss_inputs.append((k, loss_input[k])) + loss = self._loss_fn(self, batch_tensors) + if self._stats_fn: + self._stats_fetches.update(self._stats_fn(self, batch_tensors)) + for k in sorted(batch_tensors.accessed_keys): + loss_inputs.append((k, batch_tensors[k])) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index bc15dd35db9b..a33626b2df52 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -120,6 +120,7 @@ def __init__(self, self._max_seq_len = max_seq_len self._batch_divisibility_req = batch_divisibility_req self._update_ops = update_ops + self._stats_fetches = {} if loss is not None: self._initialize_loss(loss, loss_inputs) @@ -147,10 +148,9 @@ def _initialize_loss(self, loss, loss_inputs): if self.model: self._loss = self.model.custom_loss(loss, self._loss_input_dict) - 
self._stats_fetches = {"model": self.model.custom_stats()} + self._stats_fetches.update({"model": self.model.custom_stats()}) else: self._loss = loss - self._stats_fetches = {} self._optimizer = self.optimizer() self._grads_and_vars = [ diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py new file mode 100644 index 000000000000..549b508e791e --- /dev/null +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -0,0 +1,120 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.utils.annotations import override, DeveloperAPI + + +@DeveloperAPI +def build_tf_graph(name, + get_default_config, + loss_fn, + stats_fn=None, + extra_action_fetches_fn=None, + postprocess_fn=None, + optimizer_fn=None, + gradients_fn=None, + pre_init_fn=None, + pre_loss_init_fn=None, + post_init_fn=None, + mixins=None): + """Helper function for creating a dynamic tf policy graph at runtime. + + Arguments: + name (str): name of the graph (e.g., "PPOPolicyGraph") + get_default_config (func): function that returns the default config + to merge with any overrides + loss_fn (func): function that returns a loss tensor the policy graph, + and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and batch input tensors + extra_action_fetches_fn (func): optional function that returns + a dict of TF fetches given the policy graph object + postprocess_fn (func): optional experience postprocessing function + that takes the same args as PolicyGraph.postprocess_trajectory() + optimizer_fn (func): optional function that returns a tf.Optimizer + given the policy graph object + gradients_fn (func): optional function that returns a list of gradients + given a tf optimizer and loss tensor. 
If not specified, this + defaults to optimizer.compute_gradients(loss) + pre_init_fn (func): optional function to run at the beginning of + __init__ that takes the same arguments as __init__ + pre_loss_init_fn (func): optional function to run prior to loss + init that takes the same arguments as __init__ + post_init_fn (func): optional function to run at the end of __init__ + that takes the same arguments as __init__ + mixins (list): list of any class mixins for the returned policy class + + Returns: + a DynamicTFPolicyGraph instance that uses the specified args + """ + + if mixins is None: + mixins = [] + + class graph_cls(*mixins, DynamicTFPolicyGraph): + def __init__(self, + obs_space, + action_space, + config, + existing_inputs=None): + config = dict(get_default_config(), **config) + + if pre_init_fn: + pre_init_fn(self, obs_space, action_space, config) + + def before_loss_init(policy, obs_space, action_space, config): + if pre_loss_init_fn: + pre_loss_init_fn(policy, obs_space, action_space, config) + if extra_action_fetches_fn is None: + self._extra_action_fetches = {} + else: + self._extra_action_fetches = extra_action_fetches_fn(self) + + DynamicTFPolicyGraph.__init__( + self, + obs_space, + action_space, + config, + loss_fn, + stats_fn, + pre_loss_init_fn=before_loss_init, + existing_inputs=existing_inputs) + if post_init_fn: + post_init_fn(self, obs_space, action_space, config) + + @override(PolicyGraph) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not postprocess_fn: + return sample_batch + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) + + @override(TFPolicyGraph) + def optimizer(self): + if optimizer_fn: + return optimizer_fn(self) + else: + return TFPolicyGraph.optimizer(self) + + @override(TFPolicyGraph) + def gradients(self, optimizer, loss): + if gradients_fn: + return gradients_fn(self, optimizer, loss) + else: + return TFPolicyGraph.gradients(self, optimizer, loss) + + @override(TFPolicyGraph) + def extra_compute_action_fetches(self): + return dict( + TFPolicyGraph.extra_compute_action_fetches(self), + **self._extra_action_fetches) + + graph_cls.__name__ = name + return graph_cls diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py index d892dbe7dbac..8d1bbd4fb54d 100644 --- a/python/ray/rllib/optimizers/multi_gpu_impl.py +++ b/python/ray/rllib/optimizers/multi_gpu_impl.py @@ -255,7 +255,7 @@ def optimize(self, sess, batch_index): fetches = {"train": self._train_op} for tower in self._towers: - fetches.update(tower.loss_graph.extra_compute_grad_fetches()) + fetches.update(tower.loss_graph._get_grad_and_stats_fetches()) return sess.run(fetches, feed_dict=feed_dict) diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 45df865e43ff..de2671e6a932 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -222,6 +222,6 @@ def stats(self): def _averaged(kv): out = {} for k, v in kv.items(): - if v[0] is not None: + if v[0] is not None and not isinstance(v[0], dict): out[k] = np.mean(v) return out From e218d2c59ad35cd09648e10bfcaa09c541610cef Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:37:37 -0700 Subject: [PATCH 12/39] add ppo graph --- python/ray/rllib/agents/ppo/ppo.py | 212 ++++++++++++++--------------- 1 file changed, 103 insertions(+), 109 deletions(-) diff --git 
a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 8f69c91149e7..54f7a579fcc4 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -4,10 +4,10 @@ import logging -from ray.rllib.agents import Trainer, with_common_config +from ray.rllib.agents import with_common_config from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer -from ray.rllib.utils.annotations import override logger = logging.getLogger(__name__) @@ -63,110 +63,104 @@ # yapf: enable -class PPOTrainer(Trainer): - """Multi-GPU optimized implementation of PPO in TensorFlow.""" - - _name = "PPO" - _default_config = DEFAULT_CONFIG - _policy_graph = PPOPolicyGraph - - @override(Trainer) - def _init(self, config, env_creator): - self._validate_config() - self.local_evaluator = self.make_local_evaluator( - env_creator, self._policy_graph) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, self._policy_graph, config["num_workers"]) - if config["simple_optimizer"]: - self.optimizer = SyncSamplesOptimizer( - self.local_evaluator, - self.remote_evaluators, - num_sgd_iter=config["num_sgd_iter"], - train_batch_size=config["train_batch_size"]) - else: - self.optimizer = LocalMultiGPUOptimizer( - self.local_evaluator, - self.remote_evaluators, - sgd_batch_size=config["sgd_minibatch_size"], - num_sgd_iter=config["num_sgd_iter"], - num_gpus=config["num_gpus"], - sample_batch_size=config["sample_batch_size"], - num_envs_per_worker=config["num_envs_per_worker"], - train_batch_size=config["train_batch_size"], - standardize_fields=["advantages"], - straggler_mitigation=config["straggler_mitigation"]) - - @override(Trainer) - def _train(self): - if "observation_filter" not in self.raw_user_config: - # TODO(ekl) remove this message after a few releases - logger.info( - "Important! Since 0.7.0, observation normalization is no " - "longer enabled by default. To enable running-mean " - "normalization, set 'observation_filter': 'MeanStdFilter'. " - "You can ignore this message if your environment doesn't " - "require observation normalization.") - prev_steps = self.optimizer.num_steps_sampled - fetches = self.optimizer.step() - if "kl" in fetches: - # single-agent - self.local_evaluator.for_policy( - lambda pi: pi.update_kl(fetches["kl"])) - else: - - def update(pi, pi_id): - if pi_id in fetches: - pi.update_kl(fetches[pi_id]["kl"]) - else: - logger.debug( - "No data for {}, not updating kl".format(pi_id)) - - # multi-agent - self.local_evaluator.foreach_trainable_policy(update) - res = self.collect_metrics() - res.update( - timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps, - info=res.get("info", {})) - - # Warn about bad clipping configs - if self.config["vf_clip_param"] <= 0: - rew_scale = float("inf") - elif res["policy_reward_mean"]: - rew_scale = 0 # punt on handling multiagent case - else: - rew_scale = round( - abs(res["episode_reward_mean"]) / self.config["vf_clip_param"], - 0) - if rew_scale > 200: - logger.warning( - "The magnitude of your environment rewards are more than " - "{}x the scale of `vf_clip_param`. ".format(rew_scale) + - "This means that it will take more than " - "{} iterations for your value ".format(rew_scale) + - "function to converge. 
If this is not intended, consider " - "increasing `vf_clip_param`.") - return res - - def _validate_config(self): - if self.config["entropy_coeff"] < 0: - raise DeprecationWarning("entropy_coeff must be >= 0") - if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]: - raise ValueError( - "Minibatch size {} must be <= train batch size {}.".format( - self.config["sgd_minibatch_size"], - self.config["train_batch_size"])) - if (self.config["batch_mode"] == "truncate_episodes" - and not self.config["use_gae"]): - raise ValueError( - "Episode truncation is not supported without a value " - "function. Consider setting batch_mode=complete_episodes.") - if (self.config["multiagent"]["policy_graphs"] - and not self.config["simple_optimizer"]): - logger.info( - "In multi-agent mode, policies will be optimized sequentially " - "by the multi-GPU optimizer. Consider setting " - "simple_optimizer=True if this doesn't work for you.") - if not self.config["vf_share_layers"]: - logger.warning( - "FYI: By default, the value function will not share layers " - "with the policy model ('vf_share_layers': False).") +def _make_optimizer(local_evaluator, remote_evaluators, config): + if config["simple_optimizer"]: + return SyncSamplesOptimizer( + local_evaluator, + remote_evaluators, + num_sgd_iter=config["num_sgd_iter"], + train_batch_size=config["train_batch_size"]) + + return LocalMultiGPUOptimizer( + local_evaluator, + remote_evaluators, + sgd_batch_size=config["sgd_minibatch_size"], + num_sgd_iter=config["num_sgd_iter"], + num_gpus=config["num_gpus"], + sample_batch_size=config["sample_batch_size"], + num_envs_per_worker=config["num_envs_per_worker"], + train_batch_size=config["train_batch_size"], + standardize_fields=["advantages"], + straggler_mitigation=config["straggler_mitigation"]) + + +def _update_kl(trainer, fetches): + if "kl" in fetches: + # single-agent + trainer.local_evaluator.for_policy( + lambda pi: pi.update_kl(fetches["kl"])) + else: + + def update(pi, pi_id): + if pi_id in fetches: + pi.update_kl(fetches[pi_id]["kl"]) + else: + logger.debug("No data for {}, not updating kl".format(pi_id)) + + # multi-agent + trainer.local_evaluator.foreach_trainable_policy(update) + + +def _warn_about_obs_filter(trainer): + if "observation_filter" not in trainer.raw_user_config: + # TODO(ekl) remove this message after a few releases + logger.info( + "Important! Since 0.7.0, observation normalization is no " + "longer enabled by default. To enable running-mean " + "normalization, set 'observation_filter': 'MeanStdFilter'. " + "You can ignore this message if your environment doesn't " + "require observation normalization.") + + +def _warn_about_bad_reward_scales(trainer, result): + # Warn about bad clipping configs + if trainer.config["vf_clip_param"] <= 0: + rew_scale = float("inf") + elif result["policy_reward_mean"]: + rew_scale = 0 # punt on handling multiagent case + else: + rew_scale = round( + abs(result["episode_reward_mean"]) / + trainer.config["vf_clip_param"], 0) + if rew_scale > 200: + logger.warning( + "The magnitude of your environment rewards are more than " + "{}x the scale of `vf_clip_param`. ".format(rew_scale) + + "This means that it will take more than " + "{} iterations for your value ".format(rew_scale) + + "function to converge. 
If this is not intended, consider " + "increasing `vf_clip_param`.") + + +def _validate_config(config): + if config["entropy_coeff"] < 0: + raise DeprecationWarning("entropy_coeff must be >= 0") + if config["sgd_minibatch_size"] > config["train_batch_size"]: + raise ValueError( + "Minibatch size {} must be <= train batch size {}.".format( + config["sgd_minibatch_size"], config["train_batch_size"])) + if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]): + raise ValueError( + "Episode truncation is not supported without a value " + "function. Consider setting batch_mode=complete_episodes.") + if (config["multiagent"]["policy_graphs"] + and not config["simple_optimizer"]): + logger.info( + "In multi-agent mode, policies will be optimized sequentially " + "by the multi-GPU optimizer. Consider setting " + "simple_optimizer=True if this doesn't work for you.") + if not config["vf_share_layers"]: + logger.warning( + "FYI: By default, the value function will not share layers " + "with the policy model ('vf_share_layers': False).") + + +PPOTrainer = build_trainer( + "PPO", + default_config=DEFAULT_CONFIG, + default_policy_graph=PPOPolicyGraph, + make_policy_optimizer=_make_optimizer, + validate_config=_validate_config, + after_optimizer_step=_update_kl, + before_train_step=_warn_about_obs_filter, + after_train_result=_warn_about_bad_reward_scales) From 9d9fd97211e9d1d4822c6cab2bf904f251146c8f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:44:12 -0700 Subject: [PATCH 13/39] fix naming --- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 2 +- .../evaluation/tf_policy_graph_template.py | 30 ++++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index b322dba122b0..774fd7c583a3 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -279,5 +279,5 @@ def _setup_mixins(policy, obs_space, action_space, config): extra_action_fetches_fn=_build_ppo_action_fetches, postprocess_fn=_postprocess_ppo_gae, gradients_fn=_build_ppo_gradients, - pre_loss_init_fn=_setup_mixins, + before_loss_init=_setup_mixins, mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 549b508e791e..338ab587ce4a 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -17,9 +17,9 @@ def build_tf_graph(name, postprocess_fn=None, optimizer_fn=None, gradients_fn=None, - pre_init_fn=None, - pre_loss_init_fn=None, - post_init_fn=None, + before_init=None, + before_loss_init=None, + after_init=None, mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. @@ -40,11 +40,11 @@ def build_tf_graph(name, gradients_fn (func): optional function that returns a list of gradients given a tf optimizer and loss tensor. 
If not specified, this defaults to optimizer.compute_gradients(loss) - pre_init_fn (func): optional function to run at the beginning of + before_init (func): optional function to run at the beginning of __init__ that takes the same arguments as __init__ - pre_loss_init_fn (func): optional function to run prior to loss + before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ - post_init_fn (func): optional function to run at the end of __init__ + after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ mixins (list): list of any class mixins for the returned policy class @@ -63,12 +63,13 @@ def __init__(self, existing_inputs=None): config = dict(get_default_config(), **config) - if pre_init_fn: - pre_init_fn(self, obs_space, action_space, config) + if before_init: + before_init(self, obs_space, action_space, config) - def before_loss_init(policy, obs_space, action_space, config): - if pre_loss_init_fn: - pre_loss_init_fn(policy, obs_space, action_space, config) + def before_loss_init_wrapper(policy, obs_space, action_space, + config): + if before_loss_init: + before_loss_init(policy, obs_space, action_space, config) if extra_action_fetches_fn is None: self._extra_action_fetches = {} else: @@ -81,10 +82,11 @@ def before_loss_init(policy, obs_space, action_space, config): config, loss_fn, stats_fn, - pre_loss_init_fn=before_loss_init, + before_loss_init=before_loss_init_wrapper, existing_inputs=existing_inputs) - if post_init_fn: - post_init_fn(self, obs_space, action_space, config) + + if after_init: + after_init(self, obs_space, action_space, config) @override(PolicyGraph) def postprocess_trajectory(self, From 3093391674ecdc5f53c3ade5a8afa04a2eb3a4a4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:45:32 -0700 Subject: [PATCH 14/39] order --- python/ray/rllib/evaluation/tf_policy_graph_template.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 338ab587ce4a..00c3a25ab753 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -46,7 +46,9 @@ def build_tf_graph(name, init that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ - mixins (list): list of any class mixins for the returned policy class + mixins (list): list of any class mixins for the returned policy class. + These mixins will be applied in order and will have higher + precedence than the DynamicTFPolicyGraph class Returns: a DynamicTFPolicyGraph instance that uses the specified args From e670abd218b7b1c99bd3018168089a9a154a4aa0 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:51:13 -0700 Subject: [PATCH 15/39] docs --- .../evaluation/dynamic_tf_policy_graph.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 49da52a25636..5b7daa0a3eae 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -28,7 +28,8 @@ class DynamicTFPolicyGraph(TFPolicyGraph): * Phase 1: the model is created and model variables are initialized. 
* Phase 2: a fake batch of data is created, sent to the trajectory postprocessor, and then used to create placeholders for the loss - function. The loss function is initialized with these placeholders. + function. The loss and stats functions are initialized with these + placeholders. """ def __init__(self, @@ -38,10 +39,33 @@ def __init__(self, loss_fn, stats_fn=None, autosetup_model=True, - pre_loss_init_fn=None, + before_loss_init=None, action_sampler=None, action_prob=None, existing_inputs=None): + """Initialize a dynamic TF policy graph. + + Arguments: + observation_space (gym.Space): Observation space of the policy. + action_space (gym.Space): Action space of the policy. + config (dict): Policy-specific configuration data. + loss_fn (func): function that returns a loss tensor the policy + graph, and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and batch input tensors + autosetup_model (bool): whether to create a model and action dist + using catalog defaults. These will be available as self.model + and self.action_dist + before_loss_init (func): optional function to run prior to loss + init that takes the same arguments as __init__ + action_sampler (Tensor): if autosetup_model is False, this must be + specified to define how the policy computes actions + action_prob (Tensor): if autosetup_model is False, this can be + specified to define the chosen action probability + existing_inputs (OrderedDict): when copying a policy graph, this + specifies an existing dict of placeholders to use instead of + defining new ones + """ self.config = config self.autosetup_model = autosetup_model self._loss_fn = loss_fn @@ -123,7 +147,7 @@ def __init__(self, max_seq_len=config["model"]["max_seq_len"]) # Phase 2 init - pre_loss_init_fn(self, obs_space, action_space, config) + before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss() From 0aba2f38f652c23e7144b5868f5dd0c8beaa8216 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:58:21 -0700 Subject: [PATCH 16/39] set class name correctly --- python/ray/rllib/agents/trainer_template.py | 7 ++++++- python/ray/rllib/evaluation/tf_policy_graph_template.py | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 6c99eb6d4acc..85ed16b56044 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -41,6 +41,10 @@ def build_trainer(name, a Trainer instance that uses the specified args. 
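For orientation, this is roughly how build_trainer is meant to be called once this patch lands, mirroring the PPO conversion earlier in the series. The MyPG name and the reuse of the PG graph and config are hypothetical, and the sketch assumes the hooks not passed here (validate_config, after_optimizer_step, before_train_step, after_train_result) are optional, which this hunk does not show:

from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg import DEFAULT_CONFIG
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer


def _make_optimizer(local_evaluator, remote_evaluators, config):
    # Same hook signature as in the PPO conversion above.
    return SyncSamplesOptimizer(
        local_evaluator,
        remote_evaluators,
        train_batch_size=config["train_batch_size"])


MyPGTrainer = build_trainer(
    name="MyPG",  # no "Trainer" suffix; the template appends it below
    default_config=DEFAULT_CONFIG,
    default_policy_graph=PGPolicyGraph,
    make_policy_optimizer=_make_optimizer)

assert MyPGTrainer.__name__ == "MyPGTrainer"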
""" + if name.endswith("Trainer"): + raise ValueError("Algorithm name should not include *Trainer suffix", + name) + class trainer_cls(Trainer): _name = name _default_config = default_config @@ -78,5 +82,6 @@ def _train(self): after_train_result(self, res) return res - trainer_cls.__name__ = name + trainer_cls.__name__ = name + "Trainer" + trainer_cls.__qualname__ = name + "Trainer" return trainer_cls diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 00c3a25ab753..16aac69bd8ba 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -57,6 +57,9 @@ def build_tf_graph(name, if mixins is None: mixins = [] + if not name.endswith("PolicyGraph"): + raise ValueError("Name should match *PolicyGraph", name) + class graph_cls(*mixins, DynamicTFPolicyGraph): def __init__(self, obs_space, @@ -121,4 +124,5 @@ def extra_compute_action_fetches(self): **self._extra_action_fetches) graph_cls.__name__ = name + graph_cls.__qualname__ = name return graph_cls From 298fcd041174a9bd24c511fc4b516f6fd760d23d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 15:35:22 -0700 Subject: [PATCH 17/39] add torch builder --- python/ray/rllib/agents/pg/pg_policy_graph.py | 4 +- .../rllib/agents/pg/torch_pg_policy_graph.py | 92 +++++---------- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 4 +- .../evaluation/tf_policy_graph_template.py | 24 ++-- .../rllib/evaluation/torch_policy_graph.py | 36 +++--- .../evaluation/torch_policy_graph_template.py | 108 ++++++++++++++++++ python/ray/rllib/utils/tracking_dict.py | 12 +- 7 files changed, 187 insertions(+), 93 deletions(-) create mode 100644 python/ray/rllib/evaluation/torch_policy_graph_template.py diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 4858d4a5e87f..510662be6f00 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf @@ -32,7 +32,7 @@ def _make_optimizer(policy): return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) -PGPolicyGraph = build_tf_graph( +PGPolicyGraph = build_tf_policy( name="PGPolicyGraph", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=_postprocess_advantages, diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 746ef1bca42f..063cc0610c3e 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -3,81 +3,47 @@ from __future__ import print_function import torch -from torch import nn import ray -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph -from ray.rllib.utils.annotations import override +from ray.rllib.evaluation.torch_policy_graph_template import 
build_torch_policy -class PGLoss(nn.Module): - def __init__(self, dist_class): - nn.Module.__init__(self) - self.dist_class = dist_class +def _pg_torch_loss(policy, batch_tensors): + logits, _, values, _ = policy.model({ + SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] + }, []) + action_dist = policy.dist_class(logits) + log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) + # save the error in the policy object + policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( + log_probs.reshape(-1)) + return policy.pi_err - def forward(self, policy_model, observations, actions, advantages): - logits, _, values, _ = policy_model({ - SampleBatch.CUR_OBS: observations - }, []) - dist = self.dist_class(logits) - log_probs = dist.logp(actions) - self.pi_err = -advantages.dot(log_probs.reshape(-1)) - return self.pi_err +def _postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): + return compute_advantages( + sample_batch, 0.0, policy.config["gamma"], use_gae=False) -class PGPostprocessing(object): - """Adds the value func output and advantages field to the trajectory.""" - @override(TorchPolicyGraph) - def extra_action_out(self, model_out): - return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} +def _pg_loss_stats(policy, batch_tensors): + # the error is recorded when computing the loss + return {"policy_loss": policy.pi_err.item()} - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - return compute_advantages( - sample_batch, 0.0, self.config["gamma"], use_gae=False) +def _make_optimizer(policy): + return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) -class PGTorchPolicyGraph(PGPostprocessing, TorchPolicyGraph): - def __init__(self, obs_space, action_space, config): - config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) - self.config = config - dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - model = ModelCatalog.get_torch_model(obs_space, self.logit_dim, - self.config["model"]) - loss = PGLoss(dist_class) - TorchPolicyGraph.__init__( - self, - obs_space, - action_space, - model, - loss, - loss_inputs=[ - SampleBatch.CUR_OBS, SampleBatch.ACTIONS, - Postprocessing.ADVANTAGES - ], - action_distribution_cls=dist_class) - - @override(TorchPolicyGraph) - def optimizer(self): - return torch.optim.Adam(self._model.parameters(), lr=self.config["lr"]) - - @override(TorchPolicyGraph) - def extra_grad_info(self): - return {"policy_loss": self._loss.pi_err.item()} - - def _value(self, obs): - with self.lock: - obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device) - _, _, vf, _ = self.model({"obs": obs}, []) - return vf.detach().cpu().numpy().squeeze() +PGTorchPolicyGraph = build_torch_policy( + name="PGTorchPolicyGraph", + get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, + loss_fn=_pg_torch_loss, + stats_fn=_pg_loss_stats, + postprocess_fn=_postprocess_advantages, + optimizer_fn=_make_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 774fd7c583a3..6d5e7c971919 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -9,7 +9,7 @@ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule -from 
ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -271,7 +271,7 @@ def _setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) -PPOPolicyGraph = build_tf_graph( +PPOPolicyGraph = build_tf_policy( name="PPOPolicyGraph", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=_build_ppo_loss, diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 16aac69bd8ba..2b69c1a49bd7 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -9,18 +9,18 @@ @DeveloperAPI -def build_tf_graph(name, - get_default_config, - loss_fn, - stats_fn=None, - extra_action_fetches_fn=None, - postprocess_fn=None, - optimizer_fn=None, - gradients_fn=None, - before_init=None, - before_loss_init=None, - after_init=None, - mixins=None): +def build_tf_policy(name, + get_default_config, + loss_fn, + stats_fn=None, + extra_action_fetches_fn=None, + postprocess_fn=None, + optimizer_fn=None, + gradients_fn=None, + before_init=None, + before_loss_init=None, + after_init=None, + mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. Arguments: diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index fb5c879a1ab8..4a4e79a15242 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -15,6 +15,7 @@ from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.utils.annotations import override +from ray.rllib.utils.tracking_dict import UsageTrackingDict class TorchPolicyGraph(PolicyGraph): @@ -87,30 +88,26 @@ def compute_actions(self, @override(PolicyGraph) def learn_on_batch(self, postprocessed_batch): + batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + with self.lock: - loss_in = [] - for key in self._loss_inputs: - loss_in.append( - torch.from_numpy(postprocessed_batch[key]).to(self.device)) - loss_out = self._loss(self._model, *loss_in) + loss_out = self._compute_loss(batch_tensors) self._optimizer.zero_grad() loss_out.backward() grad_process_info = self.extra_grad_process() self._optimizer.step() - grad_info = self.extra_grad_info() + grad_info = self.extra_grad_info(batch_tensors) grad_info.update(grad_process_info) return {LEARNER_STATS_KEY: grad_info} @override(PolicyGraph) def compute_gradients(self, postprocessed_batch): + batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + with self.lock: - loss_in = [] - for key in self._loss_inputs: - loss_in.append( - torch.from_numpy(postprocessed_batch[key]).to(self.device)) - loss_out = self._loss(self._model, *loss_in) + loss_out = self._compute_loss(batch_tensors) self._optimizer.zero_grad() loss_out.backward() @@ -125,7 +122,7 @@ def compute_gradients(self, postprocessed_batch): else: grads.append(None) - grad_info = self.extra_grad_info() + grad_info = self.extra_grad_info(batch_tensors) grad_info.update(grad_process_info) return grads, {LEARNER_STATS_KEY: grad_info} @@ -163,7 +160,7 @@ def extra_action_out(self, model_out): model_out 
(list): Outputs of the policy model module.""" return {} - def extra_grad_info(self): + def extra_grad_info(self, batch_tensors): """Return dict of extra grad info.""" return {} @@ -171,3 +168,16 @@ def extra_grad_info(self): def optimizer(self): """Custom PyTorch optimizer to use.""" return torch.optim.Adam(self._model.parameters()) + + def _compute_loss(self, batch_tensors): + loss_in = [] + for key in self._loss_inputs: + loss_in.append(batch_tensors[key]) + loss_out = self._loss(self._model, *loss_in) + return loss_out + + def _lazy_tensor_dict(self, postprocessed_batch): + batch_tensors = UsageTrackingDict(postprocessed_batch) + batch_tensors.set_get_interceptor( + lambda arr: torch.from_numpy(arr).to(self.device)) + return batch_tensors diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py new file mode 100644 index 000000000000..66fb51032797 --- /dev/null +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -0,0 +1,108 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.annotations import override, DeveloperAPI + + +@DeveloperAPI +def build_torch_policy(name, + get_default_config, + loss_fn, + stats_fn=None, + postprocess_fn=None, + optimizer_fn=None, + before_init=None, + after_init=None, + mixins=None): + """Helper function for creating a dynamic tf policy graph at runtime. + + Arguments: + name (str): name of the graph (e.g., "PPOPolicyGraph") + get_default_config (func): function that returns the default config + to merge with any overrides + loss_fn (func): function that returns a loss tensor the policy graph, + and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + values given the policy graph and batch input tensors + postprocess_fn (func): optional experience postprocessing function + that takes the same args as PolicyGraph.postprocess_trajectory() + optimizer_fn (func): optional function that returns a torch optimizer + given the policy graph object + before_init (func): optional function to run at the beginning of + __init__ that takes the same arguments as __init__ + after_init (func): optional function to run at the end of __init__ + that takes the same arguments as __init__ + mixins (list): list of any class mixins for the returned policy class. 
+ These mixins will be applied in order and will have higher + precedence than the TorchPolicyGraph class + + Returns: + a TorchPolicyGraph instance that uses the specified args + """ + + if mixins is None: + mixins = [] + + if not name.endswith("TorchPolicyGraph"): + raise ValueError("Name should match *TorchPolicyGraph", name) + + class graph_cls(*mixins, TorchPolicyGraph): + def __init__(self, obs_space, action_space, config): + config = dict(get_default_config(), **config) + self.config = config + + if before_init: + before_init(self, obs_space, action_space, config) + + self.dist_class, self.logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], torch=True) + self.model = ModelCatalog.get_torch_model( + obs_space, self.logit_dim, self.config["model"]) + + TorchPolicyGraph.__init__( + self, + obs_space, + action_space, + self.model, + None, # loss fn is None since we override _compute_loss + [], # TODO(ekl) clean up torch loss handling + self.dist_class) + + if after_init: + after_init(self, obs_space, action_space, config) + + @override(PolicyGraph) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not postprocess_fn: + return sample_batch + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) + + @override(TorchPolicyGraph) + def optimizer(self): + if optimizer_fn: + return optimizer_fn(self) + else: + return TorchPolicyGraph.optimizer(self) + + @override(TorchPolicyGraph) + def extra_grad_info(self, batch_tensors): + if stats_fn: + return stats_fn(self, batch_tensors) + else: + return TorchPolicyGraph.extra_grad_info(self, batch_tensors) + + @override(TorchPolicyGraph) + def _compute_loss(self, batch_tensors): + return loss_fn(self, batch_tensors) + + graph_cls.__name__ = name + graph_cls.__qualname__ = name + return graph_cls diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py index d0a04c4d059e..d43f6e87b0f7 100644 --- a/python/ray/rllib/utils/tracking_dict.py +++ b/python/ray/rllib/utils/tracking_dict.py @@ -6,6 +6,9 @@ class UsageTrackingDict(dict): """Dict that tracks which keys have been accessed. + It can also intercept gets and allow an arbitrary callback to be applied + (i.e., to lazily convert numpy arrays to Tensors). + We make the simplifying assumption only __getitem__ is used to access values. 
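A short usage sketch of the new interceptor hook, matching how _lazy_tensor_dict in torch_policy_graph.py wires it up (assumes this ray commit plus numpy and torch are importable):

import numpy as np
import torch

from ray.rllib.utils.tracking_dict import UsageTrackingDict

batch = UsageTrackingDict({"obs": np.zeros((2, 4), dtype=np.float32)})
batch.set_get_interceptor(lambda arr: torch.from_numpy(arr))

obs = batch["obs"]            # converted to a torch.Tensor lazily, on access
assert isinstance(obs, torch.Tensor)
assert batch.accessed_keys == {"obs"}  # key tracking still works as before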
""" @@ -13,7 +16,14 @@ class UsageTrackingDict(dict): def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) self.accessed_keys = set() + self.get_interceptor = None + + def set_get_interceptor(self, fn): + self.get_interceptor = fn def __getitem__(self, key): self.accessed_keys.add(key) - return dict.__getitem__(self, key) + value = dict.__getitem__(self, key) + if self.get_interceptor: + value = self.get_interceptor(value) + return value From 5269fe0f2a463e11d5edce017fb583def7f4249a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 16:12:00 -0700 Subject: [PATCH 18/39] add custom model support in builder --- .../agents/a3c/a3c_torch_policy_graph.py | 173 ++++++++---------- python/ray/rllib/agents/pg/pg_policy_graph.py | 18 +- .../rllib/agents/pg/torch_pg_policy_graph.py | 22 +-- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 30 +-- .../evaluation/dynamic_tf_policy_graph.py | 60 +++--- .../evaluation/tf_policy_graph_template.py | 5 + .../rllib/evaluation/torch_policy_graph.py | 23 +-- .../evaluation/torch_policy_graph_template.py | 51 ++++-- 8 files changed, 181 insertions(+), 201 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index d35aabe0d667..13704f74e938 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -7,109 +7,84 @@ from torch import nn import ray -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph -from ray.rllib.utils.annotations import override - - -class A3CLoss(nn.Module): - def __init__(self, dist_class, vf_loss_coeff=0.5, entropy_coeff=0.01): - nn.Module.__init__(self) - self.dist_class = dist_class - self.vf_loss_coeff = vf_loss_coeff - self.entropy_coeff = entropy_coeff - - def forward(self, policy_model, observations, actions, advantages, - value_targets): - logits, _, values, _ = policy_model({ - SampleBatch.CUR_OBS: observations - }, []) - dist = self.dist_class(logits) - log_probs = dist.logp(actions) - self.entropy = dist.entropy().mean() - self.pi_err = -advantages.dot(log_probs.reshape(-1)) - self.value_err = F.mse_loss(values.reshape(-1), value_targets) - overall_err = sum([ - self.pi_err, - self.vf_loss_coeff * self.value_err, - -self.entropy_coeff * self.entropy, - ]) - - return overall_err - - -class A3CPostprocessing(object): - """Adds the VF preds and advantages fields to the trajectory.""" - - @override(TorchPolicyGraph) - def extra_action_out(self, model_out): - return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - completed = sample_batch[SampleBatch.DONES][-1] - if completed: - last_r = 0.0 - else: - last_r = self._value(sample_batch[SampleBatch.NEXT_OBS][-1]) - return compute_advantages(sample_batch, last_r, self.config["gamma"], - self.config["lambda"]) - - -class A3CTorchPolicyGraph(A3CPostprocessing, TorchPolicyGraph): - """A simple, non-recurrent PyTorch policy example.""" - - def __init__(self, obs_space, action_space, config): - config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) - self.config = config - dist_class, self.logit_dim = 
ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - model = ModelCatalog.get_torch_model(obs_space, self.logit_dim, - self.config["model"]) - loss = A3CLoss(dist_class, self.config["vf_loss_coeff"], - self.config["entropy_coeff"]) - TorchPolicyGraph.__init__( - self, - obs_space, - action_space, - model, - loss, - loss_inputs=[ - SampleBatch.CUR_OBS, SampleBatch.ACTIONS, - Postprocessing.ADVANTAGES, Postprocessing.VALUE_TARGETS - ], - action_distribution_cls=dist_class) - - @override(TorchPolicyGraph) - def optimizer(self): - return torch.optim.Adam(self._model.parameters(), lr=self.config["lr"]) - - @override(TorchPolicyGraph) - def extra_grad_process(self): - info = {} - if self.config["grad_clip"]: - total_norm = nn.utils.clip_grad_norm_(self._model.parameters(), - self.config["grad_clip"]) - info["grad_gnorm"] = total_norm - return info - - @override(TorchPolicyGraph) - def extra_grad_info(self): - return { - "policy_entropy": self._loss.entropy.item(), - "policy_loss": self._loss.pi_err.item(), - "vf_loss": self._loss.value_err.item() - } - +from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy + + +def a3c_torch_loss(policy, batch_tensors): + logits, _, values, _ = policy.model({ + SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] + }, []) + dist = policy.dist_class(logits) + log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS]) + policy.entropy = dist.entropy().mean() + policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( + log_probs.reshape(-1)) + policy.value_err = F.mse_loss( + values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS]) + overall_err = sum([ + policy.pi_err, + policy.config["vf_loss_coeff"] * policy.value_err, + -policy.config["entropy_coeff"] * policy.entropy, + ]) + return overall_err + + +def a3c_torch_stats(policy, batch_tensors): + return { + "policy_entropy": policy.entropy.item(), + "policy_loss": policy.pi_err.item(), + "vf_loss": policy.value_err.item(), + } + + +def a3c_extra_action_out(policy, model_out): + return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} + + +def a3c_extra_grad_process(policy): + info = {} + if policy.config["grad_clip"]: + total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(), + policy.config["grad_clip"]) + info["grad_gnorm"] = total_norm + return info + + +def optimizer(policy): + return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) + + +def postprocess_torch_a3c(policy, + sample_batch, + other_agent_batches=None, + episode=None): + completed = sample_batch[SampleBatch.DONES][-1] + if completed: + last_r = 0.0 + else: + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) + return compute_advantages(sample_batch, last_r, policy.config["gamma"], + policy.config["lambda"]) + + +class ValueNetworkMixin(object): def _value(self, obs): with self.lock: obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device) - _, _, vf, _ = self._model({"obs": obs}, []) + _, _, vf, _ = self.model({"obs": obs}, []) return vf.detach().cpu().numpy().squeeze() + + +A3CTorchPolicyGraph = build_torch_policy( + name="A3CTorchPolicyGraph", + get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, + loss_fn=a3c_torch_loss, + stats_fn=a3c_torch_stats, + postprocess_fn=postprocess_torch_a3c, + extra_action_out_fn=a3c_extra_action_out, + extra_grad_process_fn=a3c_extra_grad_process, + optimizer_fn=optimizer, + mixins=[ValueNetworkMixin]) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py 
b/python/ray/rllib/agents/pg/pg_policy_graph.py index 510662be6f00..4bdcb3e1fed5 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -13,28 +13,28 @@ # The basic policy gradients loss -def _policy_gradient_loss(policy, batch_tensors): +def policy_gradient_loss(policy, batch_tensors): actions = batch_tensors[SampleBatch.ACTIONS] advantages = batch_tensors[Postprocessing.ADVANTAGES] return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) # This adds the "advantages" column to the sample batch. -def _postprocess_advantages(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): return compute_advantages( sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def _make_optimizer(policy): +def make_optimizer(policy): return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) PGPolicyGraph = build_tf_policy( name="PGPolicyGraph", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, - postprocess_fn=_postprocess_advantages, - loss_fn=_policy_gradient_loss, - optimizer_fn=_make_optimizer) + postprocess_fn=postprocess_advantages, + loss_fn=policy_gradient_loss, + optimizer_fn=make_optimizer) diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 063cc0610c3e..040dcadc9742 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -11,7 +11,7 @@ from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy -def _pg_torch_loss(policy, batch_tensors): +def pg_torch_loss(policy, batch_tensors): logits, _, values, _ = policy.model({ SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }, []) @@ -23,27 +23,27 @@ def _pg_torch_loss(policy, batch_tensors): return policy.pi_err -def _postprocess_advantages(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): return compute_advantages( sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def _pg_loss_stats(policy, batch_tensors): +def pg_loss_stats(policy, batch_tensors): # the error is recorded when computing the loss return {"policy_loss": policy.pi_err.item()} -def _make_optimizer(policy): +def make_optimizer(policy): return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) PGTorchPolicyGraph = build_torch_policy( name="PGTorchPolicyGraph", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, - loss_fn=_pg_torch_loss, - stats_fn=_pg_loss_stats, - postprocess_fn=_postprocess_advantages, - optimizer_fn=_make_optimizer) + loss_fn=pg_torch_loss, + stats_fn=pg_loss_stats, + postprocess_fn=postprocess_advantages, + optimizer_fn=make_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 6d5e7c971919..6e05d7069f90 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -104,7 +104,7 @@ def reduce_mean_valid(t): self.loss = loss -def _build_ppo_loss(policy, batch_tensors): +def build_ppo_loss(policy, batch_tensors): if policy.model.state_in: max_seq_len = tf.reduce_max(policy.model.seq_lens) mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) @@ -133,7 +133,7 @@ def _build_ppo_loss(policy, batch_tensors): return 
policy.loss_obj.loss -def _build_ppo_stats(policy, batch_tensors): +def build_ppo_stats(policy, batch_tensors): policy.explained_variance = explained_variance( batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) @@ -151,7 +151,7 @@ def _build_ppo_stats(policy, batch_tensors): return stats_fetches -def _build_ppo_action_fetches(policy): +def build_ppo_action_fetches(policy): """Adds value function and logits outputs to experience batches.""" return { SampleBatch.VF_PREDS: policy.value_function, @@ -159,10 +159,10 @@ def _build_ppo_action_fetches(policy): } -def _postprocess_ppo_gae(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def postprocess_ppo_gae(policy, + sample_batch, + other_agent_batches=None, + episode=None): """Adds the policy logits, VF preds, and advantages to the trajectory.""" completed = sample_batch["dones"][-1] @@ -185,7 +185,7 @@ def _postprocess_ppo_gae(policy, return batch -def _build_ppo_gradients(policy, optimizer, loss): +def build_ppo_gradients(policy, optimizer, loss): if policy.config["grad_clip"] is not None: policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) @@ -265,7 +265,7 @@ def _value(self, ob, prev_action, prev_reward, *args): return vf[0] -def _setup_mixins(policy, obs_space, action_space, config): +def setup_mixins(policy, obs_space, action_space, config): ValueNetworkMixin.__init__(policy, obs_space, action_space, config) KLCoeffMixin.__init__(policy, config) LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) @@ -274,10 +274,10 @@ def _setup_mixins(policy, obs_space, action_space, config): PPOPolicyGraph = build_tf_policy( name="PPOPolicyGraph", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, - loss_fn=_build_ppo_loss, - stats_fn=_build_ppo_stats, - extra_action_fetches_fn=_build_ppo_action_fetches, - postprocess_fn=_postprocess_ppo_gae, - gradients_fn=_build_ppo_gradients, - before_loss_init=_setup_mixins, + loss_fn=build_ppo_loss, + stats_fn=build_ppo_stats, + extra_action_fetches_fn=build_ppo_action_fetches, + postprocess_fn=postprocess_ppo_gae, + gradients_fn=build_ppo_gradients, + before_loss_init=setup_mixins, mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 5b7daa0a3eae..d61ecc80ef5f 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -10,7 +10,7 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf from ray.rllib.utils.debug import log_once, summarize from ray.rllib.utils.tracking_dict import UsageTrackingDict @@ -20,7 +20,6 @@ logger = logging.getLogger(__name__) -@DeveloperAPI class DynamicTFPolicyGraph(TFPolicyGraph): """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. @@ -38,10 +37,8 @@ def __init__(self, config, loss_fn, stats_fn=None, - autosetup_model=True, before_loss_init=None, - action_sampler=None, - action_prob=None, + make_action_sampler=None, existing_inputs=None): """Initialize a dynamic TF policy graph. 
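Note on the hunk above: it collapses the autosetup_model / action_sampler / action_prob arguments into a single make_action_sampler callback, whose (policy, input_dict, obs_space, action_space, config) signature is documented in the next hunk. A minimal sketch of such a callback for a Discrete action space follows; it is illustrative only and not part of this patch, and the function and variable names are hypothetical:

from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def make_uniform_action_sampler(policy, input_dict, obs_space, action_space,
                                config):
    # Returns the (action_sampler, action_prob) tensor pair expected by
    # DynamicTFPolicyGraph: here, actions drawn uniformly at random from a
    # Discrete space, each with probability 1 / n.
    batch_size = tf.shape(input_dict["obs"])[0]
    action_sampler = tf.random_uniform(
        [batch_size], minval=0, maxval=action_space.n, dtype=tf.int64)
    action_prob = tf.ones([batch_size]) / action_space.n
    return action_sampler, action_prob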
@@ -53,21 +50,17 @@ def __init__(self, graph, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors - autosetup_model (bool): whether to create a model and action dist - using catalog defaults. These will be available as self.model - and self.action_dist before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ - action_sampler (Tensor): if autosetup_model is False, this must be - specified to define how the policy computes actions - action_prob (Tensor): if autosetup_model is False, this can be - specified to define the chosen action probability + make_action_sampler (func): optional function that returns a + tuple of action and action prob tensors. The function takes + (policy, input_dict, obs_space, action_space, config) as its + arguments existing_inputs (OrderedDict): when copying a policy graph, this specifies an existing dict of placeholders to use instead of defining new ones """ self.config = config - self.autosetup_model = autosetup_model self._loss_fn = loss_fn self._stats_fn = stats_fn @@ -85,9 +78,23 @@ def __init__(self, prev_rewards = tf.placeholder( tf.float32, [None], name="prev_reward") + input_dict = { + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + } + # Create the model network and action outputs - if autosetup_model: - dist_class, self.logit_dim = ModelCatalog.get_action_dist( + if make_action_sampler: + assert not existing_inputs, \ + "Cloning not supported with custom action sampler" + self.model = None + self.action_dist = None + action_sampler, action_prob = make_action_sampler( + self, input_dict, obs_space, action_space, config) + else: + dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if existing_inputs: existing_state_in = [ @@ -102,29 +109,16 @@ def __init__(self, existing_state_in = [] existing_seq_lens = None self.model = ModelCatalog.get_model( - { - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, + input_dict, obs_space, action_space, - self.logit_dim, + logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) self.action_dist = dist_class(self.model.outputs) action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() - else: - self.logit_dim = None - self.model = None - self.action_dist = None - if not action_sampler: - raise ValueError( - "When autosetup_model=False, action_sampler must be " - "passed in to the constructor.") # Phase 1 init sess = tf.get_default_session() @@ -139,11 +133,11 @@ def __init__(self, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, + state_inputs=self.model and self.model.state_in, + state_outputs=self.model and self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, - seq_lens=self.model.seq_lens, + seq_lens=self.model and self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) # Phase 2 init diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 2b69c1a49bd7..4c708de35e18 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ 
b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -20,6 +20,7 @@ def build_tf_policy(name, before_init=None, before_loss_init=None, after_init=None, + make_action_sampler=None, mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. @@ -46,6 +47,10 @@ def build_tf_policy(name, init that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ + make_action_sampler (func): optional function that returns a + tuple of action and action prob tensors. The function takes + (policy, input_dict, obs_space, action_space, config) as its + arguments mixins (list): list of any class mixins for the returned policy class. These mixins will be applied in order and will have higher precedence than the DynamicTFPolicyGraph class diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index 4a4e79a15242..3a1464606612 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -31,7 +31,7 @@ class TorchPolicyGraph(PolicyGraph): """ def __init__(self, observation_space, action_space, model, loss, - loss_inputs, action_distribution_cls): + action_distribution_cls): """Build a policy graph from policy and loss torch modules. Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES @@ -43,13 +43,8 @@ def __init__(self, observation_space, action_space, model, loss, model (nn.Module): PyTorch policy module. Given observations as input, this module must return a list of outputs where the first item is action logits, and the rest can be any value. - loss (nn.Module): Loss defined as a PyTorch module. The inputs for - this module are defined by the `loss_inputs` param. This module - returns a single scalar loss. Note that this module should - internally be using the model module. - loss_inputs (list): List of SampleBatch columns that will be - passed to the loss module's forward() function when computing - the loss. For example, ["obs", "action", "advantages"]. + loss (func): Function that takes (policy_graph, batch_tensors) + and returns a single scalar loss. action_distribution_cls (ActionDistribution): Class for action distribution. 
""" @@ -61,7 +56,6 @@ def __init__(self, observation_space, action_space, model, loss, else torch.device("cpu")) self._model = model.to(self.device) self._loss = loss - self._loss_inputs = loss_inputs self._optimizer = self.optimizer() self._action_dist_cls = action_distribution_cls @@ -91,7 +85,7 @@ def learn_on_batch(self, postprocessed_batch): batch_tensors = self._lazy_tensor_dict(postprocessed_batch) with self.lock: - loss_out = self._compute_loss(batch_tensors) + loss_out = self._loss(self, batch_tensors) self._optimizer.zero_grad() loss_out.backward() @@ -107,7 +101,7 @@ def compute_gradients(self, postprocessed_batch): batch_tensors = self._lazy_tensor_dict(postprocessed_batch) with self.lock: - loss_out = self._compute_loss(batch_tensors) + loss_out = self._loss(self, batch_tensors) self._optimizer.zero_grad() loss_out.backward() @@ -169,13 +163,6 @@ def optimizer(self): """Custom PyTorch optimizer to use.""" return torch.optim.Adam(self._model.parameters()) - def _compute_loss(self, batch_tensors): - loss_in = [] - for key in self._loss_inputs: - loss_in.append(batch_tensors[key]) - loss_out = self._loss(self._model, *loss_in) - return loss_out - def _lazy_tensor_dict(self, postprocessed_batch): batch_tensors = UsageTrackingDict(postprocessed_batch) batch_tensors.set_get_interceptor( diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py index 66fb51032797..50685ce5bd5f 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph_template.py +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -14,9 +14,12 @@ def build_torch_policy(name, loss_fn, stats_fn=None, postprocess_fn=None, + extra_action_out_fn=None, + extra_grad_process_fn=None, optimizer_fn=None, before_init=None, after_init=None, + make_model_and_action_dist=None, mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. @@ -30,12 +33,20 @@ def build_torch_policy(name, values given the policy graph and batch input tensors postprocess_fn (func): optional experience postprocessing function that takes the same args as PolicyGraph.postprocess_trajectory() + extra_action_out_fn (func): optional function that returns + a dict of extra values to include in experiences + extra_grad_process_fn (func): optional function that is called after + gradients are computed and returns processing info optimizer_fn (func): optional function that returns a torch optimizer given the policy graph object before_init (func): optional function to run at the beginning of __init__ that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ + make_model_and_action_dist (func): optional func that takes the same + arguments as __init__ and returns a tuple of model instance and + torch action distribution class. If not specified, the default + model and action dist from the catalog will be used mixins (list): list of any class mixins for the returned policy class. 
These mixins will be applied in order and will have higher precedence than the TorchPolicyGraph class @@ -58,19 +69,17 @@ def __init__(self, obs_space, action_space, config): if before_init: before_init(self, obs_space, action_space, config) - self.dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - self.model = ModelCatalog.get_torch_model( - obs_space, self.logit_dim, self.config["model"]) + if make_model_and_action_dist: + self.model, self.dist_class = make_model_and_action_dist( + self, obs_space, action_space, config) + else: + self.dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], torch=True) + self.model = ModelCatalog.get_torch_model( + obs_space, logit_dim, self.config["model"]) - TorchPolicyGraph.__init__( - self, - obs_space, - action_space, - self.model, - None, # loss fn is None since we override _compute_loss - [], # TODO(ekl) clean up torch loss handling - self.dist_class) + TorchPolicyGraph.__init__(self, obs_space, action_space, + self.model, loss_fn, self.dist_class) if after_init: after_init(self, obs_space, action_space, config) @@ -85,6 +94,20 @@ def postprocess_trajectory(self, return postprocess_fn(self, sample_batch, other_agent_batches, episode) + @override(TorchPolicyGraph) + def extra_grad_process(self): + if extra_grad_process_fn: + return extra_grad_process_fn(self) + else: + return TorchPolicyGraph.extra_grad_process(self) + + @override(TorchPolicyGraph) + def extra_action_out(self, model_out): + if extra_action_out_fn: + return extra_action_out_fn(self, model_out) + else: + return TorchPolicyGraph.extra_action_out_fn(self, model_out) + @override(TorchPolicyGraph) def optimizer(self): if optimizer_fn: @@ -99,10 +122,6 @@ def extra_grad_info(self, batch_tensors): else: return TorchPolicyGraph.extra_grad_info(self, batch_tensors) - @override(TorchPolicyGraph) - def _compute_loss(self, batch_tensors): - return loss_fn(self, batch_tensors) - graph_cls.__name__ = name graph_cls.__qualname__ = name return graph_cls From ac108dde2710ba574956f390e9dad8b34a3896bf Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 16:13:09 -0700 Subject: [PATCH 19/39] cleanup --- .../agents/a3c/a3c_torch_policy_graph.py | 26 +++++++++---------- .../evaluation/torch_policy_graph_template.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 13704f74e938..930bd78094be 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -40,6 +40,19 @@ def a3c_torch_stats(policy, batch_tensors): } +def postprocess_torch_a3c(policy, + sample_batch, + other_agent_batches=None, + episode=None): + completed = sample_batch[SampleBatch.DONES][-1] + if completed: + last_r = 0.0 + else: + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) + return compute_advantages(sample_batch, last_r, policy.config["gamma"], + policy.config["lambda"]) + + def a3c_extra_action_out(policy, model_out): return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} @@ -57,19 +70,6 @@ def optimizer(policy): return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) -def postprocess_torch_a3c(policy, - sample_batch, - other_agent_batches=None, - episode=None): - completed = sample_batch[SampleBatch.DONES][-1] - if completed: - last_r = 0.0 - else: - last_r = 
policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) - return compute_advantages(sample_batch, last_r, policy.config["gamma"], - policy.config["lambda"]) - - class ValueNetworkMixin(object): def _value(self, obs): with self.lock: diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py index 50685ce5bd5f..745ba893a93f 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph_template.py +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -21,7 +21,7 @@ def build_torch_policy(name, after_init=None, make_model_and_action_dist=None, mixins=None): - """Helper function for creating a dynamic tf policy graph at runtime. + """Helper function for creating a torch policy graph at runtime. Arguments: name (str): name of the graph (e.g., "PPOPolicyGraph") From 707acf827347783852ba106b1d1530034a1e1637 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 16:20:28 -0700 Subject: [PATCH 20/39] remove underscores --- python/ray/rllib/agents/pg/pg.py | 8 ++++---- python/ray/rllib/agents/ppo/ppo.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index d9acebd91e80..a2e81be5ebec 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -21,13 +21,13 @@ # yapf: enable -def _make_policy_optimizer(local_ev, remote_evs, config): +def make_policy_optimizer(local_ev, remote_evs, config): optimizer_config = dict(config["optimizer"], **{"train_batch_size": config["train_batch_size"]}) return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def _get_policy_graph(config): +def get_policy_graph(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import \ PGTorchPolicyGraph @@ -40,5 +40,5 @@ def _get_policy_graph(config): "PG", default_config=DEFAULT_CONFIG, default_policy_graph=PGPolicyGraph, - get_policy_graph=_get_policy_graph, - make_policy_optimizer=_make_policy_optimizer) + get_policy_graph=get_policy_graph, + make_policy_optimizer=make_policy_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 54f7a579fcc4..6bf6269a6a80 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -63,7 +63,7 @@ # yapf: enable -def _make_optimizer(local_evaluator, remote_evaluators, config): +def make_optimizer(local_evaluator, remote_evaluators, config): if config["simple_optimizer"]: return SyncSamplesOptimizer( local_evaluator, @@ -84,7 +84,7 @@ def _make_optimizer(local_evaluator, remote_evaluators, config): straggler_mitigation=config["straggler_mitigation"]) -def _update_kl(trainer, fetches): +def update_kl(trainer, fetches): if "kl" in fetches: # single-agent trainer.local_evaluator.for_policy( @@ -101,7 +101,7 @@ def update(pi, pi_id): trainer.local_evaluator.foreach_trainable_policy(update) -def _warn_about_obs_filter(trainer): +def warn_about_obs_filter(trainer): if "observation_filter" not in trainer.raw_user_config: # TODO(ekl) remove this message after a few releases logger.info( @@ -112,7 +112,7 @@ def _warn_about_obs_filter(trainer): "require observation normalization.") -def _warn_about_bad_reward_scales(trainer, result): +def warn_about_bad_reward_scales(trainer, result): # Warn about bad clipping configs if trainer.config["vf_clip_param"] <= 0: rew_scale = float("inf") @@ -132,7 +132,7 @@ def _warn_about_bad_reward_scales(trainer, result): 
"increasing `vf_clip_param`.") -def _validate_config(config): +def validate_config(config): if config["entropy_coeff"] < 0: raise DeprecationWarning("entropy_coeff must be >= 0") if config["sgd_minibatch_size"] > config["train_batch_size"]: @@ -159,8 +159,8 @@ def _validate_config(config): "PPO", default_config=DEFAULT_CONFIG, default_policy_graph=PPOPolicyGraph, - make_policy_optimizer=_make_optimizer, - validate_config=_validate_config, - after_optimizer_step=_update_kl, - before_train_step=_warn_about_obs_filter, - after_train_result=_warn_about_bad_reward_scales) + make_policy_optimizer=make_optimizer, + validate_config=validate_config, + after_optimizer_step=update_kl, + before_train_step=warn_about_obs_filter, + after_train_result=warn_about_bad_reward_scales) From a4a92600af018d94983eead2898213d76b49ec1c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 17:07:00 -0700 Subject: [PATCH 21/39] fix py2 compat --- python/ray/rllib/agents/ddpg/ddpg_policy_graph.py | 2 +- .../rllib/evaluation/tf_policy_graph_template.py | 13 +++++++++---- .../evaluation/torch_policy_graph_template.py | 15 ++++++++++----- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 6c4917ad853f..6ac8f1ef8ab2 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -507,7 +507,7 @@ def make_noisy_actions(): def make_uniform_random_actions(): # pure random exploration option - uniform_random_actions = tf.random.uniform( + uniform_random_actions = tf.random_uniform( tf.shape(deterministic_actions)) # rescale uniform random actions according to action range tf_range = tf.constant(action_range[None], dtype="float32") diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 4c708de35e18..57981b326673 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -59,13 +59,18 @@ def build_tf_policy(name, a DynamicTFPolicyGraph instance that uses the specified args """ - if mixins is None: - mixins = [] - if not name.endswith("PolicyGraph"): raise ValueError("Name should match *PolicyGraph", name) - class graph_cls(*mixins, DynamicTFPolicyGraph): + base = DynamicTFPolicyGraph + while mixins: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + class graph_cls(base): def __init__(self, obs_space, action_space, diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py index 745ba893a93f..71756f11dc85 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph_template.py +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -55,13 +55,18 @@ def build_torch_policy(name, a TorchPolicyGraph instance that uses the specified args """ - if mixins is None: - mixins = [] - if not name.endswith("TorchPolicyGraph"): raise ValueError("Name should match *TorchPolicyGraph", name) - class graph_cls(*mixins, TorchPolicyGraph): + base = TorchPolicyGraph + while mixins: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + class graph_cls(base): def __init__(self, obs_space, action_space, config): config = dict(get_default_config(), **config) self.config = config @@ -106,7 +111,7 @@ def extra_action_out(self, model_out): if extra_action_out_fn: return extra_action_out_fn(self, 
model_out) else: - return TorchPolicyGraph.extra_action_out_fn(self, model_out) + return TorchPolicyGraph.extra_action_out(self, model_out) @override(TorchPolicyGraph) def optimizer(self): From a2281da49f563e92fa0ca5798f8b06d23fa398f2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 22:17:55 -0700 Subject: [PATCH 22/39] Update dynamic_tf_policy_graph.py --- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index d61ecc80ef5f..feba391e3574 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -239,6 +239,7 @@ def fake_array(tensor): elif v.dtype == np.object: continue # can't handle arbitrary objects in TF shape = (None, ) + v.shape[1:] + dtype = np.float32 if v.dtype == np.float64 else v.dtype placeholder = tf.placeholder(v.dtype, shape=shape, name=k) batch_tensors[k] = placeholder From 817a1f9423c27c7409d1aa527214efabfc5c6ca4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 23:01:33 -0700 Subject: [PATCH 23/39] Update tracking_dict.py --- python/ray/rllib/utils/tracking_dict.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py index d43f6e87b0f7..c0f145734e78 100644 --- a/python/ray/rllib/utils/tracking_dict.py +++ b/python/ray/rllib/utils/tracking_dict.py @@ -16,6 +16,7 @@ class UsageTrackingDict(dict): def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) self.accessed_keys = set() + self.intercepted_values = {} self.get_interceptor = None def set_get_interceptor(self, fn): @@ -25,5 +26,7 @@ def __getitem__(self, key): self.accessed_keys.add(key) value = dict.__getitem__(self, key) if self.get_interceptor: - value = self.get_interceptor(value) + if key not in self.intercepted_values: + self.intercepted_values[key] = self.get_interceptor(value) + value = self.intercepted_values[key] return value From a7229cec0b8bff3afe9dfe0dbd5918910289490c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:03:42 -0700 Subject: [PATCH 24/39] wip --- .../agents/a3c/a3c_torch_policy_graph.py | 30 +++++++++---------- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 16 +++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 930bd78094be..f241e25ef4d3 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -13,7 +13,7 @@ from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy -def a3c_torch_loss(policy, batch_tensors): +def actor_critic_loss(policy, batch_tensors): logits, _, values, _ = policy.model({ SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }, []) @@ -32,7 +32,7 @@ def a3c_torch_loss(policy, batch_tensors): return overall_err -def a3c_torch_stats(policy, batch_tensors): +def loss_and_entropy_stats(policy, batch_tensors): return { "policy_entropy": policy.entropy.item(), "policy_loss": policy.pi_err.item(), @@ -40,10 +40,10 @@ def a3c_torch_stats(policy, batch_tensors): } -def postprocess_torch_a3c(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def add_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): 
completed = sample_batch[SampleBatch.DONES][-1] if completed: last_r = 0.0 @@ -53,11 +53,11 @@ def postprocess_torch_a3c(policy, policy.config["lambda"]) -def a3c_extra_action_out(policy, model_out): +def model_value_predictions(policy, model_out): return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} -def a3c_extra_grad_process(policy): +def apply_grad_clipping(policy): info = {} if policy.config["grad_clip"]: total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(), @@ -66,7 +66,7 @@ def a3c_extra_grad_process(policy): return info -def optimizer(policy): +def torch_optimizer(policy): return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) @@ -81,10 +81,10 @@ def _value(self, obs): A3CTorchPolicyGraph = build_torch_policy( name="A3CTorchPolicyGraph", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, - loss_fn=a3c_torch_loss, - stats_fn=a3c_torch_stats, - postprocess_fn=postprocess_torch_a3c, - extra_action_out_fn=a3c_extra_action_out, - extra_grad_process_fn=a3c_extra_grad_process, - optimizer_fn=optimizer, + loss_fn=actor_critic_loss, + stats_fn=loss_and_entropy_stats, + postprocess_fn=add_advantages, + extra_action_out_fn=model_value_predictions, + extra_grad_process_fn=apply_grad_clipping, + optimizer_fn=torch_optimizer, mixins=[ValueNetworkMixin]) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 6e05d7069f90..aab90a034a36 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -104,7 +104,7 @@ def reduce_mean_valid(t): self.loss = loss -def build_ppo_loss(policy, batch_tensors): +def ppo_surrogate_loss(policy, batch_tensors): if policy.model.state_in: max_seq_len = tf.reduce_max(policy.model.seq_lens) mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) @@ -133,7 +133,7 @@ def build_ppo_loss(policy, batch_tensors): return policy.loss_obj.loss -def build_ppo_stats(policy, batch_tensors): +def kl_and_loss_stats(policy, batch_tensors): policy.explained_variance = explained_variance( batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) @@ -151,7 +151,7 @@ def build_ppo_stats(policy, batch_tensors): return stats_fetches -def build_ppo_action_fetches(policy): +def vf_preds_and_logits_fetches(policy): """Adds value function and logits outputs to experience batches.""" return { SampleBatch.VF_PREDS: policy.value_function, @@ -185,7 +185,7 @@ def postprocess_ppo_gae(policy, return batch -def build_ppo_gradients(policy, optimizer, loss): +def clip_gradients(policy, optimizer, loss): if policy.config["grad_clip"] is not None: policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) @@ -274,10 +274,10 @@ def setup_mixins(policy, obs_space, action_space, config): PPOPolicyGraph = build_tf_policy( name="PPOPolicyGraph", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, - loss_fn=build_ppo_loss, - stats_fn=build_ppo_stats, - extra_action_fetches_fn=build_ppo_action_fetches, + loss_fn=ppo_surrogate_loss, + stats_fn=kl_and_loss_stats, + extra_action_fetches_fn=vf_preds_and_logits_fetches, postprocess_fn=postprocess_ppo_gae, - gradients_fn=build_ppo_gradients, + gradients_fn=clip_gradients, before_loss_init=setup_mixins, mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) From 4b9eb6df1138801181a637dd4bc614ffdef9aa97 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:06:05 -0700 Subject: [PATCH 25/39] rename --- 
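For context on the batch_tensors argument that the renamed torch loss and stats functions above receive: it is the UsageTrackingDict built by TorchPolicyGraph._lazy_tensor_dict, which converts numpy columns to torch tensors on first access and, with the tracking_dict change earlier in this series, caches the converted value. A standalone sketch of that behavior, illustrative only and not part of this patch:

import numpy as np
import torch

from ray.rllib.utils.tracking_dict import UsageTrackingDict

batch_tensors = UsageTrackingDict({"obs": np.zeros((4, 3), dtype=np.float32)})
batch_tensors.set_get_interceptor(lambda arr: torch.from_numpy(arr))

first = batch_tensors["obs"]   # numpy array converted to a torch.Tensor here
second = batch_tensors["obs"]  # served from intercepted_values; same object
assert first is second
assert "obs" in batch_tensors.accessed_keys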
python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py | 2 +- python/ray/rllib/agents/pg/pg_policy_graph.py | 2 +- python/ray/rllib/agents/pg/torch_pg_policy_graph.py | 2 +- python/ray/rllib/agents/ppo/ppo_policy_graph.py | 2 +- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 2 +- .../{tf_policy_graph_template.py => tf_policy_template.py} | 0 ...{torch_policy_graph_template.py => torch_policy_template.py} | 0 7 files changed, 5 insertions(+), 5 deletions(-) rename python/ray/rllib/evaluation/{tf_policy_graph_template.py => tf_policy_template.py} (100%) rename python/ray/rllib/evaluation/{torch_policy_graph_template.py => torch_policy_template.py} (100%) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index f241e25ef4d3..807c2327d77e 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -10,7 +10,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy +from ray.rllib.evaluation.torch_policy_template import build_torch_policy def actor_critic_loss(policy, batch_tensors): diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 4bdcb3e1fed5..84e330adccfd 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy +from ray.rllib.evaluation.tf_policy_template import build_tf_policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 040dcadc9742..cbdfc4f5ea98 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy +from ray.rllib.evaluation.torch_policy_template import build_torch_policy def pg_torch_loss(policy, batch_tensors): diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index aab90a034a36..5984eee545ab 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -9,7 +9,7 @@ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule -from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy +from ray.rllib.evaluation.tf_policy_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index feba391e3574..d425770048e3 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ 
b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -240,7 +240,7 @@ def fake_array(tensor): continue # can't handle arbitrary objects in TF shape = (None, ) + v.shape[1:] dtype = np.float32 if v.dtype == np.float64 else v.dtype - placeholder = tf.placeholder(v.dtype, shape=shape, name=k) + placeholder = tf.placeholder(dtype, shape=shape, name=k) batch_tensors[k] = placeholder if log_once("loss_init"): diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_template.py similarity index 100% rename from python/ray/rllib/evaluation/tf_policy_graph_template.py rename to python/ray/rllib/evaluation/tf_policy_template.py diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_template.py similarity index 100% rename from python/ray/rllib/evaluation/torch_policy_graph_template.py rename to python/ray/rllib/evaluation/torch_policy_template.py From 6a1011e65988a0ae5b8b0803e0ad6f9ea654a3c1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:08:12 -0700 Subject: [PATCH 26/39] debug level --- python/ray/rllib/evaluation/tf_policy_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index a33626b2df52..853a6110eaf1 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -173,7 +173,7 @@ def _initialize_loss(self, loss, loss_inputs): self._grads_and_vars) if log_once("loss_used"): - logger.info( + logger.debug( "These tensors were used in the loss_fn:\n\n{}\n".format( summarize(self._loss_input_dict))) From b1cecbeac8b7315fa28cd290689a70215376327b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:17:32 -0700 Subject: [PATCH 27/39] rename policy_graph -> policy in new classes --- .../rllib/agents/a3c/a3c_torch_policy_graph.py | 4 ++-- python/ray/rllib/agents/pg/pg.py | 15 +++++++-------- python/ray/rllib/agents/pg/pg_policy_graph.py | 4 ++-- .../ray/rllib/agents/pg/torch_pg_policy_graph.py | 4 ++-- python/ray/rllib/agents/ppo/ppo.py | 4 ++-- python/ray/rllib/agents/ppo/ppo_policy_graph.py | 4 ++-- python/ray/rllib/agents/trainer_template.py | 16 ++++++++-------- .../ray/rllib/evaluation/tf_policy_template.py | 6 +++--- .../rllib/evaluation/torch_policy_template.py | 6 +++--- 9 files changed, 31 insertions(+), 32 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 807c2327d77e..15301f85b53e 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -78,8 +78,8 @@ def _value(self, obs): return vf.detach().cpu().numpy().squeeze() -A3CTorchPolicyGraph = build_torch_policy( - name="A3CTorchPolicyGraph", +A3CTorchPolicy = build_torch_policy( + name="A3CTorchPolicy", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=actor_critic_loss, stats_fn=loss_and_entropy_stats, diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index a2e81be5ebec..84cb9b9d8917 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import 
PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer # yapf: disable @@ -27,18 +27,17 @@ def make_policy_optimizer(local_ev, remote_evs, config): return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def get_policy_graph(config): +def get_policy_class(config): if config["use_pytorch"]: - from ray.rllib.agents.pg.torch_pg_policy_graph import \ - PGTorchPolicyGraph - return PGTorchPolicyGraph + from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy + return PGTorchPolicy else: - return PGPolicyGraph + return PGTFPolicy PGTrainer = build_trainer( "PG", default_config=DEFAULT_CONFIG, - default_policy_graph=PGPolicyGraph, - get_policy_graph=get_policy_graph, + default_policy=PGTFPolicy, + get_policy_class=get_policy_class, make_policy_optimizer=make_policy_optimizer) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 84e330adccfd..d4f97605e2e5 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -32,8 +32,8 @@ def make_optimizer(policy): return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) -PGPolicyGraph = build_tf_policy( - name="PGPolicyGraph", +PGTFPolicy = build_tf_policy( + name="PGTFPolicy", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=postprocess_advantages, loss_fn=policy_gradient_loss, diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index cbdfc4f5ea98..ea280f3f6c08 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -40,8 +40,8 @@ def make_optimizer(policy): return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) -PGTorchPolicyGraph = build_torch_policy( - name="PGTorchPolicyGraph", +PGTorchPolicy = build_torch_policy( + name="PGTorchPolicy", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=pg_torch_loss, stats_fn=pg_loss_stats, diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 6bf6269a6a80..99a5240b00ea 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -5,7 +5,7 @@ import logging from ray.rllib.agents import with_common_config -from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer @@ -158,7 +158,7 @@ def validate_config(config): PPOTrainer = build_trainer( "PPO", default_config=DEFAULT_CONFIG, - default_policy_graph=PPOPolicyGraph, + default_policy=PPOTFPolicy, make_policy_optimizer=make_optimizer, validate_config=validate_config, after_optimizer_step=update_kl, diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 5984eee545ab..334ca788c936 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -271,8 +271,8 @@ def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) -PPOPolicyGraph = build_tf_policy( - name="PPOPolicyGraph", +PPOTFPolicy = build_tf_policy( + name="PPOTFPolicy", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=ppo_surrogate_loss, 
stats_fn=kl_and_loss_stats, diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 85ed16b56044..643835fc2890 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -9,10 +9,10 @@ @DeveloperAPI def build_trainer(name, default_config, - default_policy_graph, + default_policy, make_policy_optimizer, validate_config=None, - get_policy_graph=None, + get_policy_class=None, before_train_step=None, after_optimizer_step=None, after_train_result=None): @@ -21,12 +21,12 @@ def build_trainer(name, Arguments: name (str): name of the trainer (e.g., "PPO") default_config (dict): the default config dict of the algorithm - default_policy_graph (cls): the default PolicyGraph class to use + default_policy (cls): the default PolicyGraph class to use make_policy_optimizer (func): function that returns a PolicyOptimizer instance given (local_evaluator, remote_evaluators, config) validate_config (func): optional callback that checks a given config for correctness. It may mutate the config as needed. - get_policy_graph (func): optional callback that takes a config and + get_policy_class (func): optional callback that takes a config and returns the policy graph class to override the default with before_train_step (func): optional callback to run before each train() call. It takes the trainer instance as an argument. @@ -48,15 +48,15 @@ def build_trainer(name, class trainer_cls(Trainer): _name = name _default_config = default_config - _policy_graph = default_policy_graph + _policy_graph = default_policy def _init(self, config, env_creator): if validate_config: validate_config(config) - if get_policy_graph is None: - policy_graph = default_policy_graph + if get_policy_class is None: + policy_graph = default_policy else: - policy_graph = get_policy_graph(config) + policy_graph = get_policy_class(config) self.local_evaluator = self.make_local_evaluator( env_creator, policy_graph) self.remote_evaluators = self.make_remote_evaluators( diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 57981b326673..40294ef4139d 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -25,7 +25,7 @@ def build_tf_policy(name, """Helper function for creating a dynamic tf policy graph at runtime. Arguments: - name (str): name of the graph (e.g., "PPOPolicyGraph") + name (str): name of the graph (e.g., "PPOPolicy") get_default_config (func): function that returns the default config to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, @@ -59,8 +59,8 @@ def build_tf_policy(name, a DynamicTFPolicyGraph instance that uses the specified args """ - if not name.endswith("PolicyGraph"): - raise ValueError("Name should match *PolicyGraph", name) + if not name.endswith("TFPolicy"): + raise ValueError("Name should match *TFPolicy", name) base = DynamicTFPolicyGraph while mixins: diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index 71756f11dc85..374ed7395b2a 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -24,7 +24,7 @@ def build_torch_policy(name, """Helper function for creating a torch policy graph at runtime. 
Arguments: - name (str): name of the graph (e.g., "PPOPolicyGraph") + name (str): name of the graph (e.g., "PPOPolicy") get_default_config (func): function that returns the default config to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, @@ -55,8 +55,8 @@ def build_torch_policy(name, a TorchPolicyGraph instance that uses the specified args """ - if not name.endswith("TorchPolicyGraph"): - raise ValueError("Name should match *TorchPolicyGraph", name) + if not name.endswith("TorchPolicy"): + raise ValueError("Name should match *TorchPolicy", name) base = TorchPolicyGraph while mixins: From c857285f5c9f6a8d7f0c81d0a3997fffcda7270a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 11:26:04 -0700 Subject: [PATCH 28/39] fix test --- python/ray/rllib/agents/a3c/a3c.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index 836d9f074999..eb384058de80 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -49,8 +49,8 @@ class A3CTrainer(Trainer): def _init(self, config, env_creator): if config["use_pytorch"]: from ray.rllib.agents.a3c.a3c_torch_policy_graph import \ - A3CTorchPolicyGraph - policy_cls = A3CTorchPolicyGraph + A3CTorchPolicy + policy_cls = A3CTorchPolicy else: policy_cls = self._policy_graph From 64b267e29c3286fb5e0b4b1e4f3cea2cf7d0cc18 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 14:39:22 -0700 Subject: [PATCH 29/39] rename ppo tf policy --- python/ray/rllib/evaluation/policy_evaluator.py | 10 +++++----- python/ray/rllib/examples/multiagent_two_trainers.py | 4 ++-- .../ray/rllib/tests/test_external_multi_agent_env.py | 4 ++-- python/ray/rllib/tests/test_io.py | 4 ++-- python/ray/rllib/tests/test_multi_agent_env.py | 8 ++++---- python/ray/rllib/tests/test_nested_spaces.py | 6 +++--- python/ray/rllib/tests/test_optimizers.py | 6 +++--- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index f6761122156e..48e19dfcb96e 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -65,7 +65,7 @@ class PolicyEvaluator(EvaluatorInterface): >>> # Create a policy evaluator and using it to collect experiences. >>> evaluator = PolicyEvaluator( ... env_creator=lambda _: gym.make("CartPole-v0"), - ... policy_graph=PGPolicyGraph) + ... policy_graph=PGTFPolicy) >>> print(evaluator.sample()) SampleBatch({ "obs": [[...]], "actions": [[...]], "rewards": [[...]], @@ -76,7 +76,7 @@ class PolicyEvaluator(EvaluatorInterface): ... evaluator_cls=PolicyEvaluator, ... evaluator_args={ ... "env_creator": lambda _: gym.make("CartPole-v0"), - ... "policy_graph": PGPolicyGraph, + ... "policy_graph": PGTFPolicy, ... }, ... num_workers=10) >>> for _ in range(10): optimizer.step() @@ -87,12 +87,12 @@ class PolicyEvaluator(EvaluatorInterface): ... policy_graphs={ ... # Use an ensemble of two policies for car agents ... "car_policy1": - ... (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.99}), + ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}), ... "car_policy2": - ... (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.95}), + ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}), ... # Use a single shared policy for all traffic lights ... "traffic_light_policy": - ... (PGPolicyGraph, Box(...), Discrete(...), {}), + ... 
(PGTFPolicy, Box(...), Discrete(...), {}), ... }, ... policy_mapping_fn=lambda agent_id: ... random.choice(["car_policy1", "car_policy2"]) diff --git a/python/ray/rllib/examples/multiagent_two_trainers.py b/python/ray/rllib/examples/multiagent_two_trainers.py index 2c18f2bf4b96..1d4257e4eb9d 100644 --- a/python/ray/rllib/examples/multiagent_two_trainers.py +++ b/python/ray/rllib/examples/multiagent_two_trainers.py @@ -18,7 +18,7 @@ from ray.rllib.agents.dqn.dqn import DQNTrainer from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph from ray.rllib.agents.ppo.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.logger import pretty_print from ray.tune.registry import register_env @@ -39,7 +39,7 @@ # You can also have multiple policy graphs per trainer, but here we just # show one each for PPO and DQN. policy_graphs = { - "ppo_policy": (PPOPolicyGraph, obs_space, act_space, {}), + "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}), "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}), } diff --git a/python/ray/rllib/tests/test_external_multi_agent_env.py b/python/ray/rllib/tests/test_external_multi_agent_env.py index e5e182b38655..c01e6fa0b7ae 100644 --- a/python/ray/rllib/tests/test_external_multi_agent_env.py +++ b/python/ray/rllib/tests/test_external_multi_agent_env.py @@ -8,7 +8,7 @@ import unittest import ray -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv @@ -67,7 +67,7 @@ def testTrainExternalMultiCartpoleManyPolicies(self): obs_space = single_env.observation_space policies = {} for i in range(20): - policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space, + policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {}) policy_ids = list(policies.keys()) ev = PolicyEvaluator( diff --git a/python/ray/rllib/tests/test_io.py b/python/ray/rllib/tests/test_io.py index 9f92c9107c4e..0706be1019cc 100644 --- a/python/ray/rllib/tests/test_io.py +++ b/python/ray/rllib/tests/test_io.py @@ -15,7 +15,7 @@ import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.offline import IOContext, JsonWriter, JsonReader from ray.rllib.offline.json_writer import _to_json @@ -159,7 +159,7 @@ def testMultiAgent(self): def gen_policy(): obs_space = single_env.observation_space act_space = single_env.action_space - return (PGPolicyGraph, obs_space, act_space, {}) + return (PGTFPolicy, obs_space, act_space, {}) pg = PGTrainer( env="multi_cartpole", diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index eccb9aa82fb8..72130712d555 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -8,7 +8,7 @@ import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph from 
ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) @@ -470,7 +470,7 @@ def get_initial_state(self): self.assertEqual(batch["state_out_0"][1], h) def testReturningModelBasedRolloutsData(self): - class ModelBasedPolicyGraph(PGPolicyGraph): + class ModelBasedPolicyGraph(PGTFPolicy): def compute_actions(self, obs_batch, state_batches, @@ -584,7 +584,7 @@ def _testWithOptimizer(self, optimizer_cls): } else: policies = { - "p1": (PGPolicyGraph, obs_space, act_space, {}), + "p1": (PGTFPolicy, obs_space, act_space, {}), "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config), } ev = PolicyEvaluator( @@ -640,7 +640,7 @@ def testTrainMultiCartpoleManyPolicies(self): obs_space = env.observation_space policies = {} for i in range(20): - policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space, + policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {}) policy_ids = list(policies.keys()) ev = PolicyEvaluator( diff --git a/python/ray/rllib/tests/test_nested_spaces.py b/python/ray/rllib/tests/test_nested_spaces.py index e4285e42287c..b70bd9a2908e 100644 --- a/python/ray/rllib/tests/test_nested_spaces.py +++ b/python/ray/rllib/tests/test_nested_spaces.py @@ -12,7 +12,7 @@ import ray from ray.rllib.agents.a3c import A2CTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.vector_env import VectorEnv @@ -333,10 +333,10 @@ def testMultiAgentComplexSpaces(self): "multiagent": { "policy_graphs": { "tuple_policy": ( - PGPolicyGraph, TUPLE_SPACE, act_space, + PGTFPolicy, TUPLE_SPACE, act_space, {"model": {"custom_model": "tuple_spy"}}), "dict_policy": ( - PGPolicyGraph, DICT_SPACE, act_space, + PGTFPolicy, DICT_SPACE, act_space, {"model": {"custom_model": "dict_spy"}}), }, "policy_mapping_fn": lambda a: { diff --git a/python/ray/rllib/tests/test_optimizers.py b/python/ray/rllib/tests/test_optimizers.py index 9c9e6b56b426..5436baeafa90 100644 --- a/python/ray/rllib/tests/test_optimizers.py +++ b/python/ray/rllib/tests/test_optimizers.py @@ -9,7 +9,7 @@ import ray from ray.rllib.agents.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.optimizers import AsyncGradientsOptimizer, AsyncSamplesOptimizer @@ -240,12 +240,12 @@ def make_sess(): local = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=PPOPolicyGraph, + policy_graph=PPOTFPolicy, tf_session_creator=make_sess) remotes = [ PolicyEvaluator.as_remote().remote( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=PPOPolicyGraph, + policy_graph=PPOTFPolicy, tf_session_creator=make_sess) ] return local, remotes From 52f06e9dce3612c830fe99a59fd1cebcfbe83680 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 16:23:22 -0700 Subject: [PATCH 30/39] port appo too --- python/ray/rllib/agents/ppo/appo.py | 6 +- .../ray/rllib/agents/ppo/appo_policy_graph.py | 549 +++++++----------- .../evaluation/dynamic_tf_policy_graph.py | 29 +- .../rllib/evaluation/tf_policy_template.py | 11 +- 4 files changed, 259 insertions(+), 336 deletions(-) diff --git a/python/ray/rllib/agents/ppo/appo.py 
b/python/ray/rllib/agents/ppo/appo.py index ac3251775d52..b32531dd7d5c 100644 --- a/python/ray/rllib/agents/ppo/appo.py +++ b/python/ray/rllib/agents/ppo/appo.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOPolicyGraph +from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOTFPolicy from ray.rllib.agents.trainer import with_base_config from ray.rllib.agents import impala from ray.rllib.utils.annotations import override @@ -57,8 +57,8 @@ class APPOTrainer(impala.ImpalaTrainer): _name = "APPO" _default_config = DEFAULT_CONFIG - _policy_graph = AsyncPPOPolicyGraph + _policy_graph = AsyncPPOTFPolicy @override(impala.ImpalaTrainer) def _get_policy_graph(self): - return AsyncPPOPolicyGraph + return AsyncPPOTFPolicy diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index caaaf512bcb1..b2ff83ee2e85 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -12,14 +12,11 @@ import ray from ray.rllib.agents.impala import vtrace -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ - LearningRateSchedule -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.evaluation.tf_policy_template import build_tf_policy +from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule from ray.rllib.utils.explained_variance import explained_variance -from ray.rllib.models.action_dist import MultiCategorical from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.utils import try_import_tf @@ -27,6 +24,8 @@ logger = logging.getLogger(__name__) +BEHAVIOUR_LOGITS = "behaviour_logits" + class PPOSurrogateLoss(object): """Loss used when V-trace is disabled. 
@@ -163,333 +162,233 @@ def __init__(self, self.entropy * entropy_coeff) -class APPOPostprocessing(object): - """Adds the policy logits, VF preds, and advantages to the trajectory.""" - - @override(TFPolicyGraph) - def extra_compute_action_fetches(self): - out = {"behaviour_logits": self.model.outputs} - if not self.config["vtrace"]: - out["vf_preds"] = self.value_function - return dict(TFPolicyGraph.extra_compute_action_fetches(self), **out) - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - if not self.config["vtrace"]: - completed = sample_batch["dones"][-1] - if completed: - last_r = 0.0 - else: - next_state = [] - for i in range(len(self.model.state_in)): - next_state.append( - [sample_batch["state_out_{}".format(i)][-1]]) - last_r = self.value(sample_batch["new_obs"][-1], *next_state) - batch = compute_advantages( - sample_batch, - last_r, - self.config["gamma"], - self.config["lambda"], - use_gae=self.config["use_gae"]) - else: - batch = sample_batch - del batch.data["new_obs"] # not used, so save some bandwidth - return batch - - -class AsyncPPOPolicyGraph(LearningRateSchedule, APPOPostprocessing, - TFPolicyGraph): - def __init__(self, - observation_space, - action_space, - config, - existing_inputs=None): - config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) - assert config["batch_mode"] == "truncate_episodes", \ - "Must use `truncate_episodes` batch mode with V-trace." - self.config = config - self.sess = tf.get_default_session() - self.grads = None - - if isinstance(action_space, gym.spaces.Discrete): - is_multidiscrete = False - output_hidden_shape = [action_space.n] - elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): - is_multidiscrete = True - output_hidden_shape = action_space.nvec.astype(np.int32) - else: - is_multidiscrete = False - output_hidden_shape = 1 - - # Policy network model - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - - # Create input placeholders - if existing_inputs: - if self.config["vtrace"]: - actions, dones, behaviour_logits, rewards, observations, \ - prev_actions, prev_rewards = existing_inputs[:7] - existing_state_in = existing_inputs[7:-1] - existing_seq_lens = existing_inputs[-1] - else: - actions, dones, behaviour_logits, rewards, observations, \ - prev_actions, prev_rewards, adv_ph, value_targets = \ - existing_inputs[:9] - existing_state_in = existing_inputs[9:-1] - existing_seq_lens = existing_inputs[-1] +def _make_time_major(policy, tensor, drop_last=False): + """Swaps batch and trajectory axis. + Args: + policy: Policy reference + tensor: A tensor or list of tensors to reshape. + drop_last: A bool indicating whether to drop the last + trajectory item. + Returns: + res: A tensor with swapped axes or a list of tensors with + swapped axes. + """ + if isinstance(tensor, list): + return [_make_time_major(policy, t, drop_last) for t in tensor] + + if policy.model.state_init: + B = tf.shape(policy.model.seq_lens)[0] + T = tf.shape(tensor)[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. 
TODO(ekl) this is kind of a hack + T = policy.config["sample_batch_size"] + B = tf.shape(tensor)[0] // T + rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) + + # swap B and T axes + res = tf.transpose( + rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) + + if drop_last: + return res[:-1] + return res + + +def build_appo_surrogate_loss(policy, batch_tensors): + if isinstance(policy.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [policy.action_space.n] + elif isinstance(policy.action_space, + gym.spaces.multi_discrete.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = policy.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def make_time_major(*args, **kw): + return _make_time_major(policy, *args, **kw) + + actions = batch_tensors[SampleBatch.ACTIONS] + dones = batch_tensors[SampleBatch.DONES] + rewards = batch_tensors[SampleBatch.REWARDS] + behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] + unpacked_behaviour_logits = tf.split( + behaviour_logits, output_hidden_shape, axis=1) + unpacked_outputs = tf.split( + policy.model.outputs, output_hidden_shape, axis=1) + prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ + behaviour_logits + action_dist = policy.action_dist + prev_action_dist = policy.dist_class(prev_dist_inputs) + values = policy.value_function + + if policy.model.state_in: + max_seq_len = tf.reduce_max(policy.model.seq_lens) - 1 + mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) + mask = tf.reshape(mask, [-1]) + else: + mask = tf.ones_like(rewards) + + if policy.config["vtrace"]: + logger.info("Using V-Trace surrogate loss (vtrace=True)") + + # Prepare actions for loss + loss_actions = actions if is_multidiscrete else tf.expand_dims( + actions, axis=1) + + policy.loss = VTraceSurrogateLoss( + actions=make_time_major(loss_actions, drop_last=True), + prev_actions_logp=make_time_major( + prev_action_dist.logp(actions), drop_last=True), + actions_logp=make_time_major( + action_dist.logp(actions), drop_last=True), + action_kl=prev_action_dist.kl(action_dist), + actions_entropy=make_time_major( + action_dist.entropy(), drop_last=True), + dones=make_time_major(dones, drop_last=True), + behaviour_logits=make_time_major( + unpacked_behaviour_logits, drop_last=True), + target_logits=make_time_major(unpacked_outputs, drop_last=True), + discount=policy.config["gamma"], + rewards=make_time_major(rewards, drop_last=True), + values=make_time_major(values, drop_last=True), + bootstrap_value=make_time_major(values)[-1], + dist_class=policy.dist_class, + valid_mask=make_time_major(mask, drop_last=True), + vf_loss_coeff=policy.config["vf_loss_coeff"], + entropy_coeff=policy.config["entropy_coeff"], + clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"], + clip_pg_rho_threshold=policy.config[ + "vtrace_clip_pg_rho_threshold"], + clip_param=policy.config["clip_param"]) + else: + logger.info("Using PPO surrogate loss (vtrace=False)") + policy.loss = PPOSurrogateLoss( + prev_actions_logp=make_time_major(prev_action_dist.logp(actions)), + actions_logp=make_time_major(action_dist.logp(actions)), + action_kl=prev_action_dist.kl(action_dist), + actions_entropy=make_time_major(action_dist.entropy()), + values=make_time_major(values), + valid_mask=make_time_major(mask), + advantages=make_time_major( + batch_tensors[Postprocessing.ADVANTAGES]), + value_targets=make_time_major( + batch_tensors[Postprocessing.VALUE_TARGETS]), + 
vf_loss_coeff=policy.config["vf_loss_coeff"], + entropy_coeff=policy.config["entropy_coeff"], + clip_param=policy.config["clip_param"]) + + return policy.loss.total_loss + + +def stats(policy, batch_tensors): + values_batched = _make_time_major( + policy, policy.value_function, drop_last=policy.config["vtrace"]) + + return { + "cur_lr": tf.cast(policy.cur_lr, tf.float64), + "policy_loss": policy.loss.pi_loss, + "entropy": policy.loss.entropy, + "var_gnorm": tf.global_norm(policy.var_list), + "vf_loss": policy.loss.vf_loss, + "vf_explained_var": explained_variance( + tf.reshape(policy.loss.value_targets, [-1]), + tf.reshape(values_batched, [-1])), + } + + +def grad_stats(policy, grads): + return { + "grad_gnorm": tf.global_norm(grads), + } + + +def postprocess_trajectory(policy, + sample_batch, + other_agent_batches=None, + episode=None): + if not policy.config["vtrace"]: + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 else: - actions = ModelCatalog.get_action_placeholder(action_space) - dones = tf.placeholder(tf.bool, [None], name="dones") - rewards = tf.placeholder(tf.float32, [None], name="rewards") - behaviour_logits = tf.placeholder( - tf.float32, [None, logit_dim], name="behaviour_logits") - observations = tf.placeholder( - tf.float32, [None] + list(observation_space.shape)) - existing_state_in = None - existing_seq_lens = None - - if not self.config["vtrace"]: - adv_ph = tf.placeholder( - tf.float32, name="advantages", shape=(None, )) - value_targets = tf.placeholder( - tf.float32, name="value_targets", shape=(None, )) - self.observations = observations - - # Unpack behaviour logits - unpacked_behaviour_logits = tf.split( - behaviour_logits, output_hidden_shape, axis=1) - - # Setup the policy - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") - self.model = ModelCatalog.get_model( - { - "obs": observations, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, - observation_space, - action_space, - logit_dim, - self.config["model"], - state_in=existing_state_in, - seq_lens=existing_seq_lens) - unpacked_outputs = tf.split( - self.model.outputs, output_hidden_shape, axis=1) - - dist_inputs = unpacked_outputs if is_multidiscrete else \ - self.model.outputs - prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ - behaviour_logits - - action_dist = dist_class(dist_inputs) - prev_action_dist = dist_class(prev_dist_inputs) - - values = self.model.value_function() - self.value_function = values + next_state = [] + for i in range(len(policy.model.state_in)): + next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + last_r = policy.value(sample_batch["new_obs"][-1], *next_state) + batch = compute_advantages( + sample_batch, + last_r, + policy.config["gamma"], + policy.config["lambda"], + use_gae=policy.config["use_gae"]) + else: + batch = sample_batch + del batch.data["new_obs"] # not used, so save some bandwidth + return batch + + +def add_values_and_logits(policy): + out = {BEHAVIOUR_LOGITS: policy.model.outputs} + if not policy.config["vtrace"]: + out[SampleBatch.VF_PREDS] = policy.value_function + return out + + +def validate_config(policy, obs_space, action_space, config): + assert config["batch_mode"] == "truncate_episodes", \ + "Must use `truncate_episodes` batch mode with V-trace." 
+ + +def optimizer(policy): + if policy.config["opt_type"] == "adam": + return tf.train.AdamOptimizer(policy.cur_lr) + else: + return tf.train.RMSPropOptimizer(policy.cur_lr, policy.config["decay"], + policy.config["momentum"], + policy.config["epsilon"]) + + +def gradients(policy, optimizer, loss): + grads = tf.gradients(loss, policy.var_list) + policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) + clipped_grads = list(zip(policy.grads, policy.var_list)) + return clipped_grads + + +class ValueNetworkMixin(object): + def __init__(self): + self.value_function = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) - def make_time_major(tensor, drop_last=False): - """Swaps batch and trajectory axis. - Args: - tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last - trajectory item. - Returns: - res: A tensor with swapped axes or a list of tensors with - swapped axes. - """ - if isinstance(tensor, list): - return [make_time_major(t, drop_last) for t in tensor] - - if self.model.state_init: - B = tf.shape(self.model.seq_lens)[0] - T = tf.shape(tensor)[0] // B - else: - # Important: chop the tensor into batches at known episode cut - # boundaries. TODO(ekl) this is kind of a hack - T = self.config["sample_batch_size"] - B = tf.shape(tensor)[0] // T - rs = tf.reshape(tensor, - tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) - - # swap B and T axes - res = tf.transpose( - rs, - [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - - if drop_last: - return res[:-1] - return res - - if self.model.state_in: - max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 - mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) - mask = tf.reshape(mask, [-1]) - else: - mask = tf.ones_like(rewards) - - # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
- if self.config["vtrace"]: - logger.info("Using V-Trace surrogate loss (vtrace=True)") - - # Prepare actions for loss - loss_actions = actions if is_multidiscrete else tf.expand_dims( - actions, axis=1) - - self.loss = VTraceSurrogateLoss( - actions=make_time_major(loss_actions, drop_last=True), - prev_actions_logp=make_time_major( - prev_action_dist.logp(actions), drop_last=True), - actions_logp=make_time_major( - action_dist.logp(actions), drop_last=True), - action_kl=prev_action_dist.kl(action_dist), - actions_entropy=make_time_major( - action_dist.entropy(), drop_last=True), - dones=make_time_major(dones, drop_last=True), - behaviour_logits=make_time_major( - unpacked_behaviour_logits, drop_last=True), - target_logits=make_time_major( - unpacked_outputs, drop_last=True), - discount=config["gamma"], - rewards=make_time_major(rewards, drop_last=True), - values=make_time_major(values, drop_last=True), - bootstrap_value=make_time_major(values)[-1], - dist_class=dist_class, - valid_mask=make_time_major(mask, drop_last=True), - vf_loss_coeff=self.config["vf_loss_coeff"], - entropy_coeff=self.config["entropy_coeff"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - clip_pg_rho_threshold=self.config[ - "vtrace_clip_pg_rho_threshold"], - clip_param=self.config["clip_param"]) - else: - logger.info("Using PPO surrogate loss (vtrace=False)") - self.loss = PPOSurrogateLoss( - prev_actions_logp=make_time_major( - prev_action_dist.logp(actions)), - actions_logp=make_time_major(action_dist.logp(actions)), - action_kl=prev_action_dist.kl(action_dist), - actions_entropy=make_time_major(action_dist.entropy()), - values=make_time_major(values), - valid_mask=make_time_major(mask), - advantages=make_time_major(adv_ph), - value_targets=make_time_major(value_targets), - vf_loss_coeff=self.config["vf_loss_coeff"], - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"]) - - # KL divergence between worker and learner logits for debugging - model_dist = MultiCategorical(unpacked_outputs) - behaviour_dist = MultiCategorical(unpacked_behaviour_logits) - - kls = model_dist.kl(behaviour_dist) - if len(kls) > 1: - self.KL_stats = {} - - for i, kl in enumerate(kls): - self.KL_stats.update({ - "mean_KL_{}".format(i): tf.reduce_mean(kl), - "max_KL_{}".format(i): tf.reduce_max(kl), - }) - else: - self.KL_stats = { - "mean_KL": tf.reduce_mean(kls[0]), - "max_KL": tf.reduce_max(kls[0]), - } - - # Initialize TFPolicyGraph - loss_in = [ - ("actions", actions), - ("dones", dones), - ("behaviour_logits", behaviour_logits), - ("rewards", rewards), - ("obs", observations), - ("prev_actions", prev_actions), - ("prev_rewards", prev_rewards), - ] - if not self.config["vtrace"]: - loss_in.append(("advantages", adv_ph)) - loss_in.append(("value_targets", value_targets)) - LearningRateSchedule.__init__(self, self.config["lr"], - self.config["lr_schedule"]) - TFPolicyGraph.__init__( - self, - observation_space, - action_space, - self.sess, - obs_input=observations, - action_sampler=action_dist.sample(), - action_prob=action_dist.sampled_action_prob(), - loss=self.loss.total_loss, - model=self.model, - loss_inputs=loss_in, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, - seq_lens=self.model.seq_lens, - max_seq_len=self.config["model"]["max_seq_len"], - batch_divisibility_req=self.config["sample_batch_size"]) - - self.sess.run(tf.global_variables_initializer()) - - values_batched = make_time_major( 
- values, drop_last=self.config["vtrace"]) - self.stats_fetches = { - LEARNER_STATS_KEY: dict({ - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "policy_loss": self.loss.pi_loss, - "entropy": self.loss.entropy, - "grad_gnorm": tf.global_norm(self._grads), - "var_gnorm": tf.global_norm(self.var_list), - "vf_loss": self.loss.vf_loss, - "vf_explained_var": explained_variance( - tf.reshape(self.loss.value_targets, [-1]), - tf.reshape(values_batched, [-1])), - }, **self.KL_stats), - } - - def optimizer(self): - if self.config["opt_type"] == "adam": - return tf.train.AdamOptimizer(self.cur_lr) - else: - return tf.train.RMSPropOptimizer(self.cur_lr, self.config["decay"], - self.config["momentum"], - self.config["epsilon"]) - - def gradients(self, optimizer, loss): - grads = tf.gradients(loss, self.var_list) - self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) - clipped_grads = list(zip(self.grads, self.var_list)) - return clipped_grads - - def extra_compute_grad_fetches(self): - return self.stats_fetches - def value(self, ob, *args): - feed_dict = {self.observations: [ob], self.model.seq_lens: [1]} + feed_dict = {self._obs_input: [ob], self.model.seq_lens: [1]} assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): feed_dict[k] = v - vf = self.sess.run(self.value_function, feed_dict) + vf = self._sess.run(self.value_function, feed_dict) return vf[0] - def get_initial_state(self): - return self.model.state_init - def copy(self, existing_inputs): - return AsyncPPOPolicyGraph( - self.observation_space, - self.action_space, - self.config, - existing_inputs=existing_inputs) +def setup_mixins(policy, obs_space, action_space, config): + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + ValueNetworkMixin.__init__(policy) + + +AsyncPPOTFPolicy = build_tf_policy( + name="AsyncPPOTFPolicy", + get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG, + loss_fn=build_appo_surrogate_loss, + stats_fn=stats, + grad_stats_fn=grad_stats, + postprocess_fn=postprocess_trajectory, + optimizer_fn=optimizer, + extra_action_fetches_fn=add_values_and_logits, + before_init=validate_config, + before_loss_init=setup_mixins, + mixins=[LearningRateSchedule, ValueNetworkMixin], + get_batch_divisibility_req=lambda p: p.config["sample_batch_size"]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index d425770048e3..64f8f747d984 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -37,9 +37,11 @@ def __init__(self, config, loss_fn, stats_fn=None, + grad_stats_fn=None, before_loss_init=None, make_action_sampler=None, - existing_inputs=None): + existing_inputs=None, + get_batch_divisibility_req=None): """Initialize a dynamic TF policy graph. 
Arguments: @@ -50,6 +52,8 @@ def __init__(self, graph, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors + grad_stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and loss gradient tensors before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ make_action_sampler (func): optional function that returns a @@ -59,10 +63,13 @@ def __init__(self, existing_inputs (OrderedDict): when copying a policy graph, this specifies an existing dict of placeholders to use instead of defining new ones + get_batch_divisibility_req (func): optional function that returns + the divisibility requirement for sample batches """ self.config = config self._loss_fn = loss_fn self._stats_fn = stats_fn + self._grad_stats_fn = grad_stats_fn # Setup standard placeholders if existing_inputs is not None: @@ -90,11 +97,12 @@ def __init__(self, assert not existing_inputs, \ "Cloning not supported with custom action sampler" self.model = None + self.dist_class = None self.action_dist = None action_sampler, action_prob = make_action_sampler( self, input_dict, obs_space, action_space, config) else: - dist_class, logit_dim = ModelCatalog.get_action_dist( + self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if existing_inputs: existing_state_in = [ @@ -116,12 +124,16 @@ def __init__(self, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) - self.action_dist = dist_class(self.model.outputs) + self.action_dist = self.dist_class(self.model.outputs) action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() # Phase 1 init sess = tf.get_default_session() + if get_batch_divisibility_req: + batch_divisibility_req = get_batch_divisibility_req(self) + else: + batch_divisibility_req = 1 TFPolicyGraph.__init__( self, obs_space, @@ -138,7 +150,8 @@ def __init__(self, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model and self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"]) + max_seq_len=config["model"]["max_seq_len"], + batch_divisibility_req=batch_divisibility_req) # Phase 2 init before_loss_init(self, obs_space, action_space, config) @@ -184,6 +197,9 @@ def copy(self, existing_inputs): TFPolicyGraph._initialize_loss( instance, loss, [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) + if instance._grad_stats_fn: + instance._stats_fetches.update( + instance._grad_stats_fn(instance, instance._grads)) return instance @override(PolicyGraph) @@ -205,7 +221,7 @@ def fake_array(tensor): SampleBatch.CUR_OBS: fake_array(self._obs_input), SampleBatch.NEXT_OBS: fake_array(self._obs_input), SampleBatch.ACTIONS: fake_array(self._sampler), - SampleBatch.REWARDS: np.array([0], dtype=np.int32), + SampleBatch.REWARDS: np.array([0], dtype=np.float32), SampleBatch.DONES: np.array([False], dtype=np.bool), } state_init = self.get_initial_state() @@ -253,6 +269,7 @@ def fake_array(tensor): self._stats_fetches.update(self._stats_fn(self, batch_tensors)) for k in sorted(batch_tensors.accessed_keys): loss_inputs.append((k, batch_tensors[k])) - TFPolicyGraph._initialize_loss(self, loss, loss_inputs) + if self._grad_stats_fn: + self._stats_fetches.update(self._grad_stats_fn(self, self._grads)) self._sess.run(tf.global_variables_initializer()) diff --git 
a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 40294ef4139d..888e82b316eb 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -13,6 +13,7 @@ def build_tf_policy(name, get_default_config, loss_fn, stats_fn=None, + grad_stats_fn=None, extra_action_fetches_fn=None, postprocess_fn=None, optimizer_fn=None, @@ -21,7 +22,8 @@ def build_tf_policy(name, before_loss_init=None, after_init=None, make_action_sampler=None, - mixins=None): + mixins=None, + get_batch_divisibility_req=None): """Helper function for creating a dynamic tf policy graph at runtime. Arguments: @@ -32,6 +34,8 @@ def build_tf_policy(name, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors + grad_stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and loss gradient tensors extra_action_fetches_fn (func): optional function that returns a dict of TF fetches given the policy graph object postprocess_fn (func): optional experience postprocessing function @@ -54,6 +58,8 @@ def build_tf_policy(name, mixins (list): list of any class mixins for the returned policy class. These mixins will be applied in order and will have higher precedence than the DynamicTFPolicyGraph class + get_batch_divisibility_req (func): optional function that returns + the divisibility requirement for sample batches Returns: a DynamicTFPolicyGraph instance that uses the specified args @@ -96,7 +102,8 @@ def before_loss_init_wrapper(policy, obs_space, action_space, action_space, config, loss_fn, - stats_fn, + stats_fn=stats_fn, + grad_stats_fn=grad_stats_fn, before_loss_init=before_loss_init_wrapper, existing_inputs=existing_inputs) From 3f64d4faa70ecbbca49522b5e7b0a377c98e1ceb Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 16:52:00 -0700 Subject: [PATCH 31/39] forgot grads --- python/ray/rllib/agents/ppo/appo_policy_graph.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index b2ff83ee2e85..e78eac98662b 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -342,7 +342,7 @@ def validate_config(policy, obs_space, action_space, config): "Must use `truncate_episodes` batch mode with V-trace." 
-def optimizer(policy): +def choose_optimizer(policy): if policy.config["opt_type"] == "adam": return tf.train.AdamOptimizer(policy.cur_lr) else: @@ -351,7 +351,7 @@ def optimizer(policy): policy.config["epsilon"]) -def gradients(policy, optimizer, loss): +def clip_gradients(policy, optimizer, loss): grads = tf.gradients(loss, policy.var_list) policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) clipped_grads = list(zip(policy.grads, policy.var_list)) @@ -386,7 +386,8 @@ def setup_mixins(policy, obs_space, action_space, config): stats_fn=stats, grad_stats_fn=grad_stats, postprocess_fn=postprocess_trajectory, - optimizer_fn=optimizer, + optimizer_fn=choose_optimizer, + gradients_fn=clip_gradients, extra_action_fetches_fn=add_values_and_logits, before_init=validate_config, before_loss_init=setup_mixins, From 0c6a22c84ed7a805db8285eb2b493e7cf0388fa9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:22:29 -0700 Subject: [PATCH 32/39] default policy optimizer --- python/ray/rllib/agents/pg/pg.py | 10 +--------- python/ray/rllib/agents/trainer_template.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 84cb9b9d8917..a28c7f73e3a0 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -5,7 +5,6 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy -from ray.rllib.optimizers import SyncSamplesOptimizer # yapf: disable # __sphinx_doc_begin__ @@ -21,12 +20,6 @@ # yapf: enable -def make_policy_optimizer(local_ev, remote_evs, config): - optimizer_config = dict(config["optimizer"], - **{"train_batch_size": config["train_batch_size"]}) - return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) - - def get_policy_class(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy @@ -39,5 +32,4 @@ def get_policy_class(config): "PG", default_config=DEFAULT_CONFIG, default_policy=PGTFPolicy, - get_policy_class=get_policy_class, - make_policy_optimizer=make_policy_optimizer) + get_policy_class=get_policy_class) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 643835fc2890..d27cf63f43f8 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -3,6 +3,7 @@ from __future__ import print_function from ray.rllib.agents.trainer import Trainer +from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils.annotations import override, DeveloperAPI @@ -10,7 +11,7 @@ def build_trainer(name, default_config, default_policy, - make_policy_optimizer, + make_policy_optimizer=None, validate_config=None, get_policy_class=None, before_train_step=None, @@ -22,8 +23,9 @@ def build_trainer(name, name (str): name of the trainer (e.g., "PPO") default_config (dict): the default config dict of the algorithm default_policy (cls): the default PolicyGraph class to use - make_policy_optimizer (func): function that returns a PolicyOptimizer - instance given (local_evaluator, remote_evaluators, config) + make_policy_optimizer (func): optional function that returns a + PolicyOptimizer instance given + (local_evaluator, remote_evaluators, config) validate_config (func): optional callback that checks a given config for correctness. 
It may mutate the config as needed. get_policy_class (func): optional callback that takes a config and @@ -64,6 +66,13 @@ def _init(self, config, env_creator): if make_policy_optimizer: self.optimizer = make_policy_optimizer( self.local_evaluator, self.remote_evaluators, config) + else: + optimizer_config = dict( + config["optimizer"], + **{"train_batch_size": config["train_batch_size"]}) + self.optimizer = SyncSamplesOptimizer(self.local_evaluator, + self.remote_evaluators, + **optimizer_config) @override(Trainer) def _train(self): From c7e0320af9cf84772930c2110862a3580e2b196b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:25:43 -0700 Subject: [PATCH 33/39] make default config optional --- python/ray/rllib/agents/pg/pg.py | 2 +- python/ray/rllib/agents/ppo/ppo.py | 2 +- python/ray/rllib/agents/trainer_template.py | 7 ++++--- python/ray/rllib/evaluation/tf_policy_template.py | 9 +++++---- python/ray/rllib/evaluation/torch_policy_template.py | 9 +++++---- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index a28c7f73e3a0..ffbb899d1b9e 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -29,7 +29,7 @@ def get_policy_class(config): PGTrainer = build_trainer( - "PG", + name="PG", default_config=DEFAULT_CONFIG, default_policy=PGTFPolicy, get_policy_class=get_policy_class) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 99a5240b00ea..d3f5abdaa95c 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -156,7 +156,7 @@ def validate_config(config): PPOTrainer = build_trainer( - "PPO", + name="PPO", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, make_policy_optimizer=make_optimizer, diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index d27cf63f43f8..618bc3b30ace 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -9,8 +9,8 @@ @DeveloperAPI def build_trainer(name, - default_config, default_policy, + default_config=None, make_policy_optimizer=None, validate_config=None, get_policy_class=None, @@ -21,8 +21,9 @@ def build_trainer(name, Arguments: name (str): name of the trainer (e.g., "PPO") - default_config (dict): the default config dict of the algorithm default_policy (cls): the default PolicyGraph class to use + default_config (dict): the default config dict of the algorithm, + otherwises uses the Trainer default config make_policy_optimizer (func): optional function that returns a PolicyOptimizer instance given (local_evaluator, remote_evaluators, config) @@ -49,7 +50,7 @@ def build_trainer(name, class trainer_cls(Trainer): _name = name - _default_config = default_config + _default_config = default_config or Trainer.COMMON_CONFIG _policy_graph = default_policy def _init(self, config, env_creator): diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 888e82b316eb..138533d589d0 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -10,8 +10,8 @@ @DeveloperAPI def build_tf_policy(name, - get_default_config, loss_fn, + get_default_config=None, stats_fn=None, grad_stats_fn=None, extra_action_fetches_fn=None, @@ -28,10 +28,10 @@ def build_tf_policy(name, Arguments: name (str): name of the graph (e.g., "PPOPolicy") - 
get_default_config (func): function that returns the default config - to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, and dict of experience tensor placeholders + get_default_config (func): optional function that returns the default + config to merge with any overrides stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors grad_stats_fn (func): optional function that returns a dict of @@ -82,7 +82,8 @@ def __init__(self, action_space, config, existing_inputs=None): - config = dict(get_default_config(), **config) + if get_default_config: + config = dict(get_default_config(), **config) if before_init: before_init(self, obs_space, action_space, config) diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index 374ed7395b2a..a666ed83d752 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -10,8 +10,8 @@ @DeveloperAPI def build_torch_policy(name, - get_default_config, loss_fn, + get_default_config=None, stats_fn=None, postprocess_fn=None, extra_action_out_fn=None, @@ -25,10 +25,10 @@ def build_torch_policy(name, Arguments: name (str): name of the graph (e.g., "PPOPolicy") - get_default_config (func): function that returns the default config - to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, and dict of experience tensor placeholders + get_default_config (func): optional function that returns the default + config to merge with any overrides stats_fn (func): optional function that returns a dict of values given the policy graph and batch input tensors postprocess_fn (func): optional experience postprocessing function @@ -68,7 +68,8 @@ class new_base(mixins.pop(), base): class graph_cls(base): def __init__(self, obs_space, action_space, config): - config = dict(get_default_config(), **config) + if get_default_config: + config = dict(get_default_config(), **config) self.config = config if before_init: From 46523a0b9ce1d4ff9be0b2232422d2545e1a18a2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:34:44 -0700 Subject: [PATCH 34/39] add config to optimizer --- .../ray/rllib/agents/a3c/a3c_torch_policy_graph.py | 4 ++-- python/ray/rllib/agents/pg/pg_policy_graph.py | 4 ++-- .../ray/rllib/agents/pg/torch_pg_policy_graph.py | 4 ++-- python/ray/rllib/agents/ppo/appo_policy_graph.py | 7 +++---- python/ray/rllib/evaluation/tf_policy_graph.py | 2 +- python/ray/rllib/evaluation/tf_policy_template.py | 14 +++++++------- .../ray/rllib/evaluation/torch_policy_template.py | 10 +++++----- 7 files changed, 22 insertions(+), 23 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 15301f85b53e..fa6f857f9eca 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -66,8 +66,8 @@ def apply_grad_clipping(policy): return info -def torch_optimizer(policy): - return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) +def torch_optimizer(policy, config): + return torch.optim.Adam(policy.model.parameters(), lr=config["lr"]) class ValueNetworkMixin(object): diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index d4f97605e2e5..666c62a13f2f 100644 --- 
a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -28,8 +28,8 @@ def postprocess_advantages(policy, sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def make_optimizer(policy): - return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) +def make_optimizer(policy, config): + return tf.train.AdamOptimizer(learning_rate=config["lr"]) PGTFPolicy = build_tf_policy( diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index ea280f3f6c08..6a929536a858 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -36,8 +36,8 @@ def pg_loss_stats(policy, batch_tensors): return {"policy_loss": policy.pi_err.item()} -def make_optimizer(policy): - return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) +def make_optimizer(policy, config): + return torch.optim.Adam(policy._model.parameters(), lr=config["lr"]) PGTorchPolicy = build_torch_policy( diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index e78eac98662b..438618cbf34e 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -342,13 +342,12 @@ def validate_config(policy, obs_space, action_space, config): "Must use `truncate_episodes` batch mode with V-trace." -def choose_optimizer(policy): +def choose_optimizer(policy, config): if policy.config["opt_type"] == "adam": return tf.train.AdamOptimizer(policy.cur_lr) else: - return tf.train.RMSPropOptimizer(policy.cur_lr, policy.config["decay"], - policy.config["momentum"], - policy.config["epsilon"]) + return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"], + config["momentum"], config["epsilon"]) def clip_gradients(policy, optimizer, loss): diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 853a6110eaf1..e20c03fc0d60 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -284,7 +284,7 @@ def extra_compute_grad_fetches(self): @DeveloperAPI def optimizer(self): """TF optimizer to use for policy optimization.""" - return tf.train.AdamOptimizer() + return tf.train.AdamOptimizer(self.config["lr"]) @DeveloperAPI def gradients(self, optimizer, loss): diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 138533d589d0..b654e6e64f28 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -24,24 +24,24 @@ def build_tf_policy(name, make_action_sampler=None, mixins=None, get_batch_divisibility_req=None): - """Helper function for creating a dynamic tf policy graph at runtime. + """Helper function for creating a dynamic tf policy at runtime. 
Arguments: name (str): name of the graph (e.g., "PPOPolicy") - loss_fn (func): function that returns a loss tensor the policy graph, + loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default config to merge with any overrides stats_fn (func): optional function that returns a dict of - TF fetches given the policy graph and batch input tensors + TF fetches given the policy and batch input tensors grad_stats_fn (func): optional function that returns a dict of - TF fetches given the policy graph and loss gradient tensors + TF fetches given the policy and loss gradient tensors extra_action_fetches_fn (func): optional function that returns - a dict of TF fetches given the policy graph object + a dict of TF fetches given the policy object postprocess_fn (func): optional experience postprocessing function that takes the same args as PolicyGraph.postprocess_trajectory() optimizer_fn (func): optional function that returns a tf.Optimizer - given the policy graph object + given the policy and config gradients_fn (func): optional function that returns a list of gradients given a tf optimizer and loss tensor. If not specified, this defaults to optimizer.compute_gradients(loss) @@ -124,7 +124,7 @@ def postprocess_trajectory(self, @override(TFPolicyGraph) def optimizer(self): if optimizer_fn: - return optimizer_fn(self) + return optimizer_fn(self, self.config) else: return TFPolicyGraph.optimizer(self) diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index a666ed83d752..fdc0cf5bca0e 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -21,16 +21,16 @@ def build_torch_policy(name, after_init=None, make_model_and_action_dist=None, mixins=None): - """Helper function for creating a torch policy graph at runtime. + """Helper function for creating a torch policy at runtime. 
Arguments: name (str): name of the graph (e.g., "PPOPolicy") - loss_fn (func): function that returns a loss tensor the policy graph, + loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default config to merge with any overrides stats_fn (func): optional function that returns a dict of - values given the policy graph and batch input tensors + values given the policy and batch input tensors postprocess_fn (func): optional experience postprocessing function that takes the same args as PolicyGraph.postprocess_trajectory() extra_action_out_fn (func): optional function that returns @@ -38,7 +38,7 @@ def build_torch_policy(name, extra_grad_process_fn (func): optional function that is called after gradients are computed and returns processing info optimizer_fn (func): optional function that returns a torch optimizer - given the policy graph object + given the policy and config before_init (func): optional function to run at the beginning of __init__ that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ @@ -117,7 +117,7 @@ def extra_action_out(self, model_out): @override(TorchPolicyGraph) def optimizer(self): if optimizer_fn: - return optimizer_fn(self) + return optimizer_fn(self, self.config) else: return TorchPolicyGraph.optimizer(self) From 8a48029fb18313b1ac9ec5927dfd9eda5a337d80 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:35:41 -0700 Subject: [PATCH 35/39] use lr by default in optimizer --- python/ray/rllib/agents/pg/pg_policy_graph.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 666c62a13f2f..54fcd041cc72 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -28,13 +28,8 @@ def postprocess_advantages(policy, sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def make_optimizer(policy, config): - return tf.train.AdamOptimizer(learning_rate=config["lr"]) - - PGTFPolicy = build_tf_policy( name="PGTFPolicy", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=postprocess_advantages, - loss_fn=policy_gradient_loss, - optimizer_fn=make_optimizer) + loss_fn=policy_gradient_loss) From 65db45e0037fb6db64b02e30b330f55dcc3e7aa0 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:36:59 -0700 Subject: [PATCH 36/39] update --- python/ray/rllib/evaluation/tf_policy_graph.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index e20c03fc0d60..b921e6cfb0d1 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -284,7 +284,10 @@ def extra_compute_grad_fetches(self): @DeveloperAPI def optimizer(self): """TF optimizer to use for policy optimization.""" - return tf.train.AdamOptimizer(self.config["lr"]) + if hasattr(self, "config"): + return tf.train.AdamOptimizer(self.config["lr"]) + else: + return tf.train.AdamOptimizer() @DeveloperAPI def gradients(self, optimizer, loss): From 4830ab6f21527d08c2e4211a81853d61c2ed7077 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 18:02:29 -0700 Subject: [PATCH 37/39] comments --- python/ray/rllib/agents/ppo/appo_policy_graph.py | 4 +++- 
python/ray/rllib/evaluation/tf_policy_template.py | 8 ++++---- python/ray/rllib/evaluation/torch_policy_template.py | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index 438618cbf34e..5aa76913194f 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -164,11 +164,13 @@ def __init__(self, def _make_time_major(policy, tensor, drop_last=False): """Swaps batch and trajectory axis. - Args: + + Arguments: policy: Policy reference tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. + Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index b654e6e64f28..b2549e973a65 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -46,11 +46,11 @@ def build_tf_policy(name, given a tf optimizer and loss tensor. If not specified, this defaults to optimizer.compute_gradients(loss) before_init (func): optional function to run at the beginning of - __init__ that takes the same arguments as __init__ + policy init that takes the same arguments as the policy constructor before_loss_init (func): optional function to run prior to loss - init that takes the same arguments as __init__ - after_init (func): optional function to run at the end of __init__ - that takes the same arguments as __init__ + init that takes the same arguments as the policy constructor + after_init (func): optional function to run at the end of policy init + that takes the same arguments as the policy constructor make_action_sampler (func): optional function that returns a tuple of action and action prob tensors. The function takes (policy, input_dict, obs_space, action_space, config) as its diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index fdc0cf5bca0e..7f65c2b963b8 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -40,11 +40,11 @@ def build_torch_policy(name, optimizer_fn (func): optional function that returns a torch optimizer given the policy and config before_init (func): optional function to run at the beginning of - __init__ that takes the same arguments as __init__ - after_init (func): optional function to run at the end of __init__ - that takes the same arguments as __init__ + policy init that takes the same arguments as the policy constructor + after_init (func): optional function to run at the end of policy init + that takes the same arguments as the policy constructor make_model_and_action_dist (func): optional func that takes the same - arguments as __init__ and returns a tuple of model instance and + arguments as policy init and returns a tuple of model instance and torch action distribution class. If not specified, the default model and action dist from the catalog will be used mixins (list): list of any class mixins for the returned policy class. 
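Patches 30-37 settle the callback signatures used by the policy templates: loss_fn and stats_fn take (policy, batch_tensors), optimizer_fn takes (policy, config), and postprocess_fn matches PolicyGraph.postprocess_trajectory(). As a quick orientation aid, and not part of any commit in this series, the sketch below shows what a complete policy assembled from those callbacks looks like at this point in the branch. It mirrors the PG policy used throughout the series; the class name MyPGTFPolicy is illustrative (the template requires the name to end in "TFPolicy").

# Minimal sketch of a policy built with build_tf_policy as of patch 37.
# Mirrors the PG policy in this series; "MyPGTFPolicy" is an illustrative name.
import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
    Postprocessing
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def policy_gradient_loss(policy, batch_tensors):
    # The basic policy gradient loss: -E[logp(a|s) * advantage]. The action
    # distribution is built by the dynamic policy graph before loss init.
    actions = batch_tensors[SampleBatch.ACTIONS]
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)


def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    # Adds the Postprocessing.ADVANTAGES column consumed by the loss above.
    return compute_advantages(
        sample_batch, 0.0, policy.config["gamma"], use_gae=False)


MyPGTFPolicy = build_tf_policy(
    name="MyPGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    postprocess_fn=postprocess_advantages,
    loss_fn=policy_gradient_loss)

With no optimizer_fn given, the policy falls back to the Adam optimizer that tf_policy_graph.py builds from config["lr"] (see patches 34-36 below).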
From 816d590e4dad5f9b7c683cfe6de27ce8540ce353 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 18:36:21 -0700 Subject: [PATCH 38/39] remove optimizer --- python/ray/rllib/agents/pg/torch_pg_policy_graph.py | 9 +-------- python/ray/rllib/evaluation/torch_policy_graph.py | 6 +++++- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 6a929536a858..cda1b6eb5057 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -2,8 +2,6 @@ from __future__ import division from __future__ import print_function -import torch - import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing @@ -36,14 +34,9 @@ def pg_loss_stats(policy, batch_tensors): return {"policy_loss": policy.pi_err.item()} -def make_optimizer(policy, config): - return torch.optim.Adam(policy._model.parameters(), lr=config["lr"]) - - PGTorchPolicy = build_torch_policy( name="PGTorchPolicy", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=pg_torch_loss, stats_fn=pg_loss_stats, - postprocess_fn=postprocess_advantages, - optimizer_fn=make_optimizer) + postprocess_fn=postprocess_advantages) diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index 3a1464606612..ccf1b9eeb81d 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -161,7 +161,11 @@ def extra_grad_info(self, batch_tensors): def optimizer(self): """Custom PyTorch optimizer to use.""" - return torch.optim.Adam(self._model.parameters()) + if hasattr(self, "config"): + return torch.optim.Adam( + self._model.parameters(), lr=self.config["lr"]) + else: + return torch.optim.Adam(self._model.parameters()) def _lazy_tensor_dict(self, postprocessed_batch): batch_tensors = UsageTrackingDict(postprocessed_batch) From 65173a57b6af328616a832e47f06a24b8586d5ef Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 18:48:05 -0700 Subject: [PATCH 39/39] fix tuple actions support in dynamic tf graph --- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 64f8f747d984..73e08fcf9093 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -220,7 +220,7 @@ def fake_array(tensor): SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), SampleBatch.CUR_OBS: fake_array(self._obs_input), SampleBatch.NEXT_OBS: fake_array(self._obs_input), - SampleBatch.ACTIONS: fake_array(self._sampler), + SampleBatch.ACTIONS: fake_array(self._prev_action_input), SampleBatch.REWARDS: np.array([0], dtype=np.float32), SampleBatch.DONES: np.array([False], dtype=np.bool), }
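To round off the series: after patches 32 and 33, build_trainer can be called with only a name and a default policy; when default_config and make_policy_optimizer are omitted, the generated trainer falls back to Trainer.COMMON_CONFIG and a SyncSamplesOptimizer built from config["optimizer"] and config["train_batch_size"]. The sketch below is illustrative and not part of any patch; the trainer name "MyPG" and the CartPole usage are assumptions for the example.

# Illustrative minimal trainer built on trainer_template.py as of patch 33.
from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy
from ray.rllib.agents.trainer_template import build_trainer

# default_config and make_policy_optimizer are omitted on purpose: the class
# then uses Trainer.COMMON_CONFIG and a SyncSamplesOptimizer by default.
MyPGTrainer = build_trainer(
    name="MyPG",
    default_policy=PGTFPolicy)

# Example usage (assumes ray.init() has been called):
#   trainer = MyPGTrainer(env="CartPole-v0")
#   print(trainer.train())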