diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index a2441f0b5bf6..9086f968ed69 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -6,6 +6,11 @@ # This file is imported from the tune module in order to register RLlib agents. from ray.tune.registry import register_trainable +from ray.rllib.utils.policy_graph import PolicyGraph +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator +from ray.rllib.optimizers.sample_batch import SampleBatch + def _register_all(): for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", @@ -16,3 +21,7 @@ def _register_all(): _register_all() + +__all__ = [ + "PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch" +] diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index 569b50c44420..8a2089db30c9 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function -import numpy as np import pickle import os @@ -10,14 +9,14 @@ from ray.rllib.agent import Agent from ray.rllib.optimizers import AsyncOptimizer from ray.rllib.utils import FilterManager -from ray.rllib.a3c.a3c_evaluator import A3CEvaluator, RemoteA3CEvaluator, \ - GPURemoteA3CEvaluator -from ray.tune.result import TrainingResult +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ + collect_metrics +from ray.rllib.a3c.common import get_policy_cls from ray.tune.trial import Resources DEFAULT_CONFIG = { # Number of workers (excluding master) - "num_workers": 4, + "num_workers": 2, # Size of rollout batch "batch_size": 10, # Use LSTM model - only applicable for image states @@ -42,6 +41,8 @@ "entropy_coeff": -0.01, # Whether to place workers on GPUs "use_gpu_for_workers": False, + # Whether to emit extra summary stats + "summarize": False, # Model and preprocessor options "model": { # (Image statespace) - Converts image to Channels = 1 @@ -78,56 +79,48 @@ def default_resource_request(cls, config): extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0) def _init(self): - self.local_evaluator = A3CEvaluator( - self.registry, - self.env_creator, - self.config, - self.logdir, - start_sampler=False) - if self.config["use_gpu_for_workers"]: - remote_cls = GPURemoteA3CEvaluator + self.policy_cls = get_policy_cls(self.config) + + if self.config["use_pytorch"]: + session_creator = None else: - remote_cls = RemoteA3CEvaluator + import tensorflow as tf + + def session_creator(): + return tf.Session( + config=tf.ConfigProto( + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1, + gpu_options=tf.GPUOptions(allow_growth=True))) + + remote_cls = CommonPolicyEvaluator.as_remote( + num_gpus=1 if self.config["use_gpu_for_workers"] else 0) + self.local_evaluator = CommonPolicyEvaluator( + self.env_creator, self.policy_cls, + batch_steps=self.config["batch_size"], + batch_mode="truncate_episodes", + tf_session_creator=session_creator, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) self.remote_evaluators = [ - remote_cls.remote(self.registry, self.env_creator, self.config, - self.logdir) - for i in range(self.config["num_workers"]) - ] - self.optimizer = AsyncOptimizer(self.config["optimizer"], - self.local_evaluator, - self.remote_evaluators) + remote_cls.remote( + self.env_creator, self.policy_cls, + 
batch_steps=self.config["batch_size"], + batch_mode="truncate_episodes", sample_async=True, + tf_session_creator=session_creator, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) + for i in range(self.config["num_workers"])] + + self.optimizer = AsyncOptimizer( + self.config["optimizer"], self.local_evaluator, + self.remote_evaluators) def _train(self): self.optimizer.step() - FilterManager.synchronize(self.local_evaluator.filters, - self.remote_evaluators) - res = self._fetch_metrics_from_remote_evaluators() - return res - - def _fetch_metrics_from_remote_evaluators(self): - episode_rewards = [] - episode_lengths = [] - metric_lists = [ - a.get_completed_rollout_metrics.remote() - for a in self.remote_evaluators - ] - for metrics in metric_lists: - for episode in ray.get(metrics): - episode_lengths.append(episode.episode_length) - episode_rewards.append(episode.episode_reward) - avg_reward = (np.mean(episode_rewards) - if episode_rewards else float('nan')) - avg_length = (np.mean(episode_lengths) - if episode_lengths else float('nan')) - timesteps = np.sum(episode_lengths) if episode_lengths else 0 - - result = TrainingResult( - episode_reward_mean=avg_reward, - episode_len_mean=avg_length, - timesteps_this_iter=timesteps, - info={}) - - return result + FilterManager.synchronize( + self.local_evaluator.filters, self.remote_evaluators) + return collect_metrics(self.local_evaluator, self.remote_evaluators) def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 @@ -154,7 +147,10 @@ def _restore(self, checkpoint_path): ]) self.local_evaluator.restore(extra_data["local_state"]) - def compute_action(self, observation): + def compute_action(self, observation, state=None): + if state is None: + state = [] obs = self.local_evaluator.obs_filter(observation, update=False) - action, info = self.local_evaluator.policy.compute(obs) - return action + return self.local_evaluator.for_policy( + lambda p: p.compute_single_action( + obs, state, is_training=False)[0]) diff --git a/python/ray/rllib/a3c/a3c_evaluator.py b/python/ray/rllib/a3c/a3c_evaluator.py deleted file mode 100644 index 74d201016adf..000000000000 --- a/python/ray/rllib/a3c/a3c_evaluator.py +++ /dev/null @@ -1,119 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pickle - -import ray -from ray.rllib.models import ModelCatalog -from ray.rllib.optimizers import PolicyEvaluator -from ray.rllib.a3c.common import get_policy_cls -from ray.rllib.utils.filter import get_filter -from ray.rllib.utils.sampler import AsyncSampler -from ray.rllib.utils.process_rollout import process_rollout - - -class A3CEvaluator(PolicyEvaluator): - """Actor object to start running simulation on workers. - - The gradient computation is also executed from this object. - - Attributes: - policy: Copy of graph used for policy. Used by sampler and gradients. - obs_filter: Observation filter used in environment sampling - rew_filter: Reward filter used in rollout post-processing. - sampler: Component for interacting with environment and generating - rollouts. - logdir: Directory for logging. 
- """ - def __init__( - self, registry, env_creator, config, logdir, start_sampler=True): - env = ModelCatalog.get_preprocessor_as_wrapper( - registry, env_creator(config["env_config"]), config["model"]) - self.env = env - policy_cls = get_policy_cls(config) - # TODO(rliaw): should change this to be just env.observation_space - self.policy = policy_cls( - registry, env.observation_space.shape, env.action_space, config) - self.config = config - - # Technically not needed when not remote - self.obs_filter = get_filter( - config["observation_filter"], env.observation_space.shape) - self.rew_filter = get_filter(config["reward_filter"], ()) - self.filters = {"obs_filter": self.obs_filter, - "rew_filter": self.rew_filter} - self.sampler = AsyncSampler(env, self.policy, self.obs_filter, - config["batch_size"]) - if start_sampler and self.sampler._async: - self.sampler.start() - self.logdir = logdir - - def sample(self): - rollout = self.sampler.get_data() - samples = process_rollout( - rollout, self.rew_filter, gamma=self.config["gamma"], - lambda_=self.config["lambda"], use_gae=True) - return samples - - def get_completed_rollout_metrics(self): - """Returns metrics on previously completed rollouts. - - Calling this clears the queue of completed rollout metrics. - """ - return self.sampler.get_metrics() - - def compute_gradients(self, samples): - gradient, info = self.policy.compute_gradients(samples) - return gradient, {} - - def apply_gradients(self, grads): - self.policy.apply_gradients(grads) - - def get_weights(self): - return self.policy.get_weights() - - def set_weights(self, params): - self.policy.set_weights(params) - - def save(self): - filters = self.get_filters(flush_after=True) - weights = self.get_weights() - return pickle.dumps({ - "filters": filters, - "weights": weights}) - - def restore(self, objs): - objs = pickle.loads(objs) - self.sync_filters(objs["filters"]) - self.set_weights(objs["weights"]) - - def sync_filters(self, new_filters): - """Changes self's filter to given and rebases any accumulated delta. - - Args: - new_filters (dict): Filters with new state to update local copy. - """ - assert all(k in new_filters for k in self.filters) - for k in self.filters: - self.filters[k].sync(new_filters[k]) - - def get_filters(self, flush_after=False): - """Returns a snapshot of filters. - - Args: - flush_after (bool): Clears the filter buffer state. 
-
-        Returns:
-            return_filters (dict): Dict for serializable filters
-        """
-        return_filters = {}
-        for k, f in self.filters.items():
-            return_filters[k] = f.as_serializable()
-            if flush_after:
-                f.clear_buffer()
-        return return_filters
-
-
-RemoteA3CEvaluator = ray.remote(A3CEvaluator)
-GPURemoteA3CEvaluator = ray.remote(num_gpus=1)(A3CEvaluator)
diff --git a/python/ray/rllib/a3c/a3c_tf_policy.py b/python/ray/rllib/a3c/a3c_tf_policy.py
new file mode 100644
index 000000000000..e2a8da233880
--- /dev/null
+++ b/python/ray/rllib/a3c/a3c_tf_policy.py
@@ -0,0 +1,103 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import gym
+
+from ray.rllib.utils.error import UnsupportedSpaceException
+from ray.rllib.utils.process_rollout import compute_advantages
+from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
+
+
+class A3CTFPolicyGraph(TFPolicyGraph):
+    """The TF policy base class."""
+
+    def __init__(self, ob_space, action_space, registry, config):
+        self.registry = registry
+        self.local_steps = 0
+        self.config = config
+        self.summarize = config.get("summarize")
+
+        self._setup_graph(ob_space, action_space)
+        assert all(hasattr(self, attr)
+                   for attr in ["vf", "logits", "x", "var_list"])
+        print("Setting up loss")
+        self.setup_loss(action_space)
+        self.is_training = tf.placeholder_with_default(True, ())
+        self.sess = tf.get_default_session()
+
+        TFPolicyGraph.__init__(
+            self, self.sess, obs_input=self.x,
+            action_sampler=self.action_dist.sample(), loss=self.loss,
+            loss_inputs=self.loss_in, is_training=self.is_training,
+            state_inputs=self.state_in, state_outputs=self.state_out)
+
+        self.sess.run(tf.global_variables_initializer())
+
+        if self.summarize:
+            bs = tf.to_float(tf.shape(self.x)[0])
+            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
+            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
+            tf.summary.scalar("model/entropy", self.entropy / bs)
+            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
+            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
+            self.summary_op = tf.summary.merge_all()
+
+    def _setup_graph(self, ob_space, ac_space):
+        raise NotImplementedError
+
+    def setup_loss(self, action_space):
+        if isinstance(action_space, gym.spaces.Box):
+            ac_size = action_space.shape[0]
+            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
+        elif isinstance(action_space, gym.spaces.Discrete):
+            self.ac = tf.placeholder(tf.int64, [None], name="ac")
+        else:
+            raise UnsupportedSpaceException(
+                "Action space {} is not supported for A3C.".format(
+                    action_space))
+        self.adv = tf.placeholder(tf.float32, [None], name="adv")
+        self.r = tf.placeholder(tf.float32, [None], name="r")
+
+        log_prob = self.action_dist.logp(self.ac)
+
+        # The "policy gradients" loss: its derivative is precisely the policy
+        # gradient. Notice that self.ac is a placeholder that is provided
+        # externally. adv will contain the advantages, as calculated in
+        # compute_advantages.
+ self.pi_loss = - tf.reduce_sum(log_prob * self.adv) + + delta = self.vf - self.r + self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) + self.entropy = tf.reduce_sum(self.action_dist.entropy()) + self.loss = (self.pi_loss + + self.vf_loss * self.config["vf_loss_coeff"] + + self.entropy * self.config["entropy_coeff"]) + + def optimizer(self): + return tf.train.AdamOptimizer(self.config["lr"]) + + def gradients(self, optimizer): + grads = tf.gradients(self.loss, self.var_list) + self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) + clipped_grads = list(zip(self.grads, self.var_list)) + return clipped_grads + + def extra_compute_grad_fetches(self): + if self.summarize: + return {"summary": self.summary_op} + else: + return {} + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + next_state = [] + for i in range(len(self.state_in)): + next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + last_r = self.value(sample_batch["new_obs"][-1], *next_state) + return compute_advantages( + sample_batch, last_r, self.config["gamma"], self.config["lambda"]) diff --git a/python/ray/rllib/a3c/a3c_torch_policy.py b/python/ray/rllib/a3c/a3c_torch_policy.py new file mode 100644 index 000000000000..786a21553a49 --- /dev/null +++ b/python/ray/rllib/a3c/a3c_torch_policy.py @@ -0,0 +1,113 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from threading import Lock + +import torch +import torch.nn.functional as F + +from ray.rllib.models.pytorch.misc import var_to_np, convert_batch +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.process_rollout import compute_advantages +from ray.rllib.utils.policy_graph import PolicyGraph + + +class SharedTorchPolicy(PolicyGraph): + """A simple, non-recurrent PyTorch policy example.""" + + def __init__(self, obs_space, action_space, registry, config): + self.registry = registry + self.local_steps = 0 + self.config = config + self.summarize = config.get("summarize") + self.setup_graph(obs_space, action_space) + torch.set_num_threads(2) + self.lock = Lock() + + def setup_graph(self, obs_space, action_space): + _, self.logit_dim = ModelCatalog.get_action_dist(action_space) + self._model = ModelCatalog.get_torch_model( + self.registry, obs_space.shape, self.logit_dim, + self.config["model"]) + self.optimizer = torch.optim.Adam( + self._model.parameters(), lr=self.config["lr"]) + + def compute_single_action(self, obs, state, is_training=False): + assert not state, "RNN not supported" + with self.lock: + ob = torch.from_numpy(obs).float().unsqueeze(0) + logits, values = self._model(ob) + samples = F.softmax(logits, dim=1).multinomial(1).squeeze() + values = values.squeeze() + return var_to_np(samples), [], {"vf_preds": var_to_np(values)} + + def compute_gradients(self, samples): + with self.lock: + self.backward(samples) + # Note that return values are just references; + # calling zero_grad will modify the values + return [p.grad.data.numpy() for p in self._model.parameters()], {} + + def apply_gradients(self, grads): + self.optimizer.zero_grad() + for g, p in zip(grads, self._model.parameters()): + p.grad = torch.from_numpy(g) + self.optimizer.step() + return {} + + def get_weights(self): + # !! This only returns references to the data. 
+ return self._model.state_dict() + + def set_weights(self, weights): + with self.lock: + self._model.load_state_dict(weights) + + def value(self, obs): + with self.lock: + obs = torch.from_numpy(obs).float().unsqueeze(0) + res = self._model.hidden_layers(obs) + res = self._model.value_branch(res) + res = res.squeeze() + return var_to_np(res) + + def forward(self, obs_batch, actions): + logits, values = self._model(obs_batch) + log_probs = F.log_softmax(logits, dim=1) + probs = F.softmax(logits, dim=1) + action_log_probs = log_probs.gather(1, actions.view(-1, 1)) + entropy = -(log_probs * probs).sum(-1).sum() + return values, action_log_probs, entropy + + def backward(self, sample_batch): + """Loss is encoded here. + + Defining a new loss function would start by rewriting this function. + """ + + states, actions, advs, rs = convert_batch(sample_batch) + values, action_log_probs, entropy = self.forward(states, actions) + pi_err = -advs.dot(action_log_probs.reshape(-1)) + value_err = F.mse_loss(values.reshape(-1), rs) + + self.optimizer.zero_grad() + + overall_err = sum([ + pi_err, + self.config["vf_loss_coeff"] * value_err, + self.config["entropy_coeff"] * entropy, + ]) + + overall_err.backward() + torch.nn.utils.clip_grad_norm_(self._model.parameters(), + self.config["grad_clip"]) + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + last_r = self.value(sample_batch["new_obs"][-1]) + return compute_advantages( + sample_batch, last_r, self.config["gamma"], self.config["lambda"]) diff --git a/python/ray/rllib/a3c/common.py b/python/ray/rllib/a3c/common.py index da29eb452f02..cc2179c2f6ff 100644 --- a/python/ray/rllib/a3c/common.py +++ b/python/ray/rllib/a3c/common.py @@ -8,7 +8,7 @@ def get_policy_cls(config): from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM policy_cls = SharedModelLSTM elif config["use_pytorch"]: - from ray.rllib.a3c.shared_torch_policy import SharedTorchPolicy + from ray.rllib.a3c.a3c_torch_policy import SharedTorchPolicy policy_cls = SharedTorchPolicy else: from ray.rllib.a3c.shared_model import SharedModel diff --git a/python/ray/rllib/a3c/policy.py b/python/ray/rllib/a3c/policy.py deleted file mode 100644 index 1e9639fd71af..000000000000 --- a/python/ray/rllib/a3c/policy.py +++ /dev/null @@ -1,28 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -class Policy(object): - """The policy base class.""" - def __init__(self, ob_space, action_space, name="local", summarize=True): - pass - - def apply_gradients(self, grads): - raise NotImplementedError - - def get_weights(self): - raise NotImplementedError - - def set_weights(self, weights): - raise NotImplementedError - - def compute_gradients(self, samples): - raise NotImplementedError - - def compute(self, observations): - """Compute action for a _single_ observation""" - raise NotImplementedError - - def value(self, ob): - raise NotImplementedError diff --git a/python/ray/rllib/a3c/shared_model.py b/python/ray/rllib/a3c/shared_model.py index 8209be159ed4..3a093fa906f8 100644 --- a/python/ray/rllib/a3c/shared_model.py +++ b/python/ray/rllib/a3c/shared_model.py @@ -4,30 +4,27 @@ import tensorflow as tf from ray.rllib.models.misc import linear, normc_initializer -from ray.rllib.a3c.tfpolicy import TFPolicy +from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph from ray.rllib.models.catalog import ModelCatalog -class 
SharedModel(TFPolicy): +class SharedModel(A3CTFPolicyGraph): - other_output = ["vf_preds"] - is_recurrent = False - - def __init__(self, registry, ob_space, ac_space, config, **kwargs): + def __init__(self, ob_space, ac_space, registry, config, **kwargs): super(SharedModel, self).__init__( - registry, ob_space, ac_space, config, **kwargs) + ob_space, ac_space, registry, config, **kwargs) def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) self._model = ModelCatalog.get_model( self.registry, self.x, self.logit_dim, self.config["model"]) self.logits = self._model.outputs - self.curr_dist = dist_class(self.logits) + self.action_dist = dist_class(self.logits) self.vf = tf.reshape(linear(self._model.last_layer, 1, "value", normc_initializer(1.0)), [-1]) - self.sample = self.curr_dist.sample() + self.sample = self.action_dist.sample() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) self.global_step = tf.get_variable( @@ -35,28 +32,20 @@ def _setup_graph(self, ob_space, ac_space): initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) - def compute_gradients(self, samples): - info = {} - feed_dict = { - self.x: samples["obs"], - self.ac: samples["actions"], - self.adv: samples["advantages"], - self.r: samples["value_targets"], - } - self.grads = [g for g in self.grads if g is not None] - self.local_steps += 1 - if self.summarize: - grad, summ = self.sess.run([self.grads, self.summary_op], - feed_dict=feed_dict) - info['summary'] = summ - else: - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info - - def compute(self, ob, *args): - action, vf = self.sess.run([self.sample, self.vf], - {self.x: [ob]}) - return action[0], {"vf_preds": vf[0]} + self.state_in = [] + self.state_out = [] + + def setup_loss(self, action_space): + A3CTFPolicyGraph.setup_loss(self, action_space) + self.loss_in = [ + ("obs", self.x), + ("actions", self.ac), + ("advantages", self.adv), + ("value_targets", self.r), + ] + + def extra_compute_action_fetches(self): + return {"vf_preds": self.vf} def value(self, ob, *args): vf = self.sess.run(self.vf, {self.x: [ob]}) diff --git a/python/ray/rllib/a3c/shared_model_lstm.py b/python/ray/rllib/a3c/shared_model_lstm.py index 37f71e490467..7cb64e684aa6 100644 --- a/python/ray/rllib/a3c/shared_model_lstm.py +++ b/python/ray/rllib/a3c/shared_model_lstm.py @@ -5,43 +5,32 @@ import tensorflow as tf from ray.rllib.models.misc import linear, normc_initializer from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.a3c.tfpolicy import TFPolicy +from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph from ray.rllib.models.lstm import LSTM -class SharedModelLSTM(TFPolicy): - """ - Attributes: - other_output (list): Other than `action`, the other return values from - `compute_gradients`. - is_recurrent (bool): True if is a recurrent network (requires features - to be tracked). 
- """ +class SharedModelLSTM(A3CTFPolicyGraph): - other_output = ["vf_preds", "features"] - is_recurrent = True - - def __init__(self, registry, ob_space, ac_space, config, **kwargs): + def __init__(self, ob_space, ac_space, registry, config, **kwargs): super(SharedModelLSTM, self).__init__( - registry, ob_space, ac_space, config, **kwargs) + ob_space, ac_space, registry, config, **kwargs) def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) self._model = LSTM(self.x, self.logit_dim, {}) - self.state_init = self._model.state_init self.state_in = self._model.state_in self.state_out = self._model.state_out self.logits = self._model.outputs - self.curr_dist = dist_class(self.logits) + self.action_dist = dist_class(self.logits) # with tf.variable_scope("vf"): # vf_model = ModelCatalog.get_model(self.x, 1) self.vf = tf.reshape(linear(self._model.last_layer, 1, "value", normc_initializer(1.0)), [-1]) - self.sample = self.curr_dist.sample() + self.sample = self.action_dist.sample() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) self.global_step = tf.get_variable( @@ -49,42 +38,25 @@ def _setup_graph(self, ob_space, ac_space): initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) - def compute_gradients(self, samples): - """Computing the gradient is actually model-dependent. + def get_initial_state(self): + return self._model.state_init - The LSTM needs its hidden states in order to compute the gradient - accurately. - """ - features = samples["features"][0] - feed_dict = { - self.x: samples["obs"], - self.ac: samples["actions"], - self.adv: samples["advantages"], - self.r: samples["value_targets"], - self.state_in[0]: features[0], - self.state_in[1]: features[1] - } - info = {} - self.local_steps += 1 - if self.summarize and self.local_steps % 10 == 0: - grad, summ = self.sess.run([self.grads, self.summary_op], - feed_dict=feed_dict) - info['summary'] = summ - else: - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info + def setup_loss(self, action_space): + A3CTFPolicyGraph.setup_loss(self, action_space) + self.loss_in = [ + ("obs", self.x), + ("actions", self.ac), + ("advantages", self.adv), + ("value_targets", self.r), + ("state_in_0", self.state_in[0]), + ("state_in_1", self.state_in[1]), + ] - def compute(self, ob, c, h): - action, vf, c, h = self.sess.run( - [self.sample, self.vf] + self.state_out, - {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h}) - return action[0], {"vf_preds": vf[0], "features": (c, h)} + def extra_compute_action_fetches(self): + return {"vf_preds": self.vf} def value(self, ob, c, h): vf = self.sess.run(self.vf, {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h}) return vf[0] - - def get_initial_features(self): - return self.state_init diff --git a/python/ray/rllib/a3c/tfpolicy.py b/python/ray/rllib/a3c/tfpolicy.py deleted file mode 100644 index 1fbb46bdfe78..000000000000 --- a/python/ray/rllib/a3c/tfpolicy.py +++ /dev/null @@ -1,106 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -import ray -import gym -from ray.rllib.a3c.policy import Policy - - -class TFPolicy(Policy): - """The policy base class.""" - def __init__(self, registry, ob_space, action_space, config, - name="local", 
summarize=True): - self.registry = registry - self.local_steps = 0 - self.config = config - self.summarize = summarize - worker_device = "/job:localhost/replica:0/task:0/cpu:0" - self.g = tf.Graph() - with self.g.as_default(), tf.device(worker_device): - with tf.variable_scope(name): - self._setup_graph(ob_space, action_space) - assert all(hasattr(self, attr) - for attr in ["vf", "logits", "x", "var_list"]) - print("Setting up loss") - self.setup_loss(action_space) - self.setup_gradients() - self.initialize() - - def _setup_graph(self, ob_space, ac_space): - raise NotImplementedError - - def setup_loss(self, action_space): - if isinstance(action_space, gym.spaces.Box): - ac_size = action_space.shape[0] - self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac") - elif isinstance(action_space, gym.spaces.Discrete): - self.ac = tf.placeholder(tf.int64, [None], name="ac") - else: - raise NotImplementedError( - "action space" + str(type(action_space)) + - "currently not supported") - self.adv = tf.placeholder(tf.float32, [None], name="adv") - self.r = tf.placeholder(tf.float32, [None], name="r") - - log_prob = self.curr_dist.logp(self.ac) - - # The "policy gradients" loss: its derivative is precisely the policy - # gradient. Notice that self.ac is a placeholder that is provided - # externally. adv will contain the advantages, as calculated in - # process_rollout. - self.pi_loss = - tf.reduce_sum(log_prob * self.adv) - - delta = self.vf - self.r - self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) - self.entropy = tf.reduce_sum(self.curr_dist.entropy()) - self.loss = (self.pi_loss + - self.vf_loss * self.config["vf_loss_coeff"] + - self.entropy * self.config["entropy_coeff"]) - - def setup_gradients(self): - grads = tf.gradients(self.loss, self.var_list) - self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) - grads_and_vars = list(zip(self.grads, self.var_list)) - opt = tf.train.AdamOptimizer(self.config["lr"]) - self._apply_gradients = opt.apply_gradients(grads_and_vars) - - def initialize(self): - if self.summarize: - bs = tf.to_float(tf.shape(self.x)[0]) - tf.summary.scalar("model/policy_loss", self.pi_loss / bs) - tf.summary.scalar("model/value_loss", self.vf_loss / bs) - tf.summary.scalar("model/entropy", self.entropy / bs) - tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads)) - tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list)) - self.summary_op = tf.summary.merge_all() - - # TODO(rliaw): Can consider exposing these parameters - self.sess = tf.Session(graph=self.g, config=tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=2, - gpu_options=tf.GPUOptions(allow_growth=True))) - self.variables = ray.experimental.TensorFlowVariables(self.loss, - self.sess) - self.sess.run(tf.global_variables_initializer()) - - def apply_gradients(self, grads): - feed_dict = {self.grads[i]: grads[i] - for i in range(len(grads))} - self.sess.run(self._apply_gradients, feed_dict=feed_dict) - - def get_weights(self): - weights = self.variables.get_weights() - return weights - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def compute_gradients(self, samples): - raise NotImplementedError - - def compute(self, observation): - raise NotImplementedError - - def value(self, ob): - raise NotImplementedError diff --git a/python/ray/rllib/agent.py b/python/ray/rllib/agent.py index 5699022b2a8e..4c17de01fa43 100644 --- a/python/ray/rllib/agent.py +++ b/python/ray/rllib/agent.py @@ -61,7 +61,7 @@ class 
Agent(Trainable): """ _allow_unknown_configs = False - _allow_unknown_subkeys = [] + _allow_unknown_subkeys = ["env_config", "model", "optimizer"] @classmethod def resource_help(cls, config): diff --git a/python/ray/rllib/bc/bc_evaluator.py b/python/ray/rllib/bc/bc_evaluator.py index 8499ba1e023e..27e011ac6cc4 100644 --- a/python/ray/rllib/bc/bc_evaluator.py +++ b/python/ray/rllib/bc/bc_evaluator.py @@ -17,8 +17,7 @@ def __init__(self, registry, env_creator, config, logdir): env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator( config["env_config"]), config["model"]) self.dataset = ExperienceDataset(config["dataset_path"]) - # TODO(rliaw): should change this to be just env.observation_space - self.policy = BCPolicy(registry, env.observation_space.shape, + self.policy = BCPolicy(registry, env.observation_space, env.action_space, config) self.config = config self.logdir = logdir diff --git a/python/ray/rllib/bc/policy.py b/python/ray/rllib/bc/policy.py index 11178a50d23a..2c4210a57cf5 100644 --- a/python/ray/rllib/bc/policy.py +++ b/python/ray/rllib/bc/policy.py @@ -6,30 +6,22 @@ import gym import ray -from ray.rllib.a3c.policy import Policy from ray.rllib.models.catalog import ModelCatalog -class BCPolicy(Policy): - def __init__(self, registry, ob_space, action_space, config, name="local", - summarize=True): - super(BCPolicy, self).__init__(ob_space, action_space, name, summarize) +class BCPolicy(object): + def __init__(self, registry, obs_space, action_space, config): self.registry = registry self.local_steps = 0 self.config = config - self.summarize = summarize - worker_device = "/job:localhost/replica:0/task:0/cpu:0" - self.g = tf.Graph() - with self.g.as_default(), tf.device(worker_device): - with tf.variable_scope(name): - self._setup_graph(ob_space, action_space) - print("Setting up loss") - self.setup_loss(action_space) - self.setup_gradients() - self.initialize() + self.summarize = config.get("summarize") + self._setup_graph(obs_space, action_space) + self.setup_loss(action_space) + self.setup_gradients() + self.initialize() - def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, [None] + list(ob_space)) + def _setup_graph(self, obs_space, ac_space): + self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) self._model = ModelCatalog.get_model( self.registry, self.x, self.logit_dim, self.config["model"]) diff --git a/python/ray/rllib/ddpg/apex.py b/python/ray/rllib/ddpg/apex.py index d3402e61bc06..1799e71a8d9b 100644 --- a/python/ray/rllib/ddpg/apex.py +++ b/python/ray/rllib/ddpg/apex.py @@ -8,25 +8,25 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts( DDPG_CONFIG, { - 'optimizer_class': 'ApexOptimizer', - 'optimizer_config': + "optimizer_class": "ApexOptimizer", + "optimizer_config": merge_dicts( - DDPG_CONFIG['optimizer_config'], { - 'max_weight_sync_delay': 400, - 'num_replay_buffer_shards': 4, - 'debug': False + DDPG_CONFIG["optimizer_config"], { + "max_weight_sync_delay": 400, + "num_replay_buffer_shards": 4, + "debug": False }), - 'n_step': 3, - 'num_workers': 32, - 'buffer_size': 2000000, - 'learning_starts': 50000, - 'train_batch_size': 512, - 'sample_batch_size': 50, - 'max_weight_sync_delay': 400, - 'target_network_update_freq': 500000, - 'timesteps_per_iteration': 25000, - 'per_worker_exploration': True, - 'worker_side_prioritization': True, + "n_step": 3, + "num_workers": 32, + "buffer_size": 2000000, + "learning_starts": 50000, + "train_batch_size": 512, + 
"sample_batch_size": 50, + "max_weight_sync_delay": 400, + "target_network_update_freq": 500000, + "timesteps_per_iteration": 25000, + "per_worker_exploration": True, + "worker_side_prioritization": True, }, ) diff --git a/python/ray/rllib/ddpg/ddpg.py b/python/ray/rllib/ddpg/ddpg.py index 25fda8321e20..06f6128e81f7 100644 --- a/python/ray/rllib/ddpg/ddpg.py +++ b/python/ray/rllib/ddpg/ddpg.py @@ -2,17 +2,9 @@ from __future__ import division from __future__ import print_function -import pickle -import os - -import numpy as np -import tensorflow as tf - -import ray -from ray.rllib import optimizers -from ray.rllib.ddpg.ddpg_evaluator import DDPGEvaluator -from ray.rllib.agent import Agent -from ray.tune.result import TrainingResult +from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule +from ray.rllib.dqn.dqn import DQNAgent +from ray.rllib.ddpg.ddpg_policy_graph import DDPGPolicyGraph OPTIMIZER_SHARED_CONFIGS = [ "buffer_size", "prioritized_replay", "prioritized_replay_alpha", @@ -23,247 +15,120 @@ DEFAULT_CONFIG = { # === Model === # Hidden layer sizes of the policy networks - 'actor_hiddens': [64, 64], + "actor_hiddens": [64, 64], # Hidden layer sizes of the policy networks - 'critic_hiddens': [64, 64], + "critic_hiddens": [64, 64], # N-step Q learning - 'n_step': 1, + "n_step": 1, # Config options to pass to the model constructor - 'model': {}, + "model": {}, # Discount factor for the MDP - 'gamma': 0.99, + "gamma": 0.99, # Arguments to pass to the env creator - 'env_config': {}, + "env_config": {}, # === Exploration === # Max num timesteps for annealing schedules. Exploration is annealed from # 1.0 to exploration_fraction over this number of timesteps scaled by # exploration_fraction - 'schedule_max_timesteps': 100000, + "schedule_max_timesteps": 100000, # Number of env steps to optimize for before returning - 'timesteps_per_iteration': 1000, + "timesteps_per_iteration": 1000, # Fraction of entire training period over which the exploration rate is # annealed - 'exploration_fraction': 0.1, + "exploration_fraction": 0.1, # Final value of random action probability - 'exploration_final_eps': 0.02, + "exploration_final_eps": 0.02, # OU-noise scale - 'noise_scale': 0.1, + "noise_scale": 0.1, # theta - 'exploration_theta': 0.15, + "exploration_theta": 0.15, # sigma - 'exploration_sigma': 0.2, + "exploration_sigma": 0.2, # Update the target network every `target_network_update_freq` steps. - 'target_network_update_freq': 0, + "target_network_update_freq": 0, # Update the target by \tau * policy + (1-\tau) * target_policy - 'tau': 0.002, - # Whether to start with random actions instead of noops. - 'random_starts': True, + "tau": 0.002, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then # each worker will have a replay buffer of this size. - 'buffer_size': 50000, + "buffer_size": 50000, # If True prioritized replay buffer will be used. - 'prioritized_replay': True, + "prioritized_replay": True, # Alpha parameter for prioritized replay buffer. - 'prioritized_replay_alpha': 0.6, + "prioritized_replay_alpha": 0.6, # Beta parameter for sampling from prioritized replay buffer. - 'prioritized_replay_beta': 0.4, + "prioritized_replay_beta": 0.4, # Epsilon to add to the TD errors when updating priorities. - 'prioritized_replay_eps': 1e-6, + "prioritized_replay_eps": 1e-6, # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer. 
-    'clip_rewards': True,
+    "clip_rewards": True,
 
     # === Optimization ===
     # Learning rate for adam optimizer
-    'actor_lr': 1e-4,
-    'critic_lr': 1e-3,
+    "actor_lr": 1e-4,
+    "critic_lr": 1e-3,
     # If True, use huber loss instead of squared loss for critic network
     # Conventionally, no need to clip gradients if using a huber loss
-    'use_huber': False,
+    "use_huber": False,
     # Threshold of a huber loss
-    'huber_threshold': 1.0,
+    "huber_threshold": 1.0,
     # Weights for L2 regularization
-    'l2_reg': 1e-6,
+    "l2_reg": 1e-6,
     # If not None, clip gradients during optimization at this value
-    'grad_norm_clipping': None,
+    "grad_norm_clipping": None,
     # How many steps of the model to sample before learning starts.
-    'learning_starts': 1500,
+    "learning_starts": 1500,
     # Update the replay buffer with this many samples at once. Note that this
     # setting applies per-worker if num_workers > 1.
-    'sample_batch_size': 1,
+    "sample_batch_size": 1,
     # Size of a batched sampled from replay buffer for training. Note that
     # if async_updates is set, then each worker returns gradients for a
     # batch of this size.
-    'train_batch_size': 256,
-    # Smooth the current average reward over this many previous episodes.
-    'smoothing_num_episodes': 100,
-
-    # === Tensorflow ===
-    # Arguments to pass to tensorflow
-    'tf_session_args': {
-        "device_count": {
-            "CPU": 2
-        },
-        "log_device_placement": False,
-        "allow_soft_placement": True,
-        "gpu_options": {
-            "allow_growth": True
-        },
-        "inter_op_parallelism_threads": 1,
-        "intra_op_parallelism_threads": 1,
-    },
+    "train_batch_size": 256,
 
     # === Parallelism ===
+    # Whether to use a GPU for local optimization.
+    "gpu": False,
     # Number of workers for collecting samples with. This only makes sense
     # to increase if your environment is particularly slow to sample, or if
-    # you're using the Async or Ape-X optimizers.
-    'num_workers': 0,
+    # you're using the Async or Ape-X optimizers.
+    "num_workers": 0,
     # Whether to allocate GPUs for workers (if > 0).
-    'num_gpus_per_worker': 0,
+    "num_gpus_per_worker": 0,
+    # Whether to allocate CPUs for workers (if > 0).
+    "num_cpus_per_worker": 1,
     # Optimizer class to use.
-    'optimizer_class': "LocalSyncReplayOptimizer",
+    "optimizer_class": "LocalSyncReplayOptimizer",
    # Config to pass to the optimizer.
-    'optimizer_config': {},
+    "optimizer_config": {},
     # Whether to use a distribution of epsilons across workers for exploration.
-    'per_worker_exploration': False,
+    "per_worker_exploration": False,
     # Whether to compute priorities on workers.
- 'worker_side_prioritization': False + "worker_side_prioritization": False } -class DDPGAgent(Agent): +class DDPGAgent(DQNAgent): _agent_name = "DDPG" _allow_unknown_subkeys = [ - "model", "optimizer", "tf_session_args", "env_config" - ] + "model", "optimizer", "tf_session_args", "env_config"] _default_config = DEFAULT_CONFIG + _policy_graph = DDPGPolicyGraph - def _init(self): - self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator, - self.config, self.logdir, 0) - remote_cls = ray.remote( - num_cpus=1, - num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator) - self.remote_evaluators = [ - remote_cls.remote(self.registry, self.env_creator, self.config, - self.logdir, i) - for i in range(self.config["num_workers"]) - ] - - for k in OPTIMIZER_SHARED_CONFIGS: - if k not in self.config["optimizer_config"]: - self.config["optimizer_config"][k] = self.config[k] - - self.optimizer = getattr(optimizers, self.config["optimizer_class"])( - self.config["optimizer_config"], self.local_evaluator, - self.remote_evaluators) - - self.saver = tf.train.Saver(max_to_keep=None) - self.last_target_update_ts = 0 - self.num_target_updates = 0 - - @property - def global_timestep(self): - return self.optimizer.num_steps_sampled - - def update_target_if_needed(self): - if self.global_timestep - self.last_target_update_ts > \ - self.config["target_network_update_freq"]: - self.local_evaluator.update_target() - self.last_target_update_ts = self.global_timestep - self.num_target_updates += 1 - - def _train(self): - start_timestep = self.global_timestep - - while (self.global_timestep - start_timestep < - self.config["timesteps_per_iteration"]): - - self.optimizer.step() - self.update_target_if_needed() - - self.local_evaluator.set_global_timestep(self.global_timestep) - for e in self.remote_evaluators: - e.set_global_timestep.remote(self.global_timestep) - - return self._train_stats(start_timestep) - - def _train_stats(self, start_timestep): - if self.remote_evaluators: - stats = ray.get([e.stats.remote() for e in self.remote_evaluators]) - else: - stats = self.local_evaluator.stats() - if not isinstance(stats, list): - stats = [stats] - - mean_100ep_reward = 0.0 - mean_100ep_length = 0.0 - num_episodes = 0 - explorations = [] - + def _make_exploration_schedule(self, worker_index): + # Override DQN's schedule to take into account `noise_scale` if self.config["per_worker_exploration"]: - # Return stats from workers with the lowest 20% of exploration - test_stats = stats[-int(max(1, len(stats) * 0.2)):] + assert self.config["num_workers"] > 1, \ + "This requires multiple workers" + return ConstantSchedule( + self.config["noise_scale"] * 0.4 ** + (1 + worker_index / float(self.config["num_workers"] - 1) * 7)) else: - test_stats = stats - - for s in test_stats: - mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats) - mean_100ep_length += s["mean_100ep_length"] / len(test_stats) - - for s in stats: - num_episodes += s["num_episodes"] - explorations.append(s["exploration"]) - - opt_stats = self.optimizer.stats() - - result = TrainingResult( - episode_reward_mean=mean_100ep_reward, - episode_len_mean=mean_100ep_length, - episodes_total=num_episodes, - timesteps_this_iter=self.global_timestep - start_timestep, - info=dict({ - "min_exploration": min(explorations), - "max_exploration": max(explorations), - "num_target_updates": self.num_target_updates, - }, **opt_stats)) - - return result - - def _stop(self): - # workaround for https://github.com/ray-project/ray/issues/1516 - for ev in 
self.remote_evaluators: - ev.__ray_terminate__.remote() - - def _save(self, checkpoint_dir): - checkpoint_path = self.saver.save( - self.local_evaluator.sess, - os.path.join(checkpoint_dir, "checkpoint"), - global_step=self.iteration) - extra_data = [ - self.local_evaluator.save(), - ray.get([e.save.remote() for e in self.remote_evaluators]), - self.optimizer.save(), self.num_target_updates, - self.last_target_update_ts - ] - pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb")) - return checkpoint_path - - def _restore(self, checkpoint_path): - self.saver.restore(self.local_evaluator.sess, checkpoint_path) - extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb")) - self.local_evaluator.restore(extra_data[0]) - ray.get([ - e.restore.remote(d) - for (d, e) in zip(extra_data[1], self.remote_evaluators) - ]) - self.optimizer.restore(extra_data[2]) - self.num_target_updates = extra_data[3] - self.last_target_update_ts = extra_data[4] - - def compute_action(self, observation): - return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess, - np.array(observation)[None], - 0.0)[0] + return LinearSchedule( + schedule_timesteps=int(self.config["exploration_fraction"] * + self.config["schedule_max_timesteps"]), + initial_p=self.config["noise_scale"] * 1.0, + final_p=self.config["noise_scale"] * + self.config["exploration_final_eps"]) diff --git a/python/ray/rllib/ddpg/ddpg_evaluator.py b/python/ray/rllib/ddpg/ddpg_evaluator.py deleted file mode 100644 index 5a68c4b583ee..000000000000 --- a/python/ray/rllib/ddpg/ddpg_evaluator.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from gym.spaces import Box -import numpy as np -import tensorflow as tf - -import ray -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.ddpg import models -from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule -from ray.rllib.optimizers import SampleBatch, PolicyEvaluator -from ray.rllib.utils.compression import pack -from ray.rllib.dqn.dqn_evaluator import adjust_nstep -from ray.rllib.dqn.common.wrappers import wrap_dqn - - -class DDPGEvaluator(PolicyEvaluator): - """The base DDPG Evaluator.""" - - def __init__(self, registry, env_creator, config, logdir, worker_index): - env = env_creator(config["env_config"]) - env = wrap_dqn(registry, env, config["model"], config["random_starts"]) - self.env = env - self.config = config - - # when env.action_space is of Box type, e.g., Pendulum-v0 - # action_space.low is [-2.0], high is [2.0] - # take action by calling, e.g., env.step([3.5]) - if not isinstance(env.action_space, Box): - raise UnsupportedSpaceException( - "Action space {} is not supported for DDPG.".format( - env.action_space)) - - tf_config = tf.ConfigProto(**config["tf_session_args"]) - self.sess = tf.Session(config=tf_config) - self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir) - - # Use either a different `eps` per worker, or a linear schedule. 
- if config["per_worker_exploration"]: - assert config["num_workers"] > 1, "This requires multiple workers" - self.exploration = ConstantSchedule( - config["noise_scale"] * 0.4 ** - (1 + worker_index / float(config["num_workers"] - 1) * 7)) - else: - self.exploration = LinearSchedule( - schedule_timesteps=int(config["exploration_fraction"] * - config["schedule_max_timesteps"]), - initial_p=config["noise_scale"] * 1.0, - final_p=config["noise_scale"] * - config["exploration_final_eps"]) - - # Initialize the parameters and copy them to the target network. - self.sess.run(tf.global_variables_initializer()) - # hard instead of soft - self.ddpg_graph.update_target(self.sess, 1.0) - self.global_timestep = 0 - self.local_timestep = 0 - - # Note that this encompasses both the policy and Q-value networks and - # their corresponding target networks - self.variables = ray.experimental.TensorFlowVariables( - tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess) - - self.episode_rewards = [0.0] - self.episode_lengths = [0.0] - self.saved_mean_reward = None - - self.obs = self.env.reset() - - def set_global_timestep(self, global_timestep): - self.global_timestep = global_timestep - - def update_target(self): - self.ddpg_graph.update_target(self.sess) - - def sample(self): - obs, actions, rewards, new_obs, dones = [], [], [], [], [] - for _ in range( - self.config["sample_batch_size"] + self.config["n_step"] - 1): - ob, act, rew, ob1, done = self._step(self.global_timestep) - obs.append(ob) - actions.append(act) - rewards.append(rew) - new_obs.append(ob1) - dones.append(done) - - # N-step Q adjustments - if self.config["n_step"] > 1: - # Adjust for steps lost from truncation - self.local_timestep -= (self.config["n_step"] - 1) - adjust_nstep(self.config["n_step"], self.config["gamma"], obs, - actions, rewards, new_obs, dones) - - batch = SampleBatch({ - "obs": [pack(np.array(o)) for o in obs], - "actions": actions, - "rewards": rewards, - "new_obs": [pack(np.array(o)) for o in new_obs], - "dones": dones, - "weights": np.ones_like(rewards) - }) - assert (batch.count == self.config["sample_batch_size"]) - - # Prioritize on the worker side - if self.config["worker_side_prioritization"]: - td_errors = self.ddpg_graph.compute_td_error( - self.sess, obs, batch["actions"], batch["rewards"], new_obs, - batch["dones"], batch["weights"]) - new_priorities = ( - np.abs(td_errors) + self.config["prioritized_replay_eps"]) - batch.data["weights"] = new_priorities - - return batch - - def compute_gradients(self, samples): - td_err, grads = self.ddpg_graph.compute_gradients( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return grads, {"td_error": td_err} - - def apply_gradients(self, grads): - self.ddpg_graph.apply_gradients(self.sess, grads) - - def compute_apply(self, samples): - td_error = self.ddpg_graph.compute_apply( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return {"td_error": td_error} - - def get_weights(self): - return self.variables.get_weights() - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def _step(self, global_timestep): - """Takes a single step, and returns the result of the step.""" - action = self.ddpg_graph.act( - self.sess, - np.array(self.obs)[None], - self.exploration.value(global_timestep))[0] - new_obs, rew, done, _ = self.env.step(action) - ret = (self.obs, action, rew, new_obs, 
float(done)) - self.obs = new_obs - self.episode_rewards[-1] += rew - self.episode_lengths[-1] += 1 - if done: - self.obs = self.env.reset() - self.episode_rewards.append(0.0) - self.episode_lengths.append(0.0) - # reset UO noise for each episode - self.ddpg_graph.reset_noise(self.sess) - - self.local_timestep += 1 - return ret - - def stats(self): - n = self.config["smoothing_num_episodes"] + 1 - mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5) - mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5) - exploration = self.exploration.value(self.global_timestep) - return { - "mean_100ep_reward": mean_100ep_reward, - "mean_100ep_length": mean_100ep_length, - "num_episodes": len(self.episode_rewards), - "exploration": exploration, - "local_timestep": self.local_timestep, - } - - def save(self): - return [ - self.exploration, self.episode_rewards, self.episode_lengths, - self.saved_mean_reward, self.obs, self.global_timestep, - self.local_timestep - ] - - def restore(self, data): - self.exploration = data[0] - self.episode_rewards = data[1] - self.episode_lengths = data[2] - self.saved_mean_reward = data[3] - self.obs = data[4] - self.global_timestep = data[5] - self.local_timestep = data[6] diff --git a/python/ray/rllib/ddpg/ddpg_policy_graph.py b/python/ray/rllib/ddpg/ddpg_policy_graph.py new file mode 100644 index 000000000000..51572659b4e9 --- /dev/null +++ b/python/ray/rllib/ddpg/ddpg_policy_graph.py @@ -0,0 +1,327 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from gym.spaces import Box +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers + +import ray +from ray.rllib.dqn.dqn_policy_graph import _huber_loss, _minimize_and_clip, \ + _scope_vars, _postprocess_dqn +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph + + +A_SCOPE = "a_func" +P_SCOPE = "p_func" +P_TARGET_SCOPE = "target_p_func" +Q_SCOPE = "q_func" +Q_TARGET_SCOPE = "target_q_func" + + +def _build_p_network(registry, inputs, dim_actions, config): + """ + map an observation (i.e., state) to an action where + each entry takes value from (0, 1) due to the sigmoid function + """ + frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) + + hiddens = config["actor_hiddens"] + action_out = frontend.last_layer + for hidden in hiddens: + action_out = layers.fully_connected( + action_out, num_outputs=hidden, activation_fn=tf.nn.relu) + # Use sigmoid layer to bound values within (0, 1) + # shape of action_scores is [batch_size, dim_actions] + action_scores = layers.fully_connected( + action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) + + return action_scores + + +# As a stochastic policy for inference, but a deterministic policy for training +# thus ignore batch_size issue when constructing a stochastic action +def _build_action_network(p_values, low_action, high_action, stochastic, eps, + theta, sigma): + # shape is [None, dim_action] + deterministic_actions = (high_action - low_action) * p_values + low_action + + exploration_sample = tf.get_variable( + name="ornstein_uhlenbeck", + dtype=tf.float32, + initializer=low_action.size * [.0], + trainable=False) + normal_sample = tf.random_normal( + shape=[low_action.size], mean=0.0, stddev=1.0) + exploration_value = tf.assign_add( + exploration_sample, + theta * (.0 - exploration_sample) + sigma * normal_sample) + 
stochastic_actions = deterministic_actions + eps * ( + high_action - low_action) * exploration_value + + return tf.cond(stochastic, lambda: stochastic_actions, + lambda: deterministic_actions) + + +def _build_q_network(registry, inputs, action_inputs, config): + frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) + + hiddens = config["critic_hiddens"] + + q_out = tf.concat([frontend.last_layer, action_inputs], axis=1) + for hidden in hiddens: + q_out = layers.fully_connected( + q_out, num_outputs=hidden, activation_fn=tf.nn.relu) + q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None) + + return q_scores + + +class DDPGPolicyGraph(TFPolicyGraph): + def __init__(self, observation_space, action_space, registry, config): + if not isinstance(action_space, Box): + raise UnsupportedSpaceException( + "Action space {} is not supported for DDPG.".format( + action_space)) + + self.config = config + self.cur_epsilon = 1.0 + dim_actions = action_space.shape[0] + low_action = action_space.low + high_action = action_space.high + self.actor_optimizer = tf.train.AdamOptimizer( + learning_rate=config["actor_lr"]) + self.critic_optimizer = tf.train.AdamOptimizer( + learning_rate=config["critic_lr"]) + + # Action inputs + self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") + self.eps = tf.placeholder(tf.float32, (), name="eps") + self.cur_observations = tf.placeholder( + tf.float32, shape=(None, ) + observation_space.shape) + + # Actor: P (policy) network + with tf.variable_scope(P_SCOPE) as scope: + p_values = _build_p_network(registry, self.cur_observations, + dim_actions, config) + self.p_func_vars = _scope_vars(scope.name) + + # Action outputs + with tf.variable_scope(A_SCOPE): + self.output_actions = _build_action_network( + p_values, low_action, high_action, self.stochastic, self.eps, + config["exploration_theta"], config["exploration_sigma"]) + + with tf.variable_scope(A_SCOPE, reuse=True): + exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") + self.reset_noise_op = tf.assign(exploration_sample, + dim_actions * [.0]) + + # Replay inputs + self.obs_t = tf.placeholder( + tf.float32, + shape=(None, ) + observation_space.shape, + name="observation") + self.act_t = tf.placeholder( + tf.float32, shape=(None, ) + action_space.shape, name="action") + self.rew_t = tf.placeholder(tf.float32, [None], name="reward") + self.obs_tp1 = tf.placeholder( + tf.float32, shape=(None, ) + observation_space.shape) + self.done_mask = tf.placeholder(tf.float32, [None], name="done") + self.importance_weights = tf.placeholder( + tf.float32, [None], name="weight") + + # p network evaluation + with tf.variable_scope(P_SCOPE, reuse=True) as scope: + self.p_t = _build_p_network( + registry, self.obs_t, dim_actions, config) + + # target p network evaluation + with tf.variable_scope(P_TARGET_SCOPE) as scope: + p_tp1 = _build_p_network( + registry, self.obs_tp1, dim_actions, config) + target_p_func_vars = _scope_vars(scope.name) + + # Action outputs + with tf.variable_scope(A_SCOPE, reuse=True): + deterministic_flag = tf.constant(value=False, dtype=tf.bool) + zero_eps = tf.constant(value=.0, dtype=tf.float32) + output_actions = _build_action_network( + self.p_t, low_action, high_action, deterministic_flag, + zero_eps, config["exploration_theta"], + config["exploration_sigma"]) + + output_actions_estimated = _build_action_network( + p_tp1, low_action, high_action, deterministic_flag, + zero_eps, config["exploration_theta"], + config["exploration_sigma"]) + + # q 
network evaluation + with tf.variable_scope(Q_SCOPE) as scope: + q_t = _build_q_network( + registry, self.obs_t, self.act_t, config) + self.q_func_vars = _scope_vars(scope.name) + with tf.variable_scope(Q_SCOPE, reuse=True): + q_tp0 = _build_q_network( + registry, self.obs_t, output_actions, config) + + # target q network evalution + with tf.variable_scope(Q_TARGET_SCOPE) as scope: + q_tp1 = _build_q_network( + registry, self.obs_tp1, output_actions_estimated, config) + target_q_func_vars = _scope_vars(scope.name) + + q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) + + q_tp1_best = tf.squeeze( + input=q_tp1, axis=len(q_tp1.shape) - 1) + q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best + + # compute RHS of bellman equation + q_t_selected_target = ( + self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked) + + # compute the error (potentially clipped) + self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) + if config.get("use_huber"): + errors = _huber_loss(self.td_error, config.get("huber_threshold")) + else: + errors = 0.5 * tf.square(self.td_error) + + self.loss = tf.reduce_mean(self.importance_weights * errors) + + # for policy gradient + self.actor_loss = -1.0 * tf.reduce_mean(q_tp0) + + if config["l2_reg"] is not None: + for var in self.p_func_vars: + if "bias" not in var.name: + self.actor_loss += ( + config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) + for var in self.q_func_vars: + if "bias" not in var.name: + self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss( + var) + + # update_target_fn will be called periodically to copy Q network to + # target Q network + self.tau_value = config.get("tau") + self.tau = tf.placeholder(tf.float32, (), name="tau") + update_target_expr = [] + for var, var_target in zip( + sorted(self.q_func_vars, key=lambda v: v.name), + sorted(target_q_func_vars, key=lambda v: v.name)): + update_target_expr.append( + var_target.assign(self.tau * var + + (1.0 - self.tau) * var_target)) + for var, var_target in zip( + sorted(self.p_func_vars, key=lambda v: v.name), + sorted(target_p_func_vars, key=lambda v: v.name)): + update_target_expr.append( + var_target.assign(self.tau * var + + (1.0 - self.tau) * var_target)) + self.update_target_expr = tf.group(*update_target_expr) + + self.sess = tf.get_default_session() + self.loss_inputs = [ + ("obs", self.obs_t), + ("actions", self.act_t), + ("rewards", self.rew_t), + ("new_obs", self.obs_tp1), + ("dones", self.done_mask), + ("weights", self.importance_weights), + ] + self.is_training = tf.placeholder_with_default(True, ()) + TFPolicyGraph.__init__( + self, self.sess, obs_input=self.cur_observations, + action_sampler=self.output_actions, loss=self.loss, + loss_inputs=self.loss_inputs, is_training=self.is_training) + self.sess.run(tf.global_variables_initializer()) + + # Note that this encompasses both the policy and Q-value networks and + # their corresponding target networks + self.variables = ray.experimental.TensorFlowVariables( + tf.group(q_tp0, q_tp1), self.sess) + + # Hard initial update + self.update_target(tau=1.0) + + def gradients(self, optimizer): + if self.config["grad_norm_clipping"] is not None: + actor_grads_and_vars = _minimize_and_clip( + self.actor_optimizer, + self.actor_loss, + var_list=self.p_func_vars, + clip_val=self.config["grad_norm_clipping"]) + critic_grads_and_vars = _minimize_and_clip( + self.critic_optimizer, + self.loss, + var_list=self.q_func_vars, + clip_val=self.config["grad_norm_clipping"]) + else: + actor_grads_and_vars = 
self.actor_optimizer.compute_gradients( + self.actor_loss, var_list=self.p_func_vars) + critic_grads_and_vars = self.critic_optimizer.compute_gradients( + self.loss, var_list=self.q_func_vars) + actor_grads_and_vars = [ + (g, v) for (g, v) in actor_grads_and_vars if g is not None] + critic_grads_and_vars = [ + (g, v) for (g, v) in critic_grads_and_vars if g is not None] + grads_and_vars = actor_grads_and_vars + critic_grads_and_vars + return grads_and_vars + + def extra_compute_action_feed_dict(self): + return { + self.stochastic: True, + self.eps: self.cur_epsilon, + } + + def extra_compute_grad_fetches(self): + return { + "td_error": self.td_error, + } + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + return _postprocess_dqn(self, sample_batch) + + def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, + importance_weights): + td_err = self.sess.run( + self.td_error, + feed_dict={ + self.obs_t: [np.array(ob) for ob in obs_t], + self.act_t: act_t, + self.rew_t: rew_t, + self.obs_tp1: [np.array(ob) for ob in obs_tp1], + self.done_mask: done_mask, + self.importance_weights: importance_weights + }) + return td_err + + def reset_noise(self, sess): + sess.run(self.reset_noise_op) + + # support both hard and soft sync + def update_target(self, tau=None): + return self.sess.run( + self.update_target_expr, + feed_dict={self.tau: tau or self.tau_value}) + + def set_epsilon(self, epsilon): + self.cur_epsilon = epsilon + + def get_weights(self): + return self.variables.get_weights() + + def set_weights(self, weights): + self.variables.set_weights(weights) + + def get_state(self): + return [TFPolicyGraph.get_state(self), self.cur_epsilon] + + def set_state(self, state): + TFPolicyGraph.set_state(self, state[0]) + self.set_epsilon(state[1]) diff --git a/python/ray/rllib/ddpg/models.py b/python/ray/rllib/ddpg/models.py deleted file mode 100644 index d58f37dc6417..000000000000 --- a/python/ray/rllib/ddpg/models.py +++ /dev/null @@ -1,391 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf -import tensorflow.contrib.layers as layers - -from ray.rllib.models import ModelCatalog - - -def _build_p_network(registry, inputs, dim_actions, config): - """ - map an observation (i.e., state) to an action where - each entry takes value from (0, 1) due to the sigmoid function - """ - frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) - - hiddens = config["actor_hiddens"] - action_out = frontend.last_layer - for hidden in hiddens: - action_out = layers.fully_connected( - action_out, num_outputs=hidden, activation_fn=tf.nn.relu) - # Use sigmoid layer to bound values within (0, 1) - # shape of action_scores is [batch_size, dim_actions] - action_scores = layers.fully_connected( - action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) - - return action_scores - - -# As a stochastic policy for inference, but a deterministic policy for training -# thus ignore batch_size issue when constructing a stochastic action -def _build_action_network(p_values, low_action, high_action, stochastic, eps, - theta, sigma): - # shape is [None, dim_action] - deterministic_actions = (high_action - low_action) * p_values + low_action - - exploration_sample = tf.get_variable( - name="ornstein_uhlenbeck", - dtype=tf.float32, - initializer=low_action.size * [.0], - trainable=False) - normal_sample = tf.random_normal( - shape=[low_action.size], mean=0.0, 
stddev=1.0) - exploration_value = tf.assign_add( - exploration_sample, - theta * (.0 - exploration_sample) + sigma * normal_sample) - stochastic_actions = deterministic_actions + eps * ( - high_action - low_action) * exploration_value - - return tf.cond(stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) - - -def _build_q_network(registry, inputs, action_inputs, config): - frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"]) - - hiddens = config["critic_hiddens"] - - q_out = tf.concat([frontend.last_layer, action_inputs], axis=1) - for hidden in hiddens: - q_out = layers.fully_connected( - q_out, num_outputs=hidden, activation_fn=tf.nn.relu) - q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None) - - return q_scores - - -def _huber_loss(x, delta=1.0): - """Reference: https://en.wikipedia.org/wiki/Huber_loss""" - return tf.where( - tf.abs(x) < delta, - tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta)) - - -def _minimize_and_clip(optimizer, objective, var_list, clip_val=10): - """Minimized `objective` using `optimizer` w.r.t. variables in - `var_list` while ensure the norm of the gradients for each - variable is clipped to `clip_val` - """ - gradients = optimizer.compute_gradients(objective, var_list=var_list) - for i, (grad, var) in enumerate(gradients): - if grad is not None: - gradients[i] = (tf.clip_by_norm(grad, clip_val), var) - return gradients - - -def _scope_vars(scope, trainable_only=False): - """ - Get variables inside a scope - The scope can be specified as a string - - Parameters - ---------- - scope: str or VariableScope - scope in which the variables reside. - trainable_only: bool - whether or not to return only the variables that were marked as - trainable. - - Returns - ------- - vars: [tf.Variable] - list of variables in `scope`. - """ - return tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES - if trainable_only else tf.GraphKeys.VARIABLES, - scope=scope if isinstance(scope, str) else scope.name) - - -class ModelAndLoss(object): - """Holds the model and loss function. - - Both graphs are necessary in order for the multi-gpu SGD implementation - to create towers on each device. 
- """ - - def __init__(self, registry, dim_actions, low_action, high_action, config, - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): - # p network evaluation - with tf.variable_scope("p_func", reuse=True) as scope: - self.p_t = _build_p_network(registry, obs_t, dim_actions, config) - - # target p network evaluation - with tf.variable_scope("target_p_func") as scope: - self.p_tp1 = _build_p_network(registry, obs_tp1, dim_actions, - config) - self.target_p_func_vars = _scope_vars(scope.name) - - # Action outputs - with tf.variable_scope("a_func", reuse=True): - deterministic_flag = tf.constant(value=False, dtype=tf.bool) - zero_eps = tf.constant(value=.0, dtype=tf.float32) - output_actions = _build_action_network( - self.p_t, low_action, high_action, deterministic_flag, - zero_eps, config["exploration_theta"], - config["exploration_sigma"]) - - output_actions_estimated = _build_action_network( - self.p_tp1, low_action, high_action, deterministic_flag, - zero_eps, config["exploration_theta"], - config["exploration_sigma"]) - - # q network evaluation - with tf.variable_scope("q_func") as scope: - self.q_t = _build_q_network(registry, obs_t, act_t, config) - self.q_func_vars = _scope_vars(scope.name) - with tf.variable_scope("q_func", reuse=True): - self.q_tp0 = _build_q_network(registry, obs_t, output_actions, - config) - - # target q network evalution - with tf.variable_scope("target_q_func") as scope: - self.q_tp1 = _build_q_network(registry, obs_tp1, - output_actions_estimated, config) - self.target_q_func_vars = _scope_vars(scope.name) - - q_t_selected = tf.squeeze(self.q_t, axis=len(self.q_t.shape) - 1) - - q_tp1_best = tf.squeeze( - input=self.q_tp1, axis=len(self.q_tp1.shape) - 1) - q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = ( - rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked) - - # compute the error (potentially clipped) - self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - if config.get("use_huber"): - errors = _huber_loss(self.td_error, config.get("huber_threshold")) - else: - errors = 0.5 * tf.square(self.td_error) - - weighted_error = tf.reduce_mean(importance_weights * errors) - - self.loss = weighted_error - - # for policy gradient - self.actor_loss = -1.0 * tf.reduce_mean(self.q_tp0) - - -class DDPGGraph(object): - def __init__(self, registry, env, config, logdir): - self.env = env - dim_actions = env.action_space.shape[0] - low_action = env.action_space.low - high_action = env.action_space.high - actor_optimizer = tf.train.AdamOptimizer( - learning_rate=config["actor_lr"]) - critic_optimizer = tf.train.AdamOptimizer( - learning_rate=config["critic_lr"]) - - # Action inputs - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.eps = tf.placeholder(tf.float32, (), name="eps") - self.cur_observations = tf.placeholder( - tf.float32, shape=(None, ) + env.observation_space.shape) - - # Actor: P (policy) network - p_scope_name = "p_func" - with tf.variable_scope(p_scope_name) as scope: - p_values = _build_p_network(registry, self.cur_observations, - dim_actions, config) - p_func_vars = _scope_vars(scope.name) - - # Action outputs - a_scope_name = "a_func" - with tf.variable_scope(a_scope_name): - self.output_actions = _build_action_network( - p_values, low_action, high_action, self.stochastic, self.eps, - config["exploration_theta"], config["exploration_sigma"]) - - with tf.variable_scope(a_scope_name, reuse=True): - exploration_sample = 
tf.get_variable(name="ornstein_uhlenbeck") - self.reset_noise_op = tf.assign(exploration_sample, - dim_actions * [.0]) - - # Replay inputs - self.obs_t = tf.placeholder( - tf.float32, - shape=(None, ) + env.observation_space.shape, - name="observation") - self.act_t = tf.placeholder( - tf.float32, shape=(None, ) + env.action_space.shape, name="action") - self.rew_t = tf.placeholder(tf.float32, [None], name="reward") - self.obs_tp1 = tf.placeholder( - tf.float32, shape=(None, ) + env.observation_space.shape) - self.done_mask = tf.placeholder(tf.float32, [None], name="done") - self.importance_weights = tf.placeholder( - tf.float32, [None], name="weight") - - def build_loss(obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - return ModelAndLoss(registry, dim_actions, low_action, high_action, - config, obs_t, act_t, rew_t, obs_tp1, - done_mask, importance_weights) - - self.loss_inputs = [ - ("obs", self.obs_t), - ("actions", self.act_t), - ("rewards", self.rew_t), - ("new_obs", self.obs_tp1), - ("dones", self.done_mask), - ("weights", self.importance_weights), - ] - - loss_obj = build_loss(self.obs_t, self.act_t, self.rew_t, self.obs_tp1, - self.done_mask, self.importance_weights) - - self.build_loss = build_loss - - actor_loss = loss_obj.actor_loss - weighted_error = loss_obj.loss - q_func_vars = loss_obj.q_func_vars - target_p_func_vars = loss_obj.target_p_func_vars - target_q_func_vars = loss_obj.target_q_func_vars - self.p_t = loss_obj.p_t - self.q_t = loss_obj.q_t - self.q_tp0 = loss_obj.q_tp0 - self.q_tp1 = loss_obj.q_tp1 - self.td_error = loss_obj.td_error - - if config["l2_reg"] is not None: - for var in p_func_vars: - if "bias" not in var.name: - actor_loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var) - for var in q_func_vars: - if "bias" not in var.name: - weighted_error += config["l2_reg"] * 0.5 * tf.nn.l2_loss( - var) - - # compute optimization op (potentially with gradient clipping) - if config["grad_norm_clipping"] is not None: - self.actor_grads_and_vars = _minimize_and_clip( - actor_optimizer, - actor_loss, - var_list=p_func_vars, - clip_val=config["grad_norm_clipping"]) - self.critic_grads_and_vars = _minimize_and_clip( - critic_optimizer, - weighted_error, - var_list=q_func_vars, - clip_val=config["grad_norm_clipping"]) - else: - self.actor_grads_and_vars = actor_optimizer.compute_gradients( - actor_loss, var_list=p_func_vars) - self.critic_grads_and_vars = critic_optimizer.compute_gradients( - weighted_error, var_list=q_func_vars) - self.actor_grads_and_vars = [(g, v) - for (g, v) in self.actor_grads_and_vars - if g is not None] - self.critic_grads_and_vars = [(g, v) - for (g, v) in self.critic_grads_and_vars - if g is not None] - self.grads_and_vars = ( - self.actor_grads_and_vars + self.critic_grads_and_vars) - self.grads = [g for (g, v) in self.grads_and_vars] - self.actor_train_expr = actor_optimizer.apply_gradients( - self.actor_grads_and_vars) - self.critic_train_expr = critic_optimizer.apply_gradients( - self.critic_grads_and_vars) - - # update_target_fn will be called periodically to copy Q network to - # target Q network - self.tau_value = config.get("tau") - self.tau = tf.placeholder(tf.float32, (), name="tau") - update_target_expr = [] - for var, var_target in zip( - sorted(q_func_vars, key=lambda v: v.name), - sorted(target_q_func_vars, key=lambda v: v.name)): - update_target_expr.append( - var_target.assign(self.tau * var + - (1.0 - self.tau) * var_target)) - for var, var_target in zip( - sorted(p_func_vars, key=lambda v: v.name), - 
sorted(target_p_func_vars, key=lambda v: v.name)): - update_target_expr.append( - var_target.assign(self.tau * var + - (1.0 - self.tau) * var_target)) - self.update_target_expr = tf.group(*update_target_expr) - - # support both hard and soft sync - def update_target(self, sess, tau=None): - return sess.run( - self.update_target_expr, - feed_dict={self.tau: tau or self.tau_value}) - - def act(self, sess, obs, eps, stochastic=True): - return sess.run( - self.output_actions, - feed_dict={ - self.cur_observations: obs, - self.stochastic: stochastic, - self.eps: eps - }) - - def compute_gradients(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, grads = sess.run( - [self.td_error, self.grads], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err, grads - - def compute_td_error(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err = sess.run( - self.td_error, - feed_dict={ - self.obs_t: [np.array(ob) for ob in obs_t], - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: [np.array(ob) for ob in obs_tp1], - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err - - def apply_gradients(self, sess, grads): - assert len(grads) == len(self.grads_and_vars) - feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)} - sess.run( - [self.critic_train_expr, self.actor_train_expr], - feed_dict=feed_dict) - - def compute_apply(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, _, _ = sess.run( - [self.td_error, self.critic_train_expr, self.actor_train_expr], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err - - def reset_noise(self, sess): - sess.run(self.reset_noise_op) diff --git a/python/ray/rllib/ddpg2/ddpg_evaluator.py b/python/ray/rllib/ddpg2/ddpg_evaluator.py index 8a5ab5ed3f3a..5ba71028ce89 100644 --- a/python/ray/rllib/ddpg2/ddpg_evaluator.py +++ b/python/ray/rllib/ddpg2/ddpg_evaluator.py @@ -9,7 +9,7 @@ from ray.rllib.models.catalog import ModelCatalog from ray.rllib.optimizers import PolicyEvaluator from ray.rllib.utils.filter import NoFilter -from ray.rllib.utils.process_rollout import process_rollout +from ray.rllib.utils.process_rollout import compute_advantages from ray.rllib.utils.sampler import SyncSampler @@ -34,9 +34,7 @@ def sample(self): # since each sample is one step, no discounting needs to be applied; # this does not involve config["gamma"] - samples = process_rollout( - rollout, NoFilter(), - gamma=1.0, use_gae=False) + samples = compute_advantages(rollout, 0.0, gamma=1.0, use_gae=False) return samples diff --git a/python/ray/rllib/ddpg2/models.py b/python/ray/rllib/ddpg2/models.py index e785f518f541..855964ffb19c 100644 --- a/python/ray/rllib/ddpg2/models.py +++ b/python/ray/rllib/ddpg2/models.py @@ -227,7 +227,7 @@ def set_weights(self, weights): self.critic_vars.set_weights(critic_weights) self.actor_vars.set_weights(actor_weights) - def compute(self, ob): + def compute_single_action(self, ob, h, is_training): """Returns action, given state.""" flattened_ob = np.reshape(ob, [-1, np.prod(ob.shape)]) action = self.sess.run(self.output_action, {self.obs: flattened_ob}) @@ -235,7 +235,10 @@ def compute(self, ob): action += self.epsilon * 
self.rand_process.sample() if (self.epsilon > 0): self.epsilon -= self.config["noise_epsilon"] - return action[0], {} + return action[0], [], {} def value(self, *args): return 0 + + def get_initial_state(self): + return [] diff --git a/python/ray/rllib/dqn/apex.py b/python/ray/rllib/dqn/apex.py index 6de53203770c..34e6ecd912ef 100644 --- a/python/ray/rllib/dqn/apex.py +++ b/python/ray/rllib/dqn/apex.py @@ -9,26 +9,26 @@ APEX_DEFAULT_CONFIG = merge_dicts( DQN_CONFIG, { - 'optimizer_class': 'ApexOptimizer', - 'optimizer_config': + "optimizer_class": "ApexOptimizer", + "optimizer_config": merge_dicts( - DQN_CONFIG['optimizer_config'], { - 'max_weight_sync_delay': 400, - 'num_replay_buffer_shards': 4, - 'debug': False + DQN_CONFIG["optimizer_config"], { + "max_weight_sync_delay": 400, + "num_replay_buffer_shards": 4, + "debug": False }), - 'n_step': 3, - 'gpu': True, - 'num_workers': 32, - 'buffer_size': 2000000, - 'learning_starts': 50000, - 'train_batch_size': 512, - 'sample_batch_size': 50, - 'max_weight_sync_delay': 400, - 'target_network_update_freq': 500000, - 'timesteps_per_iteration': 25000, - 'per_worker_exploration': True, - 'worker_side_prioritization': True, + "n_step": 3, + "gpu": True, + "num_workers": 32, + "buffer_size": 2000000, + "learning_starts": 50000, + "train_batch_size": 512, + "sample_batch_size": 50, + "max_weight_sync_delay": 400, + "target_network_update_freq": 500000, + "timesteps_per_iteration": 25000, + "per_worker_exploration": True, + "worker_side_prioritization": True, }, ) diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py index 0bf99cf1ff2a..9d2f698cf162 100644 --- a/python/ray/rllib/dqn/dqn.py +++ b/python/ray/rllib/dqn/dqn.py @@ -5,14 +5,13 @@ import pickle import os -import numpy as np -import tensorflow as tf - import ray from ray.rllib import optimizers -from ray.rllib.dqn.dqn_evaluator import DQNEvaluator +from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule +from ray.rllib.dqn.dqn_policy_graph import DQNPolicyGraph +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ + collect_metrics from ray.rllib.agent import Agent -from ray.tune.result import TrainingResult from ray.tune.trial import Resources @@ -24,101 +23,84 @@ DEFAULT_CONFIG = { # === Model === # Whether to use dueling dqn - 'dueling': True, + "dueling": True, # Whether to use double dqn - 'double_q': True, + "double_q": True, # Hidden layer sizes of the state and action value networks - 'hiddens': [256], + "hiddens": [256], # N-step Q learning - 'n_step': 1, + "n_step": 1, # Config options to pass to the model constructor - 'model': {}, + "model": {}, # Discount factor for the MDP - 'gamma': 0.99, + "gamma": 0.99, # Arguments to pass to the env creator - 'env_config': {}, + "env_config": {}, # === Exploration === # Max num timesteps for annealing schedules. Exploration is annealed from # 1.0 to exploration_fraction over this number of timesteps scaled by # exploration_fraction - 'schedule_max_timesteps': 100000, + "schedule_max_timesteps": 100000, # Number of env steps to optimize for before returning - 'timesteps_per_iteration': 1000, + "timesteps_per_iteration": 1000, # Fraction of entire training period over which the exploration rate is # annealed - 'exploration_fraction': 0.1, + "exploration_fraction": 0.1, # Final value of random action probability - 'exploration_final_eps': 0.02, + "exploration_final_eps": 0.02, # Update the target network every `target_network_update_freq` steps. 
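Two target-network sync styles appear in this patch: DQN hard-copies the online Q weights into the target network every `target_network_update_freq` timesteps, while the DDPG graph blends them with a `tau` factor (soft update). A minimal NumPy sketch of both rules, using made-up weights rather than the actual TF variables:

import numpy as np

def sync_target(online, target, tau=1.0):
    # tau=1.0 reproduces the hard copy used by DQN's update_target();
    # 0 < tau < 1 gives the soft (Polyak) update used by DDPG.
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online, target)]

online_weights = [np.ones(3), np.full(3, 2.0)]
target_weights = [np.zeros(3), np.zeros(3)]
target_weights = sync_target(online_weights, target_weights, tau=0.001)  # soft
target_weights = sync_target(online_weights, target_weights, tau=1.0)    # hard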
- 'target_network_update_freq': 500, - # Whether to start with random actions instead of noops. - 'random_starts': True, + "target_network_update_freq": 500, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then # each worker will have a replay buffer of this size. - 'buffer_size': 50000, + "buffer_size": 50000, # If True prioritized replay buffer will be used. - 'prioritized_replay': True, + "prioritized_replay": True, # Alpha parameter for prioritized replay buffer. - 'prioritized_replay_alpha': 0.6, + "prioritized_replay_alpha": 0.6, # Beta parameter for sampling from prioritized replay buffer. - 'prioritized_replay_beta': 0.4, + "prioritized_replay_beta": 0.4, # Epsilon to add to the TD errors when updating priorities. - 'prioritized_replay_eps': 1e-6, + "prioritized_replay_eps": 1e-6, # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer. - 'clip_rewards': True, + "clip_rewards": True, # === Optimization === # Learning rate for adam optimizer - 'lr': 5e-4, + "lr": 5e-4, # If not None, clip gradients during optimization at this value - 'grad_norm_clipping': 40, + "grad_norm_clipping": 40, # How many steps of the model to sample before learning starts. - 'learning_starts': 1000, + "learning_starts": 1000, # Update the replay buffer with this many samples at once. Note that # this setting applies per-worker if num_workers > 1. - 'sample_batch_size': 4, + "sample_batch_size": 4, # Size of a batched sampled from replay buffer for training. Note that # if async_updates is set, then each worker returns gradients for a # batch of this size. - 'train_batch_size': 32, - # Smooth the current average reward over this many previous episodes. - 'smoothing_num_episodes': 100, - - # === Tensorflow === - # Arguments to pass to tensorflow - 'tf_session_args': { - "device_count": {"CPU": 2}, - "log_device_placement": False, - "allow_soft_placement": True, - "gpu_options": { - "allow_growth": True - }, - "inter_op_parallelism_threads": 1, - "intra_op_parallelism_threads": 1, - }, + "train_batch_size": 32, # === Parallelism === # Whether to use a GPU for local optimization. - 'gpu': False, + "gpu": False, # Number of workers for collecting samples with. This only makes sense # to increase if your environment is particularly slow to sample, or if - # you're using the Async or Ape-X optimizers. - 'num_workers': 0, + # you"re using the Async or Ape-X optimizers. + "num_workers": 0, # Whether to allocate GPUs for workers (if > 0). - 'num_gpus_per_worker': 0, + "num_gpus_per_worker": 0, # Whether to allocate CPUs for workers (if > 0). - 'num_cpus_per_worker': 1, + "num_cpus_per_worker": 1, # Optimizer class to use. - 'optimizer_class': "LocalSyncReplayOptimizer", + "optimizer_class": "LocalSyncReplayOptimizer", # Config to pass to the optimizer. - 'optimizer_config': {}, + "optimizer_config": {}, # Whether to use a distribution of epsilons across workers for exploration. - 'per_worker_exploration': False, + "per_worker_exploration": False, # Whether to compute priorities on workers. 
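The prioritized-replay keys above follow the standard proportional-prioritization scheme: priorities are absolute TD errors plus `prioritized_replay_eps`, sampling probabilities are the `alpha`-exponentiated priorities, and the `weights` column fed to the loss is the `beta`-corrected, max-normalized importance-sampling factor. A small NumPy sketch of those formulas (the math the replay buffer implements, not the buffer itself):

import numpy as np

def per_sampling_terms(td_errors, alpha=0.6, beta=0.4, eps=1e-6):
    priorities = np.abs(td_errors) + eps           # proportional priorities
    probs = priorities ** alpha
    probs = probs / probs.sum()                    # sampling probabilities
    weights = (len(td_errors) * probs) ** (-beta)  # importance-sampling terms
    return probs, weights / weights.max()          # max-normalized weights

probs, weights = per_sampling_terms(np.array([0.5, 0.1, 2.0, 0.0]))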
- 'worker_side_prioritization': False + "worker_side_prioritization": False } @@ -127,6 +109,7 @@ class DQNAgent(Agent): _allow_unknown_subkeys = [ "model", "optimizer", "tf_session_args", "env_config"] _default_config = DEFAULT_CONFIG + _policy_graph = DQNPolicyGraph @classmethod def default_resource_request(cls, config): @@ -137,16 +120,31 @@ def default_resource_request(cls, config): extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"]) def _init(self): - self.local_evaluator = DQNEvaluator( - self.registry, self.env_creator, self.config, self.logdir, 0) - remote_cls = ray.remote( + adjusted_batch_size = ( + self.config["sample_batch_size"] + self.config["n_step"] - 1) + self.local_evaluator = CommonPolicyEvaluator( + self.env_creator, self._policy_graph, + batch_steps=adjusted_batch_size, + batch_mode="pack_episodes", preprocessor_pref="deepmind", + compress_observations=True, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) + remote_cls = CommonPolicyEvaluator.as_remote( num_cpus=self.config["num_cpus_per_worker"], - num_gpus=self.config["num_gpus_per_worker"])( - DQNEvaluator) + num_gpus=self.config["num_gpus_per_worker"]) self.remote_evaluators = [ remote_cls.remote( - self.registry, self.env_creator, self.config, self.logdir, - i) + self.env_creator, self._policy_graph, + batch_steps=adjusted_batch_size, + batch_mode="pack_episodes", preprocessor_pref="deepmind", + compress_observations=True, + registry=self.registry, env_config=self.config["env_config"], + model_config=self.config["model"], policy_config=self.config) + for _ in range(self.config["num_workers"])] + + self.exploration0 = self._make_exploration_schedule(0) + self.explorations = [ + self._make_exploration_schedule(i) for i in range(self.config["num_workers"])] for k in OPTIMIZER_SHARED_CONFIGS: @@ -157,10 +155,25 @@ def _init(self): self.config["optimizer_config"], self.local_evaluator, self.remote_evaluators) - self.saver = tf.train.Saver(max_to_keep=None) self.last_target_update_ts = 0 self.num_target_updates = 0 + def _make_exploration_schedule(self, worker_index): + # Use either a different `eps` per worker, or a linear schedule. 
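The `adjusted_batch_size` computed in `_init` above over-collects by `n_step - 1` steps because the n-step rewrite later drops that many trailing transitions from each fragment; with the Ape-X defaults from this diff:

sample_batch_size = 50                     # Ape-X default in this diff
n_step = 3                                 # Ape-X default in this diff
adjusted_batch_size = sample_batch_size + n_step - 1   # 52 env steps collected
# the n-step rewrite then drops the trailing n_step - 1 rows:
assert adjusted_batch_size - (n_step - 1) == sample_batch_size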
+ if self.config["per_worker_exploration"]: + assert self.config["num_workers"] > 1, \ + "This requires multiple workers" + return ConstantSchedule( + 0.4 ** ( + 1 + worker_index / float( + self.config["num_workers"] - 1) * 7)) + return LinearSchedule( + schedule_timesteps=int( + self.config["exploration_fraction"] * + self.config["schedule_max_timesteps"]), + initial_p=1.0, + final_p=self.config["exploration_final_eps"]) + @property def global_timestep(self): return self.optimizer.num_steps_sampled @@ -168,7 +181,7 @@ def global_timestep(self): def update_target_if_needed(self): if self.global_timestep - self.last_target_update_ts > \ self.config["target_network_update_freq"]: - self.local_evaluator.update_target() + self.local_evaluator.for_policy(lambda p: p.update_target()) self.last_target_update_ts = self.global_timestep self.num_target_updates += 1 @@ -177,58 +190,25 @@ def _train(self): while (self.global_timestep - start_timestep < self.config["timesteps_per_iteration"]): - self.optimizer.step() self.update_target_if_needed() - self.local_evaluator.set_global_timestep(self.global_timestep) - for e in self.remote_evaluators: - e.set_global_timestep.remote(self.global_timestep) - - return self._train_stats(start_timestep) - - def _train_stats(self, start_timestep): - if self.remote_evaluators: - stats = ray.get([ - e.stats.remote() for e in self.remote_evaluators]) - else: - stats = self.local_evaluator.stats() - if not isinstance(stats, list): - stats = [stats] - - mean_100ep_reward = 0.0 - mean_100ep_length = 0.0 - num_episodes = 0 - explorations = [] - - if self.config["per_worker_exploration"]: - # Return stats from workers with the lowest 20% of exploration - test_stats = stats[-int(max(1, len(stats)*0.2)):] - else: - test_stats = stats - - for s in test_stats: - mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats) - mean_100ep_length += s["mean_100ep_length"] / len(test_stats) - - for s in stats: - num_episodes += s["num_episodes"] - explorations.append(s["exploration"]) - - opt_stats = self.optimizer.stats() - - result = TrainingResult( - episode_reward_mean=mean_100ep_reward, - episode_len_mean=mean_100ep_length, - episodes_total=num_episodes, - timesteps_this_iter=self.global_timestep - start_timestep, + exp_vals = [self.exploration0.value(self.global_timestep)] + self.local_evaluator.for_policy( + lambda p: p.set_epsilon(exp_vals[0])) + for i, e in enumerate(self.remote_evaluators): + exp_val = self.explorations[i].value(self.global_timestep) + e.for_policy.remote(lambda p: p.set_epsilon(exp_val)) + exp_vals.append(exp_val) + + result = collect_metrics( + self.local_evaluator, self.remote_evaluators) + return result._replace( info=dict({ - "min_exploration": min(explorations), - "max_exploration": max(explorations), + "min_exploration": min(exp_vals), + "max_exploration": max(exp_vals), "num_target_updates": self.num_target_updates, - }, **opt_stats)) - - return result + }, **self.optimizer.stats())) def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 @@ -236,10 +216,8 @@ def _stop(self): ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): - checkpoint_path = self.saver.save( - self.local_evaluator.sess, - os.path.join(checkpoint_dir, "checkpoint"), - global_step=self.iteration) + checkpoint_path = os.path.join( + checkpoint_dir, "checkpoint-{}".format(self.iteration)) extra_data = [ self.local_evaluator.save(), ray.get([e.save.remote() for e in self.remote_evaluators]), @@ -250,7 +228,6 @@ def _save(self, checkpoint_dir): 
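A plain-Python sketch of the two schedules `_make_exploration_schedule` chooses between, using the default values from this diff (`exploration_fraction` 0.1, `schedule_max_timesteps` 100000, final epsilon 0.02, and the Ape-X `num_workers` of 32); the helper names are illustrative:

def per_worker_eps(worker_index, num_workers):
    # ConstantSchedule branch: epsilons spread geometrically from 0.4 down
    # to 0.4 ** 8 (~0.00066) across the workers.
    return 0.4 ** (1 + worker_index / float(num_workers - 1) * 7)

def linear_eps(t, exploration_fraction=0.1, schedule_max_timesteps=100000,
               final_eps=0.02):
    # LinearSchedule branch: anneal from 1.0 to final_eps, then hold.
    horizon = exploration_fraction * schedule_max_timesteps
    frac = min(t / horizon, 1.0)
    return 1.0 + frac * (final_eps - 1.0)

eps_per_worker = [per_worker_eps(i, 32) for i in range(32)]  # 0.4 ... 0.00066
eps_at_5000 = linear_eps(5000)  # 0.51, halfway through the 10k-step anneal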
return checkpoint_path def _restore(self, checkpoint_path): - self.saver.restore(self.local_evaluator.sess, checkpoint_path) extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb")) self.local_evaluator.restore(extra_data[0]) ray.get([ @@ -260,6 +237,9 @@ def _restore(self, checkpoint_path): self.num_target_updates = extra_data[3] self.last_target_update_ts = extra_data[4] - def compute_action(self, observation): - return self.local_evaluator.dqn_graph.act( - self.local_evaluator.sess, np.array(observation)[None], 0.0)[0] + def compute_action(self, observation, state=None): + if state is None: + state = [] + return self.local_evaluator.for_policy( + lambda p: p.compute_single_action( + observation, state, is_training=False)[0]) diff --git a/python/ray/rllib/dqn/dqn_evaluator.py b/python/ray/rllib/dqn/dqn_evaluator.py deleted file mode 100644 index 758dc5f819d4..000000000000 --- a/python/ray/rllib/dqn/dqn_evaluator.py +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from gym.spaces import Discrete -import numpy as np -import tensorflow as tf - -import ray -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.dqn import models -from ray.rllib.dqn.common.wrappers import wrap_dqn -from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule -from ray.rllib.optimizers import SampleBatch, PolicyEvaluator -from ray.rllib.utils.compression import pack - - -def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): - """Rewrites the given trajectory fragments to encode n-step rewards. - - reward[i] = ( - reward[i] * gamma**0 + - reward[i+1] * gamma**1 + - ... + - reward[i+n_step-1] * gamma**(n_step-1)) - - The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs. - - If the episode finishes, the reward will be truncated. After this rewrite, - all the arrays will be shortened by (n_step - 1). - """ - for i in range(len(rewards) - n_step + 1): - if dones[i]: - continue # episode end - for j in range(1, n_step): - new_obs[i] = new_obs[i + j] - rewards[i] += gamma ** j * rewards[i + j] - if dones[i + j]: - break # episode end - # truncate ends of the trajectory - new_len = len(obs) - n_step + 1 - for arr in [obs, actions, rewards, new_obs, dones]: - del arr[new_len:] - - -class DQNEvaluator(PolicyEvaluator): - """The DQN Evaluator. - - TODO(rliaw): Support observation/reward filters?""" - - def __init__(self, registry, env_creator, config, logdir, worker_index): - env = env_creator(config["env_config"]) - env = wrap_dqn(registry, env, config["model"], config["random_starts"]) - self.env = env - self.config = config - - if not isinstance(env.action_space, Discrete): - raise UnsupportedSpaceException( - "Action space {} is not supported for DQN.".format( - env.action_space)) - - tf_config = tf.ConfigProto(**config["tf_session_args"]) - self.sess = tf.Session(config=tf_config) - self.dqn_graph = models.DQNGraph(registry, env, config, logdir) - - # Use either a different `eps` per worker, or a linear schedule. 
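Across this patch, `compute_action` on the agents now delegates to the policy's `compute_single_action`, which returns an `(action, rnn_state_out, info)` tuple; the agent keeps only the action. A minimal stub illustrating that convention (`RandomPolicy` is purely illustrative, not an RLlib class):

import random

class RandomPolicy(object):
    """Illustrative stub, not an RLlib class."""

    def get_initial_state(self):
        return []  # no RNN state

    def compute_single_action(self, obs, state, is_training=False):
        # (action, rnn_state_out, extra_info), matching the new convention
        return random.randint(0, 1), [], {}

policy = RandomPolicy()
state = policy.get_initial_state()
action = policy.compute_single_action([0.0, 0.0], state, is_training=False)[0]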
- if config["per_worker_exploration"]: - assert config["num_workers"] > 1, "This requires multiple workers" - self.exploration = ConstantSchedule( - 0.4 ** ( - 1 + worker_index / float(config["num_workers"] - 1) * 7)) - else: - self.exploration = LinearSchedule( - schedule_timesteps=int( - config["exploration_fraction"] * - config["schedule_max_timesteps"]), - initial_p=1.0, - final_p=config["exploration_final_eps"]) - - # Initialize the parameters and copy them to the target network. - self.sess.run(tf.global_variables_initializer()) - self.dqn_graph.update_target(self.sess) - self.global_timestep = 0 - self.local_timestep = 0 - - # Note that this encompasses both the Q and target network - self.variables = ray.experimental.TensorFlowVariables( - tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess) - - self.episode_rewards = [0.0] - self.episode_lengths = [0.0] - self.saved_mean_reward = None - - self.obs = self.env.reset() - - def set_global_timestep(self, global_timestep): - self.global_timestep = global_timestep - - def update_target(self): - self.dqn_graph.update_target(self.sess) - - def sample(self): - obs, actions, rewards, new_obs, dones = [], [], [], [], [] - for _ in range( - self.config["sample_batch_size"] + self.config["n_step"] - 1): - ob, act, rew, ob1, done = self._step(self.global_timestep) - obs.append(ob) - actions.append(act) - rewards.append(rew) - new_obs.append(ob1) - dones.append(done) - - # N-step Q adjustments - if self.config["n_step"] > 1: - # Adjust for steps lost from truncation - self.local_timestep -= (self.config["n_step"] - 1) - adjust_nstep( - self.config["n_step"], self.config["gamma"], - obs, actions, rewards, new_obs, dones) - - batch = SampleBatch({ - "obs": [pack(np.array(o)) for o in obs], "actions": actions, - "rewards": rewards, - "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones, - "weights": np.ones_like(rewards)}) - assert (batch.count == self.config["sample_batch_size"]) - - # Prioritize on the worker side - if self.config["worker_side_prioritization"]: - td_errors = self.dqn_graph.compute_td_error( - self.sess, obs, batch["actions"], batch["rewards"], - new_obs, batch["dones"], batch["weights"]) - new_priorities = ( - np.abs(td_errors) + self.config["prioritized_replay_eps"]) - batch.data["weights"] = new_priorities - - return batch - - def compute_gradients(self, samples): - td_err, grads = self.dqn_graph.compute_gradients( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return grads, {"td_error": td_err} - - def apply_gradients(self, grads): - self.dqn_graph.apply_gradients(self.sess, grads) - - def compute_apply(self, samples): - td_error = self.dqn_graph.compute_apply( - self.sess, samples["obs"], samples["actions"], samples["rewards"], - samples["new_obs"], samples["dones"], samples["weights"]) - return {"td_error": td_error} - - def get_weights(self): - return self.variables.get_weights() - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def _step(self, global_timestep): - """Takes a single step, and returns the result of the step.""" - action = self.dqn_graph.act( - self.sess, np.array(self.obs)[None], - self.exploration.value(global_timestep))[0] - new_obs, rew, done, _ = self.env.step(action) - ret = (self.obs, action, rew, new_obs, float(done)) - self.obs = new_obs - self.episode_rewards[-1] += rew - self.episode_lengths[-1] += 1 - if done: - self.obs = self.env.reset() - 
self.episode_rewards.append(0.0) - self.episode_lengths.append(0.0) - self.local_timestep += 1 - return ret - - def stats(self): - n = self.config["smoothing_num_episodes"] + 1 - mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5) - mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5) - exploration = self.exploration.value(self.global_timestep) - return { - "mean_100ep_reward": mean_100ep_reward, - "mean_100ep_length": mean_100ep_length, - "num_episodes": len(self.episode_rewards), - "exploration": exploration, - "local_timestep": self.local_timestep, - } - - def save(self): - return [ - self.exploration, - self.episode_rewards, - self.episode_lengths, - self.saved_mean_reward, - self.obs, - self.global_timestep, - self.local_timestep] - - def restore(self, data): - self.exploration = data[0] - self.episode_rewards = data[1] - self.episode_lengths = data[2] - self.saved_mean_reward = data[3] - self.obs = data[4] - self.global_timestep = data[5] - self.local_timestep = data[6] diff --git a/python/ray/rllib/dqn/models.py b/python/ray/rllib/dqn/dqn_policy_graph.py similarity index 51% rename from python/ray/rllib/dqn/models.py rename to python/ray/rllib/dqn/dqn_policy_graph.py index 6629b6126acf..75c1d06f0e4e 100644 --- a/python/ray/rllib/dqn/models.py +++ b/python/ray/rllib/dqn/dqn_policy_graph.py @@ -2,13 +2,240 @@ from __future__ import division from __future__ import print_function +from gym.spaces import Discrete import numpy as np - import tensorflow as tf import tensorflow.contrib.layers as layers from ray.rllib.models import ModelCatalog -from ray.rllib.optimizers.multi_gpu_impl import TOWER_SCOPE_NAME +from ray.rllib.optimizers.sample_batch import SampleBatch +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph + + +Q_SCOPE = "q_func" +Q_TARGET_SCOPE = "target_q_func" + + +def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): + """Rewrites the given trajectory fragments to encode n-step rewards. + + reward[i] = ( + reward[i] * gamma**0 + + reward[i+1] * gamma**1 + + ... + + reward[i+n_step-1] * gamma**(n_step-1)) + + The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs. + + If the episode finishes, the reward will be truncated. After this rewrite, + all the arrays will be shortened by (n_step - 1). 
+ """ + for i in range(len(rewards) - n_step + 1): + if dones[i]: + continue # episode end + for j in range(1, n_step): + new_obs[i] = new_obs[i + j] + rewards[i] += gamma ** j * rewards[i + j] + if dones[i + j]: + break # episode end + # truncate ends of the trajectory + new_len = len(obs) - n_step + 1 + for arr in [obs, actions, rewards, new_obs, dones]: + del arr[new_len:] + + +class DQNPolicyGraph(TFPolicyGraph): + def __init__(self, observation_space, action_space, registry, config): + if not isinstance(action_space, Discrete): + raise UnsupportedSpaceException( + "Action space {} is not supported for DQN.".format( + action_space)) + + self.config = config + self.cur_epsilon = 1.0 + num_actions = action_space.n + + # Action inputs + self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") + self.eps = tf.placeholder(tf.float32, (), name="eps") + self.cur_observations = tf.placeholder( + tf.float32, shape=(None,) + observation_space.shape) + + # Action Q network + with tf.variable_scope(Q_SCOPE) as scope: + q_values = _build_q_network( + registry, self.cur_observations, num_actions, config) + self.q_func_vars = _scope_vars(scope.name) + + # Action outputs + self.output_actions = _build_action_network( + q_values, + self.cur_observations, + num_actions, + self.stochastic, + self.eps) + + # Replay inputs + self.obs_t = tf.placeholder( + tf.float32, shape=(None,) + observation_space.shape) + self.act_t = tf.placeholder(tf.int32, [None], name="action") + self.rew_t = tf.placeholder(tf.float32, [None], name="reward") + self.obs_tp1 = tf.placeholder( + tf.float32, shape=(None,) + observation_space.shape) + self.done_mask = tf.placeholder(tf.float32, [None], name="done") + self.importance_weights = tf.placeholder( + tf.float32, [None], name="weight") + + # q network evaluation + with tf.variable_scope(Q_SCOPE, reuse=True): + q_t = _build_q_network( + registry, self.obs_t, num_actions, config) + + # target q network evalution + with tf.variable_scope(Q_TARGET_SCOPE) as scope: + q_tp1 = _build_q_network( + registry, self.obs_tp1, num_actions, config) + self.target_q_func_vars = _scope_vars(scope.name) + + # q scores for actions which we know were selected in the given state. 
+ q_t_selected = tf.reduce_sum( + q_t * tf.one_hot(self.act_t, num_actions), 1) + + # compute estimate of best possible value starting from state at t + 1 + if config["double_q"]: + with tf.variable_scope(Q_SCOPE, reuse=True): + q_tp1_using_online_net = _build_q_network( + registry, self.obs_tp1, num_actions, config) + q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) + q_tp1_best = tf.reduce_sum( + q_tp1 * tf.one_hot( + q_tp1_best_using_online_net, num_actions), 1) + else: + q_tp1_best = tf.reduce_max(q_tp1, 1) + q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best + + # compute RHS of bellman equation + q_t_selected_target = ( + self.rew_t + + config["gamma"] ** config["n_step"] * q_tp1_best_masked) + + # compute the error (potentially clipped) + self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) + self.loss = tf.reduce_mean( + self.importance_weights * _huber_loss(self.td_error)) + + # update_target_fn will be called periodically to copy Q network to + # target Q network + update_target_expr = [] + for var, var_target in zip( + sorted(self.q_func_vars, key=lambda v: v.name), + sorted(self.target_q_func_vars, key=lambda v: v.name)): + update_target_expr.append(var_target.assign(var)) + self.update_target_expr = tf.group(*update_target_expr) + + # initialize TFPolicyGraph + self.sess = tf.get_default_session() + self.loss_inputs = [ + ("obs", self.obs_t), + ("actions", self.act_t), + ("rewards", self.rew_t), + ("new_obs", self.obs_tp1), + ("dones", self.done_mask), + ("weights", self.importance_weights), + ] + self.is_training = tf.placeholder_with_default(True, ()) + TFPolicyGraph.__init__( + self, self.sess, obs_input=self.cur_observations, + action_sampler=self.output_actions, loss=self.loss, + loss_inputs=self.loss_inputs, is_training=self.is_training) + self.sess.run(tf.global_variables_initializer()) + + def optimizer(self): + return tf.train.AdamOptimizer(learning_rate=self.config["lr"]) + + def gradients(self, optimizer): + if self.config["grad_norm_clipping"] is not None: + grads_and_vars = _minimize_and_clip( + optimizer, self.loss, var_list=self.q_func_vars, + clip_val=self.config["grad_norm_clipping"]) + else: + grads_and_vars = optimizer.compute_gradients( + self.loss, var_list=self.q_func_vars) + grads_and_vars = [ + (g, v) for (g, v) in grads_and_vars if g is not None] + return grads_and_vars + + def extra_compute_action_feed_dict(self): + return { + self.stochastic: True, + self.eps: self.cur_epsilon, + } + + def extra_compute_grad_fetches(self): + return { + "td_error": self.td_error, + } + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + return _postprocess_dqn(self, sample_batch) + + def compute_td_error( + self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): + td_err = self.sess.run( + self.td_error, + feed_dict={ + self.obs_t: [np.array(ob) for ob in obs_t], + self.act_t: act_t, + self.rew_t: rew_t, + self.obs_tp1: [np.array(ob) for ob in obs_tp1], + self.done_mask: done_mask, + self.importance_weights: importance_weights + }) + return td_err + + def update_target(self): + return self.sess.run(self.update_target_expr) + + def set_epsilon(self, epsilon): + self.cur_epsilon = epsilon + + def get_state(self): + return [TFPolicyGraph.get_state(self), self.cur_epsilon] + + def set_state(self, state): + TFPolicyGraph.set_state(self, state[0]) + self.set_epsilon(state[1]) + + +def _postprocess_dqn(policy_graph, sample_batch): + obs, actions, rewards, new_obs, dones = [ + list(x) for x in 
sample_batch.columns( + ["obs", "actions", "rewards", "new_obs", "dones"])] + + # N-step Q adjustments + if policy_graph.config["n_step"] > 1: + adjust_nstep( + policy_graph.config["n_step"], policy_graph.config["gamma"], + obs, actions, rewards, new_obs, dones) + + batch = SampleBatch({ + "obs": obs, "actions": actions, "rewards": rewards, + "new_obs": new_obs, "dones": dones, + "weights": np.ones_like(rewards)}) + assert batch.count == policy_graph.config["sample_batch_size"], \ + (batch.count, policy_graph.config["sample_batch_size"]) + + # Prioritize on the worker side + if policy_graph.config["worker_side_prioritization"]: + td_errors = policy_graph.compute_td_error( + batch["obs"], batch["actions"], batch["rewards"], + batch["new_obs"], batch["dones"], batch["weights"]) + new_priorities = ( + np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"]) + batch.data["weights"] = new_priorities + + return batch def _build_q_network(registry, inputs, num_actions, config): @@ -98,205 +325,3 @@ def _scope_vars(scope, trainable_only=False): tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, scope=scope if isinstance(scope, str) else scope.name) - - -class ModelAndLoss(object): - """Holds the model and loss function. - - Both graphs are necessary in order for the multi-gpu SGD implementation - to create towers on each device. - """ - - def __init__( - self, registry, num_actions, config, - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): - # q network evaluation - with tf.variable_scope("q_func", reuse=True): - self.q_t = _build_q_network(registry, obs_t, num_actions, config) - - # target q network evalution - with tf.variable_scope("target_q_func") as scope: - self.q_tp1 = _build_q_network( - registry, obs_tp1, num_actions, config) - self.target_q_func_vars = _scope_vars(scope.name) - - # q scores for actions which we know were selected in the given state. 
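The `double_q` branch above decouples action selection (online network) from action evaluation (target network) when forming the Bellman target. A NumPy sketch of that computation next to the single-estimator variant, with made-up Q values:

import numpy as np

def bellman_target(rew, done, q_tp1_target, q_tp1_online=None,
                   gamma=0.99, n_step=1):
    if q_tp1_online is not None:
        # Double DQN: the online net picks the action, the target net scores it.
        best_act = np.argmax(q_tp1_online, axis=1)
        q_best = q_tp1_target[np.arange(len(rew)), best_act]
    else:
        # Vanilla DQN: the target net both picks and scores the action.
        q_best = np.max(q_tp1_target, axis=1)
    return rew + gamma ** n_step * (1.0 - done) * q_best

rew = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
q_target = np.array([[0.5, 2.0], [1.0, 3.0]])
q_online = np.array([[2.0, 0.1], [0.0, 1.0]])
double_target = bellman_target(rew, done, q_target, q_online)  # [1.495, 0.0]
single_target = bellman_target(rew, done, q_target)            # [2.98, 0.0]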
- q_t_selected = tf.reduce_sum( - self.q_t * tf.one_hot(act_t, num_actions), 1) - - # compute estimate of best possible value starting from state at t + 1 - if config["double_q"]: - with tf.variable_scope("q_func", reuse=True): - q_tp1_using_online_net = _build_q_network( - registry, obs_tp1, num_actions, config) - q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) - q_tp1_best = tf.reduce_sum( - self.q_tp1 * tf.one_hot( - q_tp1_best_using_online_net, num_actions), 1) - else: - q_tp1_best = tf.reduce_max(self.q_tp1, 1) - q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = ( - rew_t + config["gamma"] ** config["n_step"] * q_tp1_best_masked) - - # compute the error (potentially clipped) - self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - errors = _huber_loss(self.td_error) - - weighted_error = tf.reduce_mean(importance_weights * errors) - - self.loss = weighted_error - - -class DQNGraph(object): - def __init__(self, registry, env, config, logdir): - self.env = env - num_actions = env.action_space.n - optimizer = tf.train.AdamOptimizer(learning_rate=config["lr"]) - - # Action inputs - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.eps = tf.placeholder(tf.float32, (), name="eps") - self.cur_observations = tf.placeholder( - tf.float32, shape=(None,) + env.observation_space.shape) - - # Action Q network - q_scope_name = TOWER_SCOPE_NAME + "/q_func" - with tf.variable_scope(q_scope_name) as scope: - q_values = _build_q_network( - registry, self.cur_observations, num_actions, config) - q_func_vars = _scope_vars(scope.name) - - # Action outputs - self.output_actions = _build_action_network( - q_values, - self.cur_observations, - num_actions, - self.stochastic, - self.eps) - - # Replay inputs - self.obs_t = tf.placeholder( - tf.float32, shape=(None,) + env.observation_space.shape) - self.act_t = tf.placeholder(tf.int32, [None], name="action") - self.rew_t = tf.placeholder(tf.float32, [None], name="reward") - self.obs_tp1 = tf.placeholder( - tf.float32, shape=(None,) + env.observation_space.shape) - self.done_mask = tf.placeholder(tf.float32, [None], name="done") - self.importance_weights = tf.placeholder( - tf.float32, [None], name="weight") - - def build_loss( - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): - return ModelAndLoss( - registry, - num_actions, config, - obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights) - - self.loss_inputs = [ - ("obs", self.obs_t), - ("actions", self.act_t), - ("rewards", self.rew_t), - ("new_obs", self.obs_tp1), - ("dones", self.done_mask), - ("weights", self.importance_weights), - ] - - with tf.variable_scope(TOWER_SCOPE_NAME): - loss_obj = build_loss( - self.obs_t, self.act_t, self.rew_t, self.obs_tp1, - self.done_mask, self.importance_weights) - - self.build_loss = build_loss - - weighted_error = loss_obj.loss - target_q_func_vars = loss_obj.target_q_func_vars - self.q_t = loss_obj.q_t - self.q_tp1 = loss_obj.q_tp1 - self.td_error = loss_obj.td_error - - # compute optimization op (potentially with gradient clipping) - if config["grad_norm_clipping"] is not None: - self.grads_and_vars = _minimize_and_clip( - optimizer, weighted_error, var_list=q_func_vars, - clip_val=config["grad_norm_clipping"]) - else: - self.grads_and_vars = optimizer.compute_gradients( - weighted_error, var_list=q_func_vars) - self.grads_and_vars = [ - (g, v) for (g, v) in self.grads_and_vars if g is not None] - self.grads = [g for (g, v) in 
self.grads_and_vars] - self.train_expr = optimizer.apply_gradients(self.grads_and_vars) - - # update_target_fn will be called periodically to copy Q network to - # target Q network - update_target_expr = [] - for var, var_target in zip( - sorted(q_func_vars, key=lambda v: v.name), - sorted(target_q_func_vars, key=lambda v: v.name)): - update_target_expr.append(var_target.assign(var)) - self.update_target_expr = tf.group(*update_target_expr) - - def update_target(self, sess): - return sess.run(self.update_target_expr) - - def act(self, sess, obs, eps, stochastic=True): - return sess.run( - self.output_actions, - feed_dict={ - self.cur_observations: obs, - self.stochastic: stochastic, - self.eps: eps, - }) - - def compute_gradients( - self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, grads = sess.run( - [self.td_error, self.grads], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err, grads - - def compute_td_error( - self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err = sess.run( - self.td_error, - feed_dict={ - self.obs_t: [np.array(ob) for ob in obs_t], - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: [np.array(ob) for ob in obs_tp1], - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err - - def apply_gradients(self, sess, grads): - assert len(grads) == len(self.grads_and_vars) - feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)} - sess.run(self.train_expr, feed_dict=feed_dict) - - def compute_apply( - self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - td_err, _ = sess.run( - [self.td_error, self.train_expr], - feed_dict={ - self.obs_t: obs_t, - self.act_t: act_t, - self.rew_t: rew_t, - self.obs_tp1: obs_tp1, - self.done_mask: done_mask, - self.importance_weights: importance_weights - }) - return td_err diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py index 1d950506b0b7..14d8a9371090 100644 --- a/python/ray/rllib/models/lstm.py +++ b/python/ray/rllib/models/lstm.py @@ -35,8 +35,8 @@ def _init(self, inputs, num_outputs, options): lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True) step_size = tf.shape(self.x)[:1] - c_init = np.zeros((1, lstm.state_size.c), np.float32) - h_init = np.zeros((1, lstm.state_size.h), np.float32) + c_init = np.zeros(lstm.state_size.c, np.float32) + h_init = np.zeros(lstm.state_size.h, np.float32) self.state_init = [c_init, h_init] c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c]) h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h]) diff --git a/python/ray/rllib/models/pytorch/misc.py b/python/ray/rllib/models/pytorch/misc.py index dc725265cf87..cd54fc04b178 100644 --- a/python/ray/rllib/models/pytorch/misc.py +++ b/python/ray/rllib/models/pytorch/misc.py @@ -7,18 +7,14 @@ import torch -def convert_batch(trajectory, has_features=False): +def convert_batch(trajectory): """Convert trajectory from numpy to PT variable""" states = torch.from_numpy(trajectory["obs"]).float() acs = torch.from_numpy(trajectory["actions"]) advs = torch.from_numpy( trajectory["advantages"].copy()).float().reshape(-1) rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1) - if has_features: - features = [torch.from_numpy(f) for f in trajectory["features"]] - else: - features = trajectory["features"] - return states, acs, advs, rs, 
features + return states, acs, advs, rs def var_to_np(var): diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py index 1ff6bff3f05f..a7703eb46bd5 100644 --- a/python/ray/rllib/optimizers/multi_gpu_impl.py +++ b/python/ray/rllib/optimizers/multi_gpu_impl.py @@ -43,7 +43,7 @@ class LocalSyncParallelOptimizer(object): processed. build_loss: Function that takes the specified inputs and returns an object with a 'loss' property that is a scalar Tensor. For example, - ray.rllib.ppo.ProximalPolicyLoss. + ray.rllib.ppo.ProximalPolicyGraph. logdir: Directory to place debugging output in. grad_norm_clipping: None or int stdev to clip grad norms by """ diff --git a/python/ray/rllib/optimizers/policy_optimizer.py b/python/ray/rllib/optimizers/policy_optimizer.py index 1e31edc66ea1..9f705af98394 100644 --- a/python/ray/rllib/optimizers/policy_optimizer.py +++ b/python/ray/rllib/optimizers/policy_optimizer.py @@ -38,18 +38,24 @@ def make( Args: evaluator_cls (class): Python class of the evaluators to create. - evaluator_args (list): List of constructor args for the evaluators. + evaluator_args (list|dict): Constructor args for the evaluators. num_workers (int): Number of remote evaluators to create in addition to a local evaluator. This can be zero or greater. optimizer_config (dict): Keyword arguments to pass to the optimizer class constructor. """ - local_evaluator = evaluator_cls(*evaluator_args) remote_cls = ray.remote(**evaluator_resources)(evaluator_cls) - remote_evaluators = [ - remote_cls.remote(*evaluator_args) - for _ in range(num_workers)] + if isinstance(evaluator_args, list): + local_evaluator = evaluator_cls(*evaluator_args) + remote_evaluators = [ + remote_cls.remote(*evaluator_args) + for _ in range(num_workers)] + else: + local_evaluator = evaluator_cls(**evaluator_args) + remote_evaluators = [ + remote_cls.remote(**evaluator_args) + for _ in range(num_workers)] return cls(optimizer_config, local_evaluator, remote_evaluators) def __init__(self, config, local_evaluator, remote_evaluators): diff --git a/python/ray/rllib/optimizers/sample_batch.py b/python/ray/rllib/optimizers/sample_batch.py index 5e5e1e95b0b3..35f8eec254d4 100644 --- a/python/ray/rllib/optimizers/sample_batch.py +++ b/python/ray/rllib/optimizers/sample_batch.py @@ -2,17 +2,22 @@ from __future__ import division from __future__ import print_function +import collections import numpy as np -def arrayify(s): - if type(s) in [int, float, str, np.ndarray]: - return s - elif type(s) is list: - # recursive call to convert LazyFrames to arrays - return np.array([arrayify(x) for x in s]) - else: - return np.array(s) +class SampleBatchBuilder(object): + """Util to build a SampleBatch incrementally.""" + + def __init__(self): + self.buffers = collections.defaultdict(list) + + def add_values(self, **values): + for k, v in values.items(): + self.buffers[k].append(v) + + def build(self): + return SampleBatch({k: np.array(v) for k, v in self.buffers.items()}) class SampleBatch(object): diff --git a/python/ray/rllib/pg/pg.py b/python/ray/rllib/pg/pg.py index c3726f89f504..0e8912ff036b 100644 --- a/python/ray/rllib/pg/pg.py +++ b/python/ray/rllib/pg/pg.py @@ -2,13 +2,11 @@ from __future__ import division from __future__ import print_function -import numpy as np - -import ray -from ray.rllib.optimizers import LocalSyncOptimizer -from ray.rllib.pg.pg_evaluator import PGEvaluator from ray.rllib.agent import Agent -from ray.tune.result import TrainingResult +from ray.rllib.optimizers import 
LocalSyncOptimizer +from ray.rllib.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ + collect_metrics from ray.tune.trial import Resources @@ -33,7 +31,6 @@ class PGAgent(Agent): - """Simple policy gradient agent. This is an example agent to show how to implement algorithms in RLlib. @@ -50,34 +47,28 @@ def default_resource_request(cls, config): def _init(self): self.optimizer = LocalSyncOptimizer.make( - evaluator_cls=PGEvaluator, - evaluator_args=[self.registry, self.env_creator, self.config], + evaluator_cls=CommonPolicyEvaluator, + evaluator_args={ + "env_creator": self.env_creator, + "policy_graph": PGPolicyGraph, + "batch_steps": self.config["batch_size"], + "batch_mode": "truncate_episodes", + "registry": self.registry, + "model_config": self.config["model"], + "env_config": self.config["env_config"], + "policy_config": self.config, + }, num_workers=self.config["num_workers"], optimizer_config=self.config["optimizer"]) def _train(self): self.optimizer.step() - - episode_rewards = [] - episode_lengths = [] - metric_lists = [a.get_completed_rollout_metrics.remote() - for a in self.optimizer.remote_evaluators] - for metrics in metric_lists: - for episode in ray.get(metrics): - episode_lengths.append(episode.episode_length) - episode_rewards.append(episode.episode_reward) - avg_reward = np.mean(episode_rewards) - avg_length = np.mean(episode_lengths) - timesteps = np.sum(episode_lengths) - - result = TrainingResult( - episode_reward_mean=avg_reward, - episode_len_mean=avg_length, - timesteps_this_iter=timesteps, - info={}) - - return result - - def compute_action(self, obs): - action, info = self.optimizer.local_evaluator.policy.compute(obs) - return action + return collect_metrics( + self.optimizer.local_evaluator, self.optimizer.remote_evaluators) + + def compute_action(self, observation, state=None): + if state is None: + state = [] + return self.local_evaluator.for_policy( + lambda p: p.compute_single_action( + observation, state, is_training=False)[0]) diff --git a/python/ray/rllib/pg/pg_evaluator.py b/python/ray/rllib/pg/pg_evaluator.py deleted file mode 100644 index 1f217ba02855..000000000000 --- a/python/ray/rllib/pg/pg_evaluator.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.optimizers import PolicyEvaluator -from ray.rllib.pg.policy import PGPolicy -from ray.rllib.utils.filter import NoFilter -from ray.rllib.utils.process_rollout import process_rollout -from ray.rllib.utils.sampler import SyncSampler - - -class PGEvaluator(PolicyEvaluator): - """Evaluator for simple policy gradient.""" - - def __init__(self, registry, env_creator, config): - self.env = ModelCatalog.get_preprocessor_as_wrapper( - registry, env_creator(config["env_config"]), config["model"]) - self.config = config - - self.policy = PGPolicy(registry, self.env.observation_space, - self.env.action_space, config) - self.sampler = SyncSampler( - self.env, self.policy, NoFilter(), - config["batch_size"], horizon=config["horizon"]) - - def sample(self): - rollout = self.sampler.get_data() - samples = process_rollout( - rollout, NoFilter(), - gamma=self.config["gamma"], use_gae=False) - return samples - - def get_completed_rollout_metrics(self): - """Returns metrics on previously completed rollouts. - - Calling this clears the queue of completed rollout metrics. 
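A short usage sketch for the `SampleBatchBuilder` added in `optimizers/sample_batch.py` above: rows are appended one step at a time and `build()` stacks each column into a NumPy array (the values here are schematic):

import numpy as np
from ray.rllib.optimizers.sample_batch import SampleBatchBuilder

builder = SampleBatchBuilder()
for t in range(3):
    builder.add_values(
        obs=np.zeros(4), actions=t % 2, rewards=1.0, dones=(t == 2))
batch = builder.build()
# each column becomes a NumPy array, e.g. batch["obs"].shape == (3, 4)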
- """ - return self.sampler.get_metrics() - - def compute_gradients(self, samples): - """ Returns gradient w.r.t. samples.""" - gradient, info = self.policy.compute_gradients(samples) - return gradient, {} - - def apply_gradients(self, grads): - """Applies gradients to evaluator weights.""" - self.policy.apply_gradients(grads) - - def get_weights(self): - """Returns model weights.""" - return self.policy.get_weights() - - def set_weights(self, weights): - """Sets model weights.""" - return self.policy.set_weights(weights) diff --git a/python/ray/rllib/pg/pg_policy_graph.py b/python/ray/rllib/pg/pg_policy_graph.py new file mode 100644 index 000000000000..b605a513f39c --- /dev/null +++ b/python/ray/rllib/pg/pg_policy_graph.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.process_rollout import compute_advantages +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph + + +class PGPolicyGraph(TFPolicyGraph): + + def __init__(self, obs_space, action_space, registry, config): + self.config = config + + # setup policy + self.x = tf.placeholder(tf.float32, shape=[None]+list(obs_space.shape)) + dist_class, self.logit_dim = ModelCatalog.get_action_dist(action_space) + self.model = ModelCatalog.get_model( + registry, self.x, self.logit_dim, options=self.config["model"]) + self.dist = dist_class(self.model.outputs) # logit for each action + + # setup policy loss + self.ac = ModelCatalog.get_action_placeholder(action_space) + self.adv = tf.placeholder(tf.float32, [None], name="adv") + self.loss = -tf.reduce_mean(self.dist.logp(self.ac) * self.adv) + + # initialize TFPolicyGraph + self.sess = tf.get_default_session() + self.loss_in = [ + ("obs", self.x), + ("actions", self.ac), + ("advantages", self.adv), + ] + self.is_training = tf.placeholder_with_default(True, ()) + TFPolicyGraph.__init__( + self, self.sess, obs_input=self.x, + action_sampler=self.dist.sample(), loss=self.loss, + loss_inputs=self.loss_in, is_training=self.is_training) + self.sess.run(tf.global_variables_initializer()) + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + return compute_advantages( + sample_batch, 0.0, self.config["gamma"], use_gae=False) diff --git a/python/ray/rllib/pg/policy.py b/python/ray/rllib/pg/policy.py deleted file mode 100644 index cc53eebcbd84..000000000000 --- a/python/ray/rllib/pg/policy.py +++ /dev/null @@ -1,82 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -import ray -from ray.rllib.models.catalog import ModelCatalog - - -class PGPolicy(): - - other_output = [] - is_recurrent = False - - def __init__(self, registry, ob_space, ac_space, config): - self.config = config - self.registry = registry - with tf.variable_scope("local"): - self._setup_graph(ob_space, ac_space) - print("Setting up loss") - self._setup_loss(ac_space) - self._setup_gradients() - self.initialize() - - def _setup_graph(self, ob_space, ac_space): - self.x = tf.placeholder(tf.float32, shape=[None]+list(ob_space.shape)) - dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space) - self.model = ModelCatalog.get_model( - self.registry, self.x, self.logit_dim, - options=self.config["model"]) - self.action_logits = self.model.outputs # logit for each action - self.dist = dist_class(self.action_logits) - self.sample = 
self.dist.sample() - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - - def _setup_loss(self, action_space): - self.ac = ModelCatalog.get_action_placeholder(action_space) - self.adv = tf.placeholder(tf.float32, [None], name="adv") - - log_prob = self.dist.logp(self.ac) - - # policy loss - self.loss = -tf.reduce_mean(log_prob * self.adv) - - def _setup_gradients(self): - self.grads = tf.gradients(self.loss, self.var_list) - grads_and_vars = list(zip(self.grads, self.var_list)) - opt = tf.train.AdamOptimizer(self.config["lr"]) - self._apply_gradients = opt.apply_gradients(grads_and_vars) - - def initialize(self): - self.sess = tf.Session() - self.variables = ray.experimental.TensorFlowVariables( - self.loss, self.sess) - self.sess.run(tf.global_variables_initializer()) - - def compute_gradients(self, samples): - info = {} - feed_dict = { - self.x: samples["obs"], - self.ac: samples["actions"], - self.adv: samples["advantages"], - } - self.grads = [g for g in self.grads if g is not None] - grad = self.sess.run(self.grads, feed_dict=feed_dict) - return grad, info - - def apply_gradients(self, grads): - feed_dict = dict(zip(self.grads, grads)) - self.sess.run(self._apply_gradients, feed_dict=feed_dict) - - def get_weights(self): - return self.variables.get_weights() - - def set_weights(self, weights): - self.variables.set_weights(weights) - - def compute(self, ob, *args): - action = self.sess.run(self.sample, {self.x: [ob]}) - return action[0], {} diff --git a/python/ray/rllib/ppo/loss.py b/python/ray/rllib/ppo/loss.py index 3f69ff711692..f57502d18b40 100644 --- a/python/ray/rllib/ppo/loss.py +++ b/python/ray/rllib/ppo/loss.py @@ -7,7 +7,7 @@ from ray.rllib.models import ModelCatalog -class ProximalPolicyLoss(object): +class ProximalPolicyGraph(object): other_output = ["vf_preds", "logprobs"] is_recurrent = False @@ -82,11 +82,14 @@ def __init__( self.policy_results = [ self.sampler, self.curr_logits, tf.constant("NA")] - def compute(self, observation): + def compute_single_action(self, observation, features, is_training=False): action, logprobs, vf = self.sess.run( self.policy_results, feed_dict={self.observations: [observation]}) - return action[0], {"vf_preds": vf[0], "logprobs": logprobs[0]} + return action[0], [], {"vf_preds": vf[0], "logprobs": logprobs[0]} + + def get_initial_state(self): + return [] def loss(self): return self.loss diff --git a/python/ray/rllib/ppo/ppo.py b/python/ray/rllib/ppo/ppo.py index a8c695033e9a..2e82b5086535 100644 --- a/python/ray/rllib/ppo/ppo.py +++ b/python/ray/rllib/ppo/ppo.py @@ -172,7 +172,7 @@ def standardized(value): batch_index = 0 num_batches = ( int(tuples_per_device) // int(model.per_device_batch_size)) - loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], [] + loss, policy_graph, vf_loss, kl, entropy = [], [], [], [], [] permutation = np.random.permutation(num_batches) # Prepare to drop into the debugger if self.iteration == config["tf_debug_iteration"]: @@ -181,26 +181,26 @@ def standardized(value): full_trace = ( i == 0 and self.iteration == 0 and batch_index == config["full_trace_nth_sgd_batch"]) - batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \ + batch_loss, batch_policy_graph, batch_vf_loss, batch_kl, \ batch_entropy = model.run_sgd_minibatch( permutation[batch_index] * model.per_device_batch_size, self.kl_coeff, full_trace, self.file_writer) loss.append(batch_loss) - policy_loss.append(batch_policy_loss) + policy_graph.append(batch_policy_graph) 
vf_loss.append(batch_vf_loss) kl.append(batch_kl) entropy.append(batch_entropy) batch_index += 1 loss = np.mean(loss) - policy_loss = np.mean(policy_loss) + policy_graph = np.mean(policy_graph) vf_loss = np.mean(vf_loss) kl = np.mean(kl) entropy = np.mean(entropy) sgd_end = time.time() print( "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format( - i, loss, policy_loss, vf_loss, kl, entropy)) + i, loss, policy_graph, vf_loss, kl, entropy)) values = [] if i == config["num_sgd_iter"] - 1: @@ -299,4 +299,5 @@ def _restore(self, checkpoint_path): def compute_action(self, observation): observation = self.local_evaluator.obs_filter( observation, update=False) - return self.local_evaluator.common_policy.compute(observation)[0] + return self.local_evaluator.common_policy.compute_single_action( + observation, [], False)[0] diff --git a/python/ray/rllib/ppo/ppo_evaluator.py b/python/ray/rllib/ppo/ppo_evaluator.py index 434feb094d7e..a8ca6e54ca92 100644 --- a/python/ray/rllib/ppo/ppo_evaluator.py +++ b/python/ray/rllib/ppo/ppo_evaluator.py @@ -16,8 +16,8 @@ from ray.rllib.models import ModelCatalog from ray.rllib.utils.sampler import SyncSampler from ray.rllib.utils.filter import get_filter, MeanStdFilter -from ray.rllib.utils.process_rollout import process_rollout -from ray.rllib.ppo.loss import ProximalPolicyLoss +from ray.rllib.utils.process_rollout import compute_advantages +from ray.rllib.ppo.loss import ProximalPolicyGraph # TODO(rliaw): Move this onto LocalMultiGPUOptimizer @@ -86,7 +86,7 @@ def __init__(self, registry, env_creator, config, logdir, is_remote): self.per_device_batch_size = int(self.batch_size / len(devices)) def build_loss(obs, vtargets, advs, acts, plog, pvf_preds): - return ProximalPolicyLoss( + return ProximalPolicyGraph( self.env.observation_space, self.env.action_space, obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim, self.kl_coeff, self.distribution_class, self.config, @@ -190,8 +190,9 @@ def sample(self): while num_steps_so_far < self.config["min_steps_per_task"]: rollout = self.sampler.get_data() - samples = process_rollout( - rollout, self.rew_filter, self.config["gamma"], + last_r = 0.0 # note: not needed since we don't truncate rollouts + samples = compute_advantages( + rollout, last_r, self.config["gamma"], self.config["lambda"], use_gae=self.config["use_gae"]) num_steps_so_far += samples.count all_samples.append(samples) diff --git a/python/ray/rllib/test/test_checkpoint_restore.py b/python/ray/rllib/test/test_checkpoint_restore.py index 9e583c877bb9..68eeb27ea19e 100644 --- a/python/ray/rllib/test/test_checkpoint_restore.py +++ b/python/ray/rllib/test/test_checkpoint_restore.py @@ -17,18 +17,19 @@ def get_mean_action(alg, obs): return np.mean(out) -ray.init() +ray.init(num_cpus=10) CONFIGS = { - "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100}, + "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100, + "num_workers": 2}, "DQN": {}, - "DDPG": {"noise_scale": 0.0}, - "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000}, - "A3C": {"use_lstm": False}, + "DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100}, + "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2}, + "A3C": {"use_lstm": False, "num_workers": 1}, } -def test(use_object_store, alg_name): +def test(use_object_store, alg_name, failures): cls = get_agent_class(alg_name) if alg_name == "DDPG": alg1 = cls(config=CONFIGS[name], env="Pendulum-v0") @@ -55,12 +56,15 @@ def test(use_object_store, alg_name): a1 = get_mean_action(alg1, obs) a2 = 
get_mean_action(alg2, obs) print("Checking computed actions", alg1, obs, a1, a2) - assert abs(a1 - a2) < .1, (a1, a2) + if abs(a1 - a2) > .1: + failures.append((alg_name, [a1, a2])) if __name__ == "__main__": + failures = [] for use_object_store in [False, True]: for name in ["ES", "DQN", "DDPG", "PPO", "A3C"]: - test(use_object_store, name) + test(use_object_store, name, failures) + assert not failures, failures print("All checkpoint restore tests passed!") diff --git a/python/ray/rllib/test/test_common_policy_evaluator.py b/python/ray/rllib/test/test_common_policy_evaluator.py new file mode 100644 index 000000000000..9e70d2f812ca --- /dev/null +++ b/python/ray/rllib/test/test_common_policy_evaluator.py @@ -0,0 +1,133 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gym +import time +import unittest + +import ray +from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator +from ray.rllib.utils.policy_graph import PolicyGraph +from ray.rllib.utils.process_rollout import compute_advantages + + +class MockPolicyGraph(PolicyGraph): + def compute_actions(self, obs_batch, state_batches, is_training=False): + return [0] * len(obs_batch), [], {} + + def postprocess_trajectory(self, batch): + return compute_advantages(batch, 100.0, 0.9, use_gae=False) + + +class TestCommonPolicyEvaluator(unittest.TestCase): + def testBasic(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph) + batch = ev.sample() + for key in ["obs", "actions", "rewards", "dones", "advantages"]: + self.assertIn(key, batch) + self.assertGreater(batch["advantages"][0], 1) + + def testPackEpisodes(self): + for batch_size in [1, 10, 100, 1000]: + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=batch_size, + batch_mode="pack_episodes") + batch = ev.sample() + self.assertEqual(batch.count, batch_size) + + def testTruncateEpisodes(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=2, + batch_mode="truncate_episodes") + batch = ev.sample() + self.assertEqual(batch.count, 2) + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=1000, + batch_mode="truncate_episodes") + self.assertLess(batch.count, 200) + + def testCompleteEpisodes(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + batch_steps=2, + batch_mode="complete_episodes") + batch = ev.sample() + self.assertGreater(batch.count, 2) + self.assertTrue(batch["dones"][-1]) + batch = ev.sample() + self.assertGreater(batch.count, 2) + self.assertTrue(batch["dones"][-1]) + + def testFilterSync(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + sample_async=True, + observation_filter="ConcurrentMeanStdFilter") + time.sleep(2) + ev.sample() + filters = ev.get_filters(flush_after=True) + obs_f = filters["obs_filter"] + self.assertNotEqual(obs_f.rs.n, 0) + self.assertNotEqual(obs_f.buffer.n, 0) + + def testGetFilters(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + sample_async=True, + observation_filter="ConcurrentMeanStdFilter") + self.sample_and_flush(ev) + filters = ev.get_filters(flush_after=False) + 
time.sleep(2) + filters2 = ev.get_filters(flush_after=False) + obs_f = filters["obs_filter"] + obs_f2 = filters2["obs_filter"] + self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n) + self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n) + + def testSyncFilter(self): + ev = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=MockPolicyGraph, + sample_async=True, + observation_filter="ConcurrentMeanStdFilter") + obs_f = self.sample_and_flush(ev) + + # Current State + filters = ev.get_filters(flush_after=False) + obs_f = filters["obs_filter"] + + self.assertLessEqual(obs_f.buffer.n, 20) + + new_obsf = obs_f.copy() + new_obsf.rs._n = 100 + ev.sync_filters({"obs_filter": new_obsf}) + filters = ev.get_filters(flush_after=False) + obs_f = filters["obs_filter"] + self.assertGreaterEqual(obs_f.rs.n, 100) + self.assertLessEqual(obs_f.buffer.n, 20) + + def sample_and_flush(self, ev): + time.sleep(2) + ev.sample() + filters = ev.get_filters(flush_after=True) + obs_f = filters["obs_filter"] + self.assertNotEqual(obs_f.rs.n, 0) + self.assertNotEqual(obs_f.buffer.n, 0) + return obs_f + + +if __name__ == '__main__': + ray.init() + unittest.main(verbosity=2) diff --git a/python/ray/rllib/test/test_evaluators.py b/python/ray/rllib/test/test_evaluators.py index 29c054a0d418..d2abf1e6d65f 100644 --- a/python/ray/rllib/test/test_evaluators.py +++ b/python/ray/rllib/test/test_evaluators.py @@ -3,19 +3,11 @@ from __future__ import print_function import unittest -import gym -import shutil -import tempfile -import time -import ray -from ray.rllib.a3c import DEFAULT_CONFIG -from ray.rllib.a3c.a3c_evaluator import A3CEvaluator -from ray.rllib.dqn.dqn_evaluator import adjust_nstep -from ray.tune.registry import get_registry +from ray.rllib.dqn.dqn_policy_graph import adjust_nstep -class DQNEvaluatorTest(unittest.TestCase): +class DQNTest(unittest.TestCase): def testNStep(self): obs = [1, 2, 3, 4, 5, 6, 7] actions = ["a", "b", "a", "a", "a", "b", "a"] @@ -30,70 +22,5 @@ def testNStep(self): self.assertEqual(dones, [1, 0, 0, 0, 0]) -class A3CEvaluatorTest(unittest.TestCase): - - def setUp(self): - ray.init(num_cpus=1) - config = DEFAULT_CONFIG.copy() - config["num_workers"] = 1 - config["observation_filter"] = "ConcurrentMeanStdFilter" - config["reward_filter"] = "MeanStdFilter" - config["batch_size"] = 2 - self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test") - self.e = A3CEvaluator( - get_registry(), - lambda config: gym.make("CartPole-v0"), - config, - logdir=self._temp_dir) - - def tearDown(self): - ray.worker.cleanup() - shutil.rmtree(self._temp_dir) - - def sample_and_flush(self): - e = self.e - time.sleep(2) - self.e.sample() - filters = e.get_filters(flush_after=True) - obs_f = filters["obs_filter"] - rew_f = filters["rew_filter"] - self.assertNotEqual(obs_f.rs.n, 0) - self.assertNotEqual(obs_f.buffer.n, 0) - self.assertNotEqual(rew_f.rs.n, 0) - self.assertNotEqual(rew_f.buffer.n, 0) - return obs_f, rew_f - - def testGetFilters(self): - """Show `flush_after=False` provides does not affect the buffer.""" - e = self.e - self.sample_and_flush() - filters = e.get_filters(flush_after=False) - obs_f = filters["obs_filter"] - filters2 = e.get_filters(flush_after=False) - obs_f2 = filters2["obs_filter"] - self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n) - self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n) - - def testSyncFilter(self): - """Show that sync_filters rebases own buffer over input""" - e = self.e - obs_f, _ = self.sample_and_flush() - - # Current State - filters = 
e.get_filters(flush_after=False) - obs_f = filters["obs_filter"] - rew_f = filters["rew_filter"] - - self.assertLessEqual(obs_f.buffer.n, 20) - - new_obsf = obs_f.copy() - new_obsf.rs._n = 100 - e.sync_filters({"obs_filter": new_obsf, "rew_filter": rew_f}) - filters = e.get_filters(flush_after=False) - obs_f = filters["obs_filter"] - self.assertGreaterEqual(obs_f.rs.n, 100) - self.assertLessEqual(obs_f.buffer.n, 20) - - if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index 2e41c85a0233..bf3124002a9f 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -36,32 +36,6 @@ Box(0.0, 1.0, (5,), dtype=np.float32)]), } -# (alg, action_space, obs_space) -KNOWN_FAILURES = [ - # TODO(ekl) multiagent support for a3c - ("A3C", "implicit_tuple", "atari"), - ("A3C", "implicit_tuple", "atari_ram"), - ("A3C", "implicit_tuple", "discrete"), - ("A3C", "implicit_tuple", "image"), - ("A3C", "implicit_tuple", "mixed_tuple"), - ("A3C", "implicit_tuple", "simple_tuple"), - ("A3C", "implicit_tuple", "vector"), - ("A3C", "mixed_tuple", "atari"), - ("A3C", "mixed_tuple", "atari_ram"), - ("A3C", "mixed_tuple", "discrete"), - ("A3C", "mixed_tuple", "image"), - ("A3C", "mixed_tuple", "mixed_tuple"), - ("A3C", "mixed_tuple", "simple_tuple"), - ("A3C", "mixed_tuple", "vector"), - ("A3C", "simple_tuple", "atari"), - ("A3C", "simple_tuple", "atari_ram"), - ("A3C", "simple_tuple", "discrete"), - ("A3C", "simple_tuple", "image"), - ("A3C", "simple_tuple", "mixed_tuple"), - ("A3C", "simple_tuple", "simple_tuple"), - ("A3C", "simple_tuple", "vector"), -] - def make_stub_env(action_space, obs_space): class StubEnv(gym.Env): @@ -135,19 +109,13 @@ def testAll(self): {"num_workers": 1, "optimizer": {}}, stats) num_unexpected_errors = 0 - num_unexpected_success = 0 for (alg, a_name, o_name), stat in sorted(stats.items()): - if stat in ["ok", "unsupported"]: - if (alg, a_name, o_name) in KNOWN_FAILURES: - num_unexpected_success += 1 - else: - if (alg, a_name, o_name) not in KNOWN_FAILURES: - num_unexpected_errors += 1 + if stat not in ["ok", "unsupported"]: + num_unexpected_errors += 1 print( alg, "action_space", a_name, "obs_space", o_name, "result", stat) self.assertEqual(num_unexpected_errors, 0) - self.assertEqual(num_unexpected_success, 0) if __name__ == "__main__": diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml index 0a330bb5b57b..7c0f660fd8a6 100644 --- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml @@ -13,7 +13,6 @@ mountaincarcontinuous-ddpg: tau: 0.01 l2_reg: 0.00001 buffer_size: 50000 - random_starts: False clip_rewards: False learning_starts: 1000 #model: diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml index 2166989d8080..baccb42b8b87 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml @@ -6,6 +6,5 @@ pendulum-ddpg: episode_reward_mean: -160 config: use_huber: True - random_starts: False clip_rewards: False exploration_fraction: 0.1 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c-pytorch.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c-pytorch.yaml new file mode 100644 index 
000000000000..a25da3c7769a --- /dev/null +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c-pytorch.yaml @@ -0,0 +1,10 @@ +cartpole-a3c: + env: CartPole-v0 + run: A3C + stop: + episode_reward_mean: 200 + time_total_s: 600 + config: + num_workers: 1 + gamma: 0.95 + use_pytorch: true diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml index 6850a665e4f2..f20ea73c3b68 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-a3c.yaml @@ -5,5 +5,5 @@ cartpole-a3c: episode_reward_mean: 200 time_total_s: 600 config: - num_workers: 4 + num_workers: 1 gamma: 0.95 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml index 7efde08f8f2f..04aa2dc6edcc 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-dqn.yaml @@ -7,4 +7,3 @@ cartpole-dqn: config: n_step: 3 gamma: 0.95 - smoothing_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml new file mode 100644 index 000000000000..2bf9e7548b86 --- /dev/null +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-pg.yaml @@ -0,0 +1,8 @@ +cartpole-pg: + env: CartPole-v0 + run: PG + stop: + episode_reward_mean: 200 + time_total_s: 300 + config: + num_workers: 1 diff --git a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml index 840f6d963c4e..124f756ecc1c 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ddpg.yaml @@ -6,7 +6,5 @@ pendulum-ddpg: time_total_s: 900 config: use_huber: True - random_starts: False clip_rewards: False exploration_fraction: 0.1 - smoothing_num_episodes: 10 diff --git a/python/ray/rllib/utils/common_policy_evaluator.py b/python/ray/rllib/utils/common_policy_evaluator.py new file mode 100644 index 000000000000..d86c508cde4f --- /dev/null +++ b/python/ray/rllib/utils/common_policy_evaluator.py @@ -0,0 +1,278 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pickle +import numpy as np +import tensorflow as tf + +import ray +from ray.rllib.models import ModelCatalog +from ray.rllib.optimizers.policy_evaluator import PolicyEvaluator +from ray.rllib.utils.atari_wrappers import wrap_deepmind +from ray.rllib.utils.compression import pack +from ray.rllib.utils.filter import get_filter +from ray.rllib.utils.sampler import AsyncSampler, SyncSampler +from ray.rllib.utils.tf_policy_graph import TFPolicyGraph +from ray.tune.registry import get_registry +from ray.tune.result import TrainingResult + + +def collect_metrics(local_evaluator, remote_evaluators): + """Gathers episode metrics from CommonPolicyEvaluator instances.""" + + episode_rewards = [] + episode_lengths = [] + metric_lists = ray.get( + [a.apply.remote(lambda ev: ev.sampler.get_metrics()) + for a in remote_evaluators]) + metric_lists.append(local_evaluator.sampler.get_metrics()) + for metrics in metric_lists: + for episode in metrics: + episode_lengths.append(episode.episode_length) + episode_rewards.append(episode.episode_reward) + if 
episode_rewards: + min_reward = min(episode_rewards) + max_reward = max(episode_rewards) + else: + min_reward = float('nan') + max_reward = float('nan') + avg_reward = np.mean(episode_rewards) + avg_length = np.mean(episode_lengths) + timesteps = np.sum(episode_lengths) + + return TrainingResult( + episode_reward_max=max_reward, + episode_reward_min=min_reward, + episode_reward_mean=avg_reward, + episode_len_mean=avg_length, + episodes_total=len(episode_lengths), + timesteps_this_iter=timesteps) + + +class CommonPolicyEvaluator(PolicyEvaluator): + """Policy evaluator implementation that operates on a rllib.PolicyGraph. + + TODO: vector env + TODO: multi-agent + TODO: consumer buffering for multi-agent + TODO: complete episode batch mode + + Examples: + # Create a policy evaluator and using it to collect experiences. + >>> evaluator = CommonPolicyEvaluator( + env_creator=lambda _: gym.make("CartPole-v0"), + policy_graph=PGPolicyGraph) + >>> print(evaluator.sample().keys()) + {"obs": [[...]], "actions": [[...]], "rewards": [[...]], + "dones": [[...]], "new_obs": [[...]]} + + # Creating policy evaluators using optimizer_cls.make(). + >>> optimizer = LocalSyncOptimizer.make( + evaluator_cls=CommonPolicyEvaluator, + evaluator_args={ + "env_creator": lambda _: gym.make("CartPole-v0"), + "policy_graph": PGPolicyGraph, + }, + num_workers=10) + >>> for _ in range(10): optimizer.step() + """ + + @classmethod + def as_remote(cls, num_cpus=None, num_gpus=None): + return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls) + + def __init__( + self, + env_creator, + policy_graph, + tf_session_creator=None, + batch_steps=100, + batch_mode="truncate_episodes", + preprocessor_pref="rllib", + sample_async=False, + compress_observations=False, + observation_filter="NoFilter", + registry=None, + env_config=None, + model_config=None, + policy_config=None): + """Initialize a policy evaluator. + + Arguments: + env_creator (func): Function that returns a gym.Env given an + env config dict. + policy_graph (class): A class implementing rllib.PolicyGraph or + rllib.TFPolicyGraph. + tf_session_creator (func): A function that returns a TF session. + This is optional and only useful with TFPolicyGraph. + batch_steps (int): The target number of env transitions to include + in each sample batch returned from this evaluator. + batch_mode (str): One of the following choices: + complete_episodes: each batch will be at least batch_steps + in size, and will include one or more complete episodes. + truncate_episodes: each batch will be around batch_steps + in size, and include transitions from one episode only. + pack_episodes: each batch will be exactly batch_steps in + size, and may include transitions from multiple episodes. + preprocessor_pref (str): Whether to prefer RLlib preprocessors + ("rllib") or deepmind ("deepmind") when applicable. + sample_async (bool): Whether to compute samples asynchronously in + the background, which improves throughput but can cause samples + to be slightly off-policy. + compress_observations (bool): If true, compress the observations + returned. + observation_filter (str): Name of observation filter to use. + registry (tune.Registry): User-registered objects. Pass in the + value from tune.registry.get_registry() if you're having + trouble resolving things like custom envs. + env_config (dict): Config to pass to the env creator. + model_config (dict): Config to use when creating the policy model. + policy_config (dict): Config to pass to the policy. 
+ """ + + registry = registry or get_registry() + env_config = env_config or {} + policy_config = policy_config or {} + model_config = model_config or {} + + assert batch_mode in [ + "complete_episodes", "truncate_episodes", "pack_episodes"] + self.env_creator = env_creator + self.policy_graph = policy_graph + self.batch_steps = batch_steps + self.batch_mode = batch_mode + self.compress_observations = compress_observations + + self.env = env_creator(env_config) + is_atari = hasattr(self.env.unwrapped, "ale") + if is_atari and "custom_preprocessor" not in model_config and \ + preprocessor_pref == "deepmind": + self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80)) + else: + self.env = ModelCatalog.get_preprocessor_as_wrapper( + registry, self.env, model_config) + + self.vectorized = hasattr(self.env, "vector_reset") + self.policy_map = {} + + if issubclass(policy_graph, TFPolicyGraph): + with tf.Graph().as_default(): + if tf_session_creator: + self.sess = tf_session_creator() + else: + self.sess = tf.Session(config=tf.ConfigProto( + gpu_options=tf.GPUOptions(allow_growth=True))) + with self.sess.as_default(): + policy = policy_graph( + self.env.observation_space, self.env.action_space, + registry, policy_config) + else: + policy = policy_graph( + self.env.observation_space, self.env.action_space, + registry, policy_config) + self.policy_map = { + "default": policy + } + + self.obs_filter = get_filter( + observation_filter, self.env.observation_space.shape) + self.filters = {"obs_filter": self.obs_filter} + + if self.vectorized: + raise NotImplementedError("Vector envs not yet supported") + else: + if batch_mode not in [ + "pack_episodes", "truncate_episodes", "complete_episodes"]: + raise NotImplementedError("Batch mode not yet supported") + pack = batch_mode == "pack_episodes" + if batch_mode == "complete_episodes": + batch_steps = 999999 + if sample_async: + self.sampler = AsyncSampler( + self.env, self.policy_map["default"], self.obs_filter, + batch_steps, pack=pack) + self.sampler.start() + else: + self.sampler = SyncSampler( + self.env, self.policy_map["default"], self.obs_filter, + batch_steps, pack=pack) + + def sample(self): + """Evaluate the current policies and return a batch of experiences. + + Return: + SampleBatch from evaluating the current policies. + """ + + batch = self.policy_map["default"].postprocess_trajectory( + self.sampler.get_data()) + + if self.compress_observations: + batch["obs"] = [pack(o) for o in batch["obs"]] + batch["new_obs"] = [pack(o) for o in batch["new_obs"]] + + return batch + + def apply(self, func): + """Apply the given function to this evaluator instance.""" + + return func(self) + + def for_policy(self, func): + """Apply the given function to this evaluator's default policy.""" + + return func(self.policy_map["default"]) + + def sync_filters(self, new_filters): + """Changes self's filter to given and rebases any accumulated delta. + + Args: + new_filters (dict): Filters with new state to update local copy. + """ + assert all(k in new_filters for k in self.filters) + for k in self.filters: + self.filters[k].sync(new_filters[k]) + + def get_filters(self, flush_after=False): + """Returns a snapshot of filters. + + Args: + flush_after (bool): Clears the filter buffer state. 
+ + Returns: + return_filters (dict): Dict for serializable filters + """ + return_filters = {} + for k, f in self.filters.items(): + return_filters[k] = f.as_serializable() + if flush_after: + f.clear_buffer() + return return_filters + + def get_weights(self): + return self.policy_map["default"].get_weights() + + def set_weights(self, weights): + return self.policy_map["default"].set_weights(weights) + + def compute_gradients(self, samples): + return self.policy_map["default"].compute_gradients(samples) + + def apply_gradients(self, grads): + return self.policy_map["default"].apply_gradients(grads) + + def compute_apply(self, samples): + grad_fetch, apply_fetch = self.policy_map["default"].compute_apply( + samples) + return grad_fetch + + def save(self): + filters = self.get_filters(flush_after=True) + state = self.policy_map["default"].get_state() + return pickle.dumps({"filters": filters, "state": state}) + + def restore(self, objs): + objs = pickle.loads(objs) + self.sync_filters(objs["filters"]) + self.policy_map["default"].set_state(objs["state"]) diff --git a/python/ray/rllib/utils/policy_graph.py b/python/ray/rllib/utils/policy_graph.py new file mode 100644 index 000000000000..ec78e1e5b7f8 --- /dev/null +++ b/python/ray/rllib/utils/policy_graph.py @@ -0,0 +1,132 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class PolicyGraph(object): + """An agent policy and loss, i.e., a TFPolicyGraph or other subclass. + + This object defines how to act in the environment, and also losses used to + improve the policy based on its experiences. Note that both policy and + loss are defined together for convenience, though the policy itself is + logically separate. + + All policies can directly extend PolicyGraph, however TensorFlow users may + find TFPolicyGraph simpler to implement. TFPolicyGraph also enables RLlib + to apply TensorFlow-specific optimizations such as fusing multiple policy + graphs and multi-GPU support. + """ + + def __init__(self, registry, observation_space, action_space, config): + """Initialize the graph. + + Args: + registry (obj): Object registry for user-defined envs, models, etc. + observation_space (gym.Space): Observation space of the env. + action_space (gym.Space): Action space of the env. + config (dict): Policy-specific configuration data. + """ + pass + + def compute_actions(self, obs_batch, state_batches, is_training=False): + """Compute actions for the current policy. + + Arguments: + obs_batch (np.ndarray): batch of observations + state_batches (list): list of RNN state input batches, if any + is_training (bool): whether we are training the policy + + Returns: + actions (np.ndarray): batch of output actions, with shape like + [BATCH_SIZE, ACTION_SHAPE]. + state_outs (list): list of RNN state output batches, if any, with + shape like [STATE_SIZE, BATCH_SIZE]. + info (dict): dictionary of extra feature batches, if any, with + shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}. + """ + raise NotImplementedError + + def compute_single_action(self, obs, state, is_training=False): + """Unbatched version of compute_actions. 
+ + Arguments: + obs (obj): single observation + state_batches (list): list of RNN state inputs, if any + is_training (bool): whether we are training the policy + + Returns: + actions (obj): single action + state_outs (list): list of RNN state outputs, if any + info (dict): dictionary of extra features, if any + """ + + [action], state_out, info = self.compute_actions( + [obs], [[s] for s in state], is_training) + return action, [s[0] for s in state_out], \ + {k: v[0] for k, v in info.items()} + + def postprocess_trajectory(self, sample_batch, other_agent_batches=None): + """Implements algorithm-specific trajectory postprocessing. + + Arguments: + sample_batch (SampleBatch): batch of experiences for the policy + other_agent_batches (dict): In a multi-agent env, this contains the + experience batches seen by other agents. + + Returns: + SampleBatch: postprocessed sample batch. + """ + return sample_batch + + def compute_gradients(self, postprocessed_batch): + """Computes gradients against a batch of experiences. + + Returns: + grads (list): List of gradient output values + info (dict): Extra policy-specific values + """ + raise NotImplementedError + + def apply_gradients(self, gradients): + """Applies previously computed gradients. + + Returns: + info (dict): Extra policy-specific values + """ + raise NotImplementedError + + def get_weights(self): + """Returns model weights. + + Returns: + weights (obj): Serializable copy or view of model weights + """ + raise NotImplementedError + + def set_weights(self, weights): + """Sets model weights. + + Arguments: + weights (obj): Serializable copy or view of model weights + """ + raise NotImplementedError + + def get_initial_state(self): + """Returns initial RNN state for the current policy.""" + return [] + + def get_state(self): + """Saves all local state. + + Returns: + state (obj): Serialized local state. + """ + return self.get_weights() + + def set_state(self, state): + """Restores all local state. + + Arguments: + state (obj): Serialized local state. + """ + self.set_weights(state) diff --git a/python/ray/rllib/utils/process_rollout.py b/python/ray/rllib/utils/process_rollout.py index b2d52fddabb3..ed7088bb18d8 100644 --- a/python/ray/rllib/utils/process_rollout.py +++ b/python/ray/rllib/utils/process_rollout.py @@ -11,12 +11,12 @@ def discount(x, gamma): return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] -def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True): +def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True): """Given a rollout, compute its value targets and the advantage. Args: rollout (PartialRollout): Partial Rollout Object - reward_filter (Filter): Filter for processing advantanges + last_r (float): Value estimation for last observation gamma (float): Parameter for GAE lambda_ (float): Parameter for GAE use_gae (bool): Using Generalized Advantage Estamation @@ -32,21 +32,17 @@ def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True): if use_gae: assert "vf_preds" in rollout, "Values not found!" 
- vpred_t = np.stack(rollout["vf_preds"] + - [np.array(rollout.last_r)]).squeeze() + vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])]) delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1] # This formula for the advantage comes # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438 traj["advantages"] = discount(delta_t, gamma * lambda_) traj["value_targets"] = traj["advantages"] + traj["vf_preds"] else: - rewards_plus_v = np.stack(rollout["rewards"] + - [np.array(rollout.last_r)]).squeeze() + rewards_plus_v = np.concatenate( + [rollout["rewards"], np.array([last_r])]) traj["advantages"] = discount(rewards_plus_v, gamma)[:-1] - for i in range(traj["advantages"].shape[0]): - traj["advantages"][i] = reward_filter(traj["advantages"][i]) - traj["advantages"] = traj["advantages"].copy() assert all(val.shape[0] == trajsize for val in traj.values()), \ diff --git a/python/ray/rllib/utils/sampler.py b/python/ray/rllib/utils/sampler.py index 242464dbfe07..4b233adaf298 100644 --- a/python/ray/rllib/utils/sampler.py +++ b/python/ray/rllib/utils/sampler.py @@ -2,80 +2,12 @@ from __future__ import division from __future__ import print_function -import six.moves.queue as queue -import threading from collections import namedtuple import numpy as np +import six.moves.queue as queue +import threading - -class PartialRollout(object): - """A piece of a complete rollout. - - We run our agent, and process its experience once it has processed enough - steps. - - Attributes: - data (dict): Stores rollout data. All numpy arrays other than - `observations` and `features` will be squeezed. - last_r (float): Value of next state. Used for bootstrapping. - """ - - fields = ["obs", "actions", "rewards", "new_obs", "dones", "features"] - - def __init__(self, extra_fields=None): - """Initializers internals. Maintains a `last_r` field - in support of partial rollouts, used in bootstrapping advantage - estimation. - - Args: - extra_fields: Optional field for object to keep track. - """ - if extra_fields: - self.fields.extend(extra_fields) - self.data = {k: [] for k in self.fields} - self.last_r = 0.0 - - def add(self, **kwargs): - for k, v in kwargs.items(): - self.data[k] += [v] - - def extend(self, other_rollout): - """Extends internal data structure. Assumes other_rollout contains - data that occured afterwards.""" - - assert not self.is_terminal() - assert all(k in other_rollout.fields for k in self.fields) - for k, v in other_rollout.data.items(): - self.data[k].extend(v) - self.last_r = other_rollout.last_r - - def is_terminal(self): - """Check if terminal. 
- - Returns: - terminal (bool): if rollout has terminated.""" - return self.data["dones"][-1] - - def __getitem__(self, key): - return self.data[key] - - def __setitem__(self, key, item): - self.data[key] = item - - def keys(self): - return self.data.keys() - - def items(self): - return self.data.items() - - def __iter__(self): - return self.data.__iter__() - - def __next__(self): - return self.data.__next__() - - def __contains__(self, x): - return x in self.data +from ray.rllib.optimizers.sample_batch import SampleBatchBuilder CompletedRollout = namedtuple("CompletedRollout", @@ -92,7 +24,9 @@ class SyncSampler(object): thread.""" _async = False - def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): + def __init__( + self, env, policy, obs_filter, num_local_steps, horizon=None, + pack=False): self.num_local_steps = num_local_steps self.horizon = horizon self.env = env @@ -100,7 +34,7 @@ def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): self._obs_filter = obs_filter self.rollout_provider = _env_runner(self.env, self.policy, self.num_local_steps, self.horizon, - self._obs_filter) + self._obs_filter, pack) self.metrics_queue = queue.Queue() def get_data(self): @@ -128,7 +62,9 @@ class AsyncSampler(threading.Thread): accumulate and the gradient can be calculated on up to 5 batches.""" _async = True - def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): + def __init__( + self, env, policy, obs_filter, num_local_steps, horizon=None, + pack=False): assert getattr( obs_filter, "is_concurrent", False), ("Observation Filter must support concurrent updates.") @@ -142,6 +78,7 @@ def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None): self._obs_filter = obs_filter self.started = False self.daemon = True + self.pack = pack def run(self): self.started = True @@ -154,7 +91,7 @@ def run(self): def _run(self): rollout_provider = _env_runner(self.env, self.policy, self.num_local_steps, self.horizon, - self._obs_filter) + self._obs_filter, self.pack) while True: # The timeout variable exists because apparently, if one worker # dies, the other workers won't die with it, unless the timeout is @@ -169,18 +106,18 @@ def get_data(self): """Gets currently accumulated data. Returns: - rollout (PartialRollout): trajectory data (unprocessed) + rollout (SampleBatch): trajectory data (unprocessed) """ assert self.started, "Sampler never started running!" rollout = self.queue.get(timeout=600.0) if isinstance(rollout, BaseException): raise rollout - while not rollout.is_terminal(): + while not rollout["dones"][-1]: try: part = self.queue.get_nowait() if isinstance(part, BaseException): raise rollout - rollout.extend(part) + rollout = rollout.concat(part) except queue.Empty: break return rollout @@ -195,7 +132,7 @@ def get_metrics(self): return completed -def _env_runner(env, policy, num_local_steps, horizon, obs_filter): +def _env_runner(env, policy, num_local_steps, horizon, obs_filter, pack): """This implements the logic of the thread runner. It continually runs the policy, and as long as the rollout exceeds a @@ -206,12 +143,16 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): Args: env: Environment generated by env_creator policy: Policy used to interact with environment. Also sets fields - to be included in `PartialRollout` - num_local_steps: Number of steps before `PartialRollout` is yielded. + to be included in `SampleBatch` + num_local_steps: Number of steps before `SampleBatch` is yielded. 
Set + to infinity to yield complete episodes. + horizon: Horizon of the episode. obs_filter: Filter used to process observations. + pack: Whether to pack multiple episodes into each batch. This + guarantees batches will be exactly `num_local_steps` in size. Yields: - rollout (PartialRollout): Object containing state, action, reward, + rollout (SampleBatch): Object containing state, action, reward, terminal condition, and other fields as dictated by `policy`. """ last_observation = obs_filter(env.reset()) @@ -221,24 +162,23 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): print("Warning, no horizon specified, assuming infinite") if not horizon: horizon = 999999 - if hasattr(policy, "get_initial_features"): - last_features = policy.get_initial_features() - else: - last_features = [] + last_features = policy.get_initial_state() features = last_features length = 0 rewards = 0 rollout_number = 0 while True: - terminal_end = False - rollout = PartialRollout(extra_fields=policy.other_output) + batch_builder = SampleBatchBuilder() for _ in range(num_local_steps): - action, pi_info = policy.compute(last_observation, *last_features) - if policy.is_recurrent: - features = pi_info["features"] - del pi_info["features"] + # Assume batch size one for now + action, features, pi_info = policy.compute_single_action( + last_observation, last_features, is_training=True) + for i, state_value in enumerate(last_features): + pi_info["state_in_{}".format(i)] = state_value + for i, state_value in enumerate(features): + pi_info["state_out_{}".format(i)] = state_value observation, reward, terminal, info = env.step(action) observation = obs_filter(observation) @@ -252,12 +192,11 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): action = np.concatenate(action, axis=0).flatten() # Collect the experience. - rollout.add( + batch_builder.add_values( obs=last_observation, actions=action, rewards=reward, dones=terminal, - features=last_features, new_obs=observation, **pi_info) @@ -265,24 +204,18 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): last_features = features if terminal: - terminal_end = True yield CompletedRollout(length, rewards) - if (length >= horizon - or not env.metadata.get("semantics.autoreset")): + if (length >= horizon or + not env.metadata.get("semantics.autoreset")): last_observation = obs_filter(env.reset()) - if hasattr(policy, "get_initial_features"): - last_features = policy.get_initial_features() - else: - last_features = [] + last_features = policy.get_initial_state() rollout_number += 1 length = 0 rewards = 0 - break - - if not terminal_end: - rollout.last_r = policy.value(last_observation, *last_features) + if not pack: + break # Once we have enough experience, yield it, and have the ThreadRunner # place it on a queue. - yield rollout + yield batch_builder.build() diff --git a/python/ray/rllib/utils/tf_policy_graph.py b/python/ray/rllib/utils/tf_policy_graph.py new file mode 100644 index 000000000000..6588060bfe29 --- /dev/null +++ b/python/ray/rllib/utils/tf_policy_graph.py @@ -0,0 +1,152 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +import ray +from ray.rllib.utils.policy_graph import PolicyGraph + + +class TFPolicyGraph(PolicyGraph): + """An agent policy and loss implemented in TensorFlow. 
+ + Extending this class enables RLlib to perform TensorFlow specific + optimizations on the policy graph, e.g., parallelization across gpus or + fusing multiple graphs together in the multi-agent setting. + + All input and output tensors are of shape [BATCH_DIM, ...]. + + Examples: + >>> policy = TFPolicyGraphSubclass( + sess, obs_input, action_sampler, loss, loss_inputs, is_training) + + >>> print(policy.compute_actions([1, 0, 2])) + (array([0, 1, 1]), [], {}) + + >>> print(policy.postprocess_trajectory(SampleBatch({...}))) + SampleBatch({"action": ..., "advantages": ..., ...}) + """ + + def __init__( + self, sess, obs_input, action_sampler, loss, loss_inputs, + is_training, state_inputs=None, state_outputs=None): + """Initialize the policy. + + Arguments: + obs_input (Tensor): input placeholder for observations. + action_sampler (Tensor): Tensor for sampling an action. + loss (Tensor): scalar policy loss output tensor. + loss_inputs (list): a (name, placeholder) tuple for each loss + input argument. Each placeholder name must correspond to a + SampleBatch column key returned by postprocess_trajectory(). + is_training (Tensor): input placeholder for whether we are + currently training the policy. + state_inputs (list): list of RNN state output Tensors. + state_outputs (list): list of initial state values. + """ + + self._sess = sess + self._obs_input = obs_input + self._sampler = action_sampler + self._loss = loss + self._loss_inputs = loss_inputs + self._is_training = is_training + self._state_inputs = state_inputs or [] + self._state_outputs = state_outputs or [] + self._optimizer = self.optimizer() + self._grads_and_vars = self.gradients(self._optimizer) + self._grads = [g for (g, v) in self._grads_and_vars] + self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars) + self._variables = ray.experimental.TensorFlowVariables( + self._loss, self._sess) + + assert len(self._state_inputs) == len(self._state_outputs) == \ + len(self.get_initial_state()) + + def compute_actions( + self, obs_batch, state_batches=None, is_training=False): + state_batches = state_batches or [] + assert len(self._state_inputs) == len(state_batches), \ + (self._state_inputs, state_batches) + feed_dict = self.extra_compute_action_feed_dict() + feed_dict[self._obs_input] = obs_batch + feed_dict[self._is_training] = is_training + for ph, value in zip(self._state_inputs, state_batches): + feed_dict[ph] = value + fetches = self._sess.run( + ([self._sampler] + self._state_outputs + + [self.extra_compute_action_fetches()]), feed_dict=feed_dict) + return fetches[0], fetches[1:-1], fetches[-1] + + def _get_loss_inputs_dict(self, postprocessed_batch): + feed_dict = {} + for key, ph in self._loss_inputs: + # TODO(ekl) fix up handling of RNN inputs so that we can batch + # across multiple rollouts + if key.startswith("state_in_"): + feed_dict[ph] = postprocessed_batch[key][:1] # in state only + else: + feed_dict[ph] = postprocessed_batch[key] + return feed_dict + + def compute_gradients(self, postprocessed_batch): + feed_dict = self.extra_compute_grad_feed_dict() + feed_dict[self._is_training] = True + feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch)) + fetches = self._sess.run( + [self._grads, self.extra_compute_grad_fetches()], + feed_dict=feed_dict) + return fetches[0], fetches[1] + + def apply_gradients(self, gradients): + assert len(gradients) == len(self._grads), (gradients, self._grads) + feed_dict = self.extra_apply_grad_feed_dict() + feed_dict[self._is_training] = True + for ph, value in 
zip(self._grads, gradients): + feed_dict[ph] = value + fetches = self.sess.run( + [self._apply_op, self.extra_apply_grad_fetches()], + feed_dict=feed_dict) + return fetches[1] + + def compute_apply(self, postprocessed_batch): + feed_dict = self.extra_compute_grad_feed_dict() + feed_dict.update(self.extra_apply_grad_feed_dict()) + feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch)) + feed_dict[self._is_training] = True + fetches = self._sess.run( + [self._apply_op, self.extra_compute_grad_fetches(), + self.extra_apply_grad_fetches()], + feed_dict=feed_dict) + return fetches[1], fetches[2] + + def get_weights(self): + return self._variables.get_flat() + + def set_weights(self, weights): + return self._variables.set_flat(weights) + + def extra_compute_action_feed_dict(self): + return {} + + def extra_compute_action_fetches(self): + return {} # e.g, value function + + def extra_compute_grad_feed_dict(self): + return {} # e.g, kl_coeff + + def extra_compute_grad_fetches(self): + return {} # e.g, td error + + def extra_apply_grad_feed_dict(self): + return {} + + def extra_apply_grad_fetches(self): + return {} # e.g., batch norm updates + + def optimizer(self): + return tf.train.AdamOptimizer() + + def gradients(self, optimizer): + return optimizer.compute_gradients(self._loss) diff --git a/python/ray/tune/result.py b/python/ray/tune/result.py index 74ea2bcb9838..261ca6e90ff9 100644 --- a/python/ray/tune/result.py +++ b/python/ray/tune/result.py @@ -31,6 +31,12 @@ # (Optional) The mean episode reward if applicable. "episode_reward_mean", + # (Optional) The min episode reward if applicable. + "episode_reward_min", + + # (Optional) The max episode reward if applicable. + "episode_reward_max", + # (Optional) The mean episode length if applicable. "episode_len_mean", diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 8bd010c3c810..e18b5a6ef47c 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -208,6 +208,9 @@ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_checkpoint_restore.py +docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ + python /ray/python/ray/rllib/test/test_common_policy_evaluator.py + docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_supported_spaces.py
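
For reference, the docstrings in this patch (the Examples section of CommonPolicyEvaluator and the new dict-style evaluator_args path of PolicyOptimizer.make) suggest the following end-to-end usage of the new evaluator API. This is a minimal sketch, assuming the modules are importable at the paths introduced above; the CartPole environment, the worker count, and the mostly-empty config dicts are illustrative assumptions, not part of the change itself.

# Minimal end-to-end sketch of the evaluator API added in this patch.
# The CartPole env, worker count, and the near-empty config dicts are
# illustrative assumptions, not something this diff prescribes.
import gym
import ray

from ray.rllib.optimizers import LocalSyncOptimizer
from ray.rllib.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.utils.common_policy_evaluator import (
    CommonPolicyEvaluator, collect_metrics)

ray.init()

# A standalone evaluator: one env/policy pair that returns SampleBatches.
# PGPolicyGraph reads "model" and "gamma" from its policy_config.
ev = CommonPolicyEvaluator(
    env_creator=lambda _: gym.make("CartPole-v0"),
    policy_graph=PGPolicyGraph,
    batch_steps=100,
    batch_mode="truncate_episodes",
    policy_config={"model": {}, "gamma": 0.99})
print(ev.sample().count)

# The same evaluator class driven through an optimizer, mirroring pg.py:
# make() builds one local and `num_workers` remote evaluators from the
# keyword args, and collect_metrics() aggregates their episode stats.
optimizer = LocalSyncOptimizer.make(
    evaluator_cls=CommonPolicyEvaluator,
    evaluator_args={
        "env_creator": lambda _: gym.make("CartPole-v0"),
        "policy_graph": PGPolicyGraph,
        "batch_steps": 100,
        "batch_mode": "truncate_episodes",
        "policy_config": {"model": {}, "gamma": 0.99},
    },
    num_workers=2,
    optimizer_config={})

for _ in range(5):
    optimizer.step()

print(collect_metrics(optimizer.local_evaluator, optimizer.remote_evaluators))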