Merged

Commits (43):
d0fec3e  dynamic graph (ericl, May 15, 2019)
b742efe  wip (ericl, May 15, 2019)
d8a722a  clean up (ericl, May 16, 2019)
169493c  fix (ericl, May 16, 2019)
fee8ec5  document trainer (ericl, May 16, 2019)
03b6021  wip (ericl, May 16, 2019)
48bdcf4  initialize the graph using a fake batch (ericl, May 16, 2019)
18b290b  clean up dynamic init (ericl, May 16, 2019)
71c91cd  wip (ericl, May 16, 2019)
2636703  wip (ericl, May 16, 2019)
d239a79  spelling (ericl, May 16, 2019)
3ff0d08  use builder for ppo pol graph (ericl, May 16, 2019)
e218d2c  add ppo graph (ericl, May 16, 2019)
9d9fd97  fix naming (ericl, May 16, 2019)
3093391  order (ericl, May 16, 2019)
e670abd  docs (ericl, May 16, 2019)
0aba2f3  set class name correctly (ericl, May 16, 2019)
298fcd0  add torch builder (ericl, May 16, 2019)
5269fe0  add custom model support in builder (ericl, May 16, 2019)
ac108dd  cleanup (ericl, May 16, 2019)
707acf8  remove underscores (ericl, May 16, 2019)
a4a9260  fix py2 compat (ericl, May 17, 2019)
a2281da  Update dynamic_tf_policy_graph.py (ericl, May 17, 2019)
817a1f9  Update tracking_dict.py (ericl, May 17, 2019)
a7229ce  wip (ericl, May 17, 2019)
a4c7779  Merge branch 'dynamic-graphs' of github.com:ericl/ray into dynamic-gr… (ericl, May 17, 2019)
4b9eb6d  rename (ericl, May 17, 2019)
182a3a5  Merge remote-tracking branch 'upstream/master' into dynamic-graphs (ericl, May 17, 2019)
6a1011e  debug level (ericl, May 17, 2019)
b1cecbe  rename policy_graph -> policy in new classes (ericl, May 17, 2019)
c857285  fix test (ericl, May 17, 2019)
64b267e  rename ppo tf policy (ericl, May 17, 2019)
52f06e9  port appo too (ericl, May 17, 2019)
3f64d4f  forgot grads (ericl, May 17, 2019)
0c6a22c  default policy optimizer (ericl, May 18, 2019)
c7e0320  make default config optional (ericl, May 18, 2019)
46523a0  add config to optimizer (ericl, May 18, 2019)
8a48029  use lr by default in optimizer (ericl, May 18, 2019)
65db45e  update (ericl, May 18, 2019)
4830ab6  comments (ericl, May 18, 2019)
816d590  remove optimizer (ericl, May 18, 2019)
65173a5  fix tuple actions support in dynamic tf graph (ericl, May 18, 2019)
db2a859  Merge remote-tracking branch 'upstream/master' into dynamic-graphs (ericl, May 18, 2019)

4 changes: 2 additions & 2 deletions python/ray/rllib/agents/a3c/a3c.py
@@ -49,8 +49,8 @@ class A3CTrainer(Trainer):
     def _init(self, config, env_creator):
         if config["use_pytorch"]:
             from ray.rllib.agents.a3c.a3c_torch_policy_graph import \
-                A3CTorchPolicyGraph
-            policy_cls = A3CTorchPolicyGraph
+                A3CTorchPolicy
+            policy_cls = A3CTorchPolicy
         else:
             policy_cls = self._policy_graph

173 changes: 74 additions & 99 deletions python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py
@@ -7,109 +7,84 @@
from torch import nn

import ray
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph
from ray.rllib.utils.annotations import override


class A3CLoss(nn.Module):
def __init__(self, dist_class, vf_loss_coeff=0.5, entropy_coeff=0.01):
nn.Module.__init__(self)
self.dist_class = dist_class
self.vf_loss_coeff = vf_loss_coeff
self.entropy_coeff = entropy_coeff

def forward(self, policy_model, observations, actions, advantages,
value_targets):
logits, _, values, _ = policy_model({
SampleBatch.CUR_OBS: observations
}, [])
dist = self.dist_class(logits)
log_probs = dist.logp(actions)
self.entropy = dist.entropy().mean()
self.pi_err = -advantages.dot(log_probs.reshape(-1))
self.value_err = F.mse_loss(values.reshape(-1), value_targets)
overall_err = sum([
self.pi_err,
self.vf_loss_coeff * self.value_err,
-self.entropy_coeff * self.entropy,
])

return overall_err


class A3CPostprocessing(object):
"""Adds the VF preds and advantages fields to the trajectory."""

@override(TorchPolicyGraph)
def extra_action_out(self, model_out):
return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()}

@override(PolicyGraph)
def postprocess_trajectory(self,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
last_r = 0.0
else:
last_r = self._value(sample_batch[SampleBatch.NEXT_OBS][-1])
return compute_advantages(sample_batch, last_r, self.config["gamma"],
self.config["lambda"])


class A3CTorchPolicyGraph(A3CPostprocessing, TorchPolicyGraph):
"""A simple, non-recurrent PyTorch policy example."""

def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
self.config = config
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"], torch=True)
model = ModelCatalog.get_torch_model(obs_space, self.logit_dim,
self.config["model"])
loss = A3CLoss(dist_class, self.config["vf_loss_coeff"],
self.config["entropy_coeff"])
TorchPolicyGraph.__init__(
self,
obs_space,
action_space,
model,
loss,
loss_inputs=[
SampleBatch.CUR_OBS, SampleBatch.ACTIONS,
Postprocessing.ADVANTAGES, Postprocessing.VALUE_TARGETS
],
action_distribution_cls=dist_class)

@override(TorchPolicyGraph)
def optimizer(self):
return torch.optim.Adam(self._model.parameters(), lr=self.config["lr"])

@override(TorchPolicyGraph)
def extra_grad_process(self):
info = {}
if self.config["grad_clip"]:
total_norm = nn.utils.clip_grad_norm_(self._model.parameters(),
self.config["grad_clip"])
info["grad_gnorm"] = total_norm
return info

@override(TorchPolicyGraph)
def extra_grad_info(self):
return {
"policy_entropy": self._loss.entropy.item(),
"policy_loss": self._loss.pi_err.item(),
"vf_loss": self._loss.value_err.item()
}

from ray.rllib.evaluation.torch_policy_template import build_torch_policy


def actor_critic_loss(policy, batch_tensors):
logits, _, values, _ = policy.model({
SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
}, [])
dist = policy.dist_class(logits)
log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])
policy.entropy = dist.entropy().mean()
policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot(
log_probs.reshape(-1))
policy.value_err = F.mse_loss(
values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS])
overall_err = sum([
policy.pi_err,
policy.config["vf_loss_coeff"] * policy.value_err,
-policy.config["entropy_coeff"] * policy.entropy,
])
return overall_err


def loss_and_entropy_stats(policy, batch_tensors):
return {
"policy_entropy": policy.entropy.item(),
"policy_loss": policy.pi_err.item(),
"vf_loss": policy.value_err.item(),
}


def add_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
last_r = 0.0
else:
last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1])
return compute_advantages(sample_batch, last_r, policy.config["gamma"],
policy.config["lambda"])


def model_value_predictions(policy, model_out):
return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()}


def apply_grad_clipping(policy):
info = {}
if policy.config["grad_clip"]:
total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(),
policy.config["grad_clip"])
info["grad_gnorm"] = total_norm
return info


def torch_optimizer(policy, config):
return torch.optim.Adam(policy.model.parameters(), lr=config["lr"])


class ValueNetworkMixin(object):
def _value(self, obs):
with self.lock:
obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device)
_, _, vf, _ = self._model({"obs": obs}, [])
_, _, vf, _ = self.model({"obs": obs}, [])
return vf.detach().cpu().numpy().squeeze()


A3CTorchPolicy = build_torch_policy(
name="A3CTorchPolicy",
get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
loss_fn=actor_critic_loss,
stats_fn=loss_and_entropy_stats,
postprocess_fn=add_advantages,
extra_action_out_fn=model_value_predictions,
extra_grad_process_fn=apply_grad_clipping,
optimizer_fn=torch_optimizer,
mixins=[ValueNetworkMixin])
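
The build_torch_policy call above assembles the module-level functions into a policy class. A minimal sketch of the same pattern, assuming only hooks that appear in this diff (name, get_default_config, loss_fn) and assuming the rewards column is available to the loss as a tensor; MyTorchPolicy and reinforce_loss are illustrative names, not part of the PR:

# Minimal sketch of the build_torch_policy pattern shown above.
# MyTorchPolicy and reinforce_loss are illustrative, not part of this PR.
from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.torch_policy_template import build_torch_policy


def reinforce_loss(policy, batch_tensors):
    # Re-run the model on the batched observations, as actor_critic_loss does,
    # and apply a plain REINFORCE objective weighted by the raw rewards
    # (assumes SampleBatch.REWARDS is present in batch_tensors).
    logits, _, _, _ = policy.model({
        SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
    }, [])
    dist = policy.dist_class(logits)
    log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])
    return -(log_probs.reshape(-1) * batch_tensors[SampleBatch.REWARDS]).mean()


MyTorchPolicy = build_torch_policy(
    name="MyTorchPolicy",
    get_default_config=lambda: DEFAULT_CONFIG,
    loss_fn=reinforce_loss)
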
54 changes: 14 additions & 40 deletions python/ray/rllib/agents/pg/pg.py
@@ -2,11 +2,9 @@
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph

from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils.annotations import override
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy

# yapf: disable
# __sphinx_doc_begin__
@@ -22,40 +20,16 @@
# yapf: enable


class PGTrainer(Trainer):
"""Simple policy gradient agent.

This is an example agent to show how to implement algorithms in RLlib.
In most cases, you will probably want to use the PPO agent instead.
"""

_name = "PG"
_default_config = DEFAULT_CONFIG
_policy_graph = PGPolicyGraph
def get_policy_class(config):
if config["use_pytorch"]:
from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy
return PGTorchPolicy
else:
return PGTFPolicy

@override(Trainer)
def _init(self, config, env_creator):
if config["use_pytorch"]:
from ray.rllib.agents.pg.torch_pg_policy_graph import \
PGTorchPolicyGraph
policy_cls = PGTorchPolicyGraph
else:
policy_cls = self._policy_graph
self.local_evaluator = self.make_local_evaluator(
env_creator, policy_cls)
self.remote_evaluators = self.make_remote_evaluators(
env_creator, policy_cls, config["num_workers"])
optimizer_config = dict(
config["optimizer"],
**{"train_batch_size": config["train_batch_size"]})
self.optimizer = SyncSamplesOptimizer(
self.local_evaluator, self.remote_evaluators, **optimizer_config)

@override(Trainer)
def _train(self):
prev_steps = self.optimizer.num_steps_sampled
self.optimizer.step()
result = self.collect_metrics()
result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
prev_steps)
return result
PGTrainer = build_trainer(
name="PG",
default_config=DEFAULT_CONFIG,
default_policy=PGTFPolicy,
get_policy_class=get_policy_class)
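
The class returned by build_trainer is meant to be driven like any hand-written Trainer of this era. A hedged usage sketch, assuming the pg package continues to re-export PGTrainer; "CartPole-v0" and the config values are illustrative, not taken from this PR:

# Hedged usage sketch: run the generated PGTrainer like any other Trainer.
import ray
from ray.rllib.agents.pg import PGTrainer

ray.init()
trainer = PGTrainer(env="CartPole-v0", config={"num_workers": 1})
for i in range(3):
    result = trainer.train()
    print(i, result["episode_reward_mean"])
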
105 changes: 18 additions & 87 deletions python/ray/rllib/agents/pg/pg_policy_graph.py
@@ -3,102 +3,33 @@
from __future__ import print_function

import ray
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.tf_policy_template import build_tf_policy
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


class PGLoss(object):
"""The basic policy gradient loss."""
# The basic policy gradients loss
def policy_gradient_loss(policy, batch_tensors):
actions = batch_tensors[SampleBatch.ACTIONS]
advantages = batch_tensors[Postprocessing.ADVANTAGES]
return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)

def __init__(self, action_dist, actions, advantages):
self.loss = -tf.reduce_mean(action_dist.logp(actions) * advantages)

# This adds the "advantages" column to the sample batch.
def postprocess_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
return compute_advantages(
sample_batch, 0.0, policy.config["gamma"], use_gae=False)

class PGPostprocessing(object):
"""Adds the advantages field to the trajectory."""

@override(PolicyGraph)
def postprocess_trajectory(self,
sample_batch,
other_agent_batches=None,
episode=None):
# This adds the "advantages" column to the sample batch
return compute_advantages(
sample_batch, 0.0, self.config["gamma"], use_gae=False)


class PGPolicyGraph(PGPostprocessing, TFPolicyGraph):
"""Simple policy gradient example of defining a policy graph."""

def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
self.config = config

# Setup placeholders
obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
prev_actions = ModelCatalog.get_action_placeholder(action_space)
prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")

# Create the model network and action outputs
self.model = ModelCatalog.get_model({
"obs": obs,
"prev_actions": prev_actions,
"prev_rewards": prev_rewards,
"is_training": self._get_is_training_placeholder(),
}, obs_space, action_space, self.logit_dim, self.config["model"])
action_dist = dist_class(self.model.outputs) # logit for each action

# Setup policy loss
actions = ModelCatalog.get_action_placeholder(action_space)
advantages = tf.placeholder(tf.float32, [None], name="adv")
loss = PGLoss(action_dist, actions, advantages).loss

# Mapping from sample batch keys to placeholders. These keys will be
# read from postprocessed sample batches and fed into the specified
# placeholders during loss computation.
loss_in = [
(SampleBatch.CUR_OBS, obs),
(SampleBatch.ACTIONS, actions),
(SampleBatch.PREV_ACTIONS, prev_actions),
(SampleBatch.PREV_REWARDS, prev_rewards),
(Postprocessing.ADVANTAGES, advantages),
]

# Initialize TFPolicyGraph
sess = tf.get_default_session()
TFPolicyGraph.__init__(
self,
obs_space,
action_space,
sess,
obs_input=obs,
action_sampler=action_dist.sample(),
action_prob=action_dist.sampled_action_prob(),
loss=loss,
loss_inputs=loss_in,
model=self.model,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
prev_action_input=prev_actions,
prev_reward_input=prev_rewards,
seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])
sess.run(tf.global_variables_initializer())

@override(PolicyGraph)
def get_initial_state(self):
return self.model.state_init

@override(TFPolicyGraph)
def optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
PGTFPolicy = build_tf_policy(
name="PGTFPolicy",
get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
postprocess_fn=postprocess_advantages,
loss_fn=policy_gradient_loss)
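
Because the loss and postprocessing hooks are now plain module-level functions, they can be reused to assemble policy variants. A sketch under that assumption, reusing postprocess_advantages from this file and assuming the action distribution exposes entropy() as RLlib's built-in distributions do; EntropyPGPolicy and the 0.01 coefficient are illustrative, not part of the PR:

# Sketch: a PG policy variant with an entropy bonus, reusing the hooks above.
from ray.rllib.agents.pg.pg import DEFAULT_CONFIG
from ray.rllib.agents.pg.pg_policy_graph import postprocess_advantages
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def entropy_regularized_pg_loss(policy, batch_tensors):
    actions = batch_tensors[SampleBatch.ACTIONS]
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    pg_loss = -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)
    # Encourage exploration with a small entropy bonus (illustrative value).
    return pg_loss - 0.01 * tf.reduce_mean(policy.action_dist.entropy())


EntropyPGPolicy = build_tf_policy(
    name="EntropyPGPolicy",
    get_default_config=lambda: DEFAULT_CONFIG,
    postprocess_fn=postprocess_advantages,
    loss_fn=entropy_regularized_pg_loss)
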