From d0fec3ecb9cf7752d03a59e3d08bd998b7636e28 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 16:39:08 -0700 Subject: [PATCH 01/39] dynamic graph --- python/ray/rllib/agents/pg/pg.py | 13 +- python/ray/rllib/agents/pg/pg_policy_graph.py | 101 ++------------ .../evaluation/dynamic_tf_policy_graph.py | 128 ++++++++++++++++++ .../ray/rllib/evaluation/tf_policy_graph.py | 51 ++++--- 4 files changed, 183 insertions(+), 110 deletions(-) create mode 100644 python/ray/rllib/evaluation/dynamic_tf_policy_graph.py diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index e70fdcc8b2c6..8ca36647dcb6 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -3,8 +3,9 @@ from __future__ import print_function from ray.rllib.agents.trainer import Trainer, with_common_config -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph - +from ray.rllib.agents.pg.pg_policy_graph import postprocess_advantages, \ + policy_gradient_loss, make_optimizer +from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils.annotations import override @@ -22,6 +23,14 @@ # yapf: enable +PGPolicyGraph = build_tf_graph( + name="PG", + default_config=DEFAULT_CONFIG, + postprocess_fn=postprocess_advantages, + loss_fn=policy_gradient_loss, + make_optimizer=make_optimizer) + + class PGTrainer(Trainer): """Simple policy gradient agent. diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index a55af79b1e61..f3bef7c6c296 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -3,102 +3,29 @@ from __future__ import print_function import ray -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph -from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf tf = try_import_tf() -class PGLoss(object): - """The basic policy gradient loss.""" +# The basic policy gradients loss +def policy_gradient_loss(postprocessed_batch, action_dist): + actions = postprocessed_batch[SampleBatch.ACTIONS] + advantages = postprocessed_batch[Postprocessing.ADVANTAGES] + return -tf.reduce_mean(action_dist.logp(actions) * advantages) - def __init__(self, action_dist, actions, advantages): - self.loss = -tf.reduce_mean(action_dist.logp(actions) * advantages) +# This adds the "advantages" column to the sample batch. 
+def postprocess_advantages(graph, + sample_batch, + other_agent_batches=None, + episode=None): + return compute_advantages( + sample_batch, 0.0, graph.config["gamma"], use_gae=False) -class PGPostprocessing(object): - """Adds the advantages field to the trajectory.""" - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - # This adds the "advantages" column to the sample batch - return compute_advantages( - sample_batch, 0.0, self.config["gamma"], use_gae=False) - - -class PGPolicyGraph(PGPostprocessing, TFPolicyGraph): - """Simple policy gradient example of defining a policy graph.""" - - def __init__(self, obs_space, action_space, config): - config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config) - self.config = config - - # Setup placeholders - obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) - dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") - - # Create the model network and action outputs - self.model = ModelCatalog.get_model({ - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, obs_space, action_space, self.logit_dim, self.config["model"]) - action_dist = dist_class(self.model.outputs) # logit for each action - - # Setup policy loss - actions = ModelCatalog.get_action_placeholder(action_space) - advantages = tf.placeholder(tf.float32, [None], name="adv") - loss = PGLoss(action_dist, actions, advantages).loss - - # Mapping from sample batch keys to placeholders. These keys will be - # read from postprocessed sample batches and fed into the specified - # placeholders during loss computation. 
- loss_in = [ - (SampleBatch.CUR_OBS, obs), - (SampleBatch.ACTIONS, actions), - (SampleBatch.PREV_ACTIONS, prev_actions), - (SampleBatch.PREV_REWARDS, prev_rewards), - (Postprocessing.ADVANTAGES, advantages), - ] - - # Initialize TFPolicyGraph - sess = tf.get_default_session() - TFPolicyGraph.__init__( - self, - obs_space, - action_space, - sess, - obs_input=obs, - action_sampler=action_dist.sample(), - action_prob=action_dist.sampled_action_prob(), - loss=loss, - loss_inputs=loss_in, - model=self.model, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, - seq_lens=self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"]) - sess.run(tf.global_variables_initializer()) - - @override(PolicyGraph) - def get_initial_state(self): - return self.model.state_init - - @override(TFPolicyGraph) - def optimizer(self): - return tf.train.AdamOptimizer(learning_rate=self.config["lr"]) +def make_optimizer(graph): + return tf.train.AdamOptimizer(learning_rate=graph.config["lr"]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py new file mode 100644 index 000000000000..2185ca8785eb --- /dev/null +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -0,0 +1,128 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.annotations import override +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + + +def build_tf_graph( + name, default_config, postprocess_fn, loss_fn, make_optimizer=None): + + class graph_cls(DynamicTFPolicyGraph): + def __init__(self, obs_space, action_space, config): + config = dict(default_config, **config) + DynamicTFPolicyGraph.__init__( + self, obs_space, action_space, config, loss_fn) + + @override(PolicyGraph) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + return postprocess_fn( + self, sample_batch, other_agent_batches, episode) + + @override(TFPolicyGraph) + def optimizer(self): + if make_optimizer: + return make_optimizer(self) + else: + return TFPolicyGraph.optimizer(self) + + graph_cls.__name__ = name + return graph_cls + + +class DynamicTFPolicyGraph(TFPolicyGraph): + def __init__(self, obs_space, action_space, config, loss_fn): + self.config = config + self._build_loss = loss_fn + + # Setup standard placeholders + obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) + dist_class, self.logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) + prev_actions = ModelCatalog.get_action_placeholder(action_space) + prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") + + # Create the model network and action outputs + self.model = ModelCatalog.get_model({ + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + }, obs_space, action_space, self.logit_dim, self.config["model"]) + self.action_dist = dist_class(self.model.outputs) + + sess = tf.get_default_session() + TFPolicyGraph.__init__( + self, + obs_space, + action_space, + sess, + obs_input=obs, + 
action_sampler=self.action_dist.sample(), + action_prob=self.action_dist.sampled_action_prob(), + loss=None, # dynamically initialized on run + loss_inputs=[], + model=self.model, + state_inputs=self.model.state_in, + state_outputs=self.model.state_out, + prev_action_input=prev_actions, + prev_reward_input=prev_rewards, + seq_lens=self.model.seq_lens, + max_seq_len=config["model"]["max_seq_len"]) + sess.run(tf.global_variables_initializer()) + + @override(PolicyGraph) + def get_initial_state(self): + return self.model.state_init + + def _initialize_loss_if_needed(self, postprocessed_batch): + if self._loss is not None: + return # already created + + with self._sess.graph.as_default(): + unroll_tensors = { + SampleBatch.PREV_ACTIONS: self._prev_action_input, + SampleBatch.PREV_REWARDS: self._prev_reward_input, + SampleBatch.CUR_OBS: self._obs_input, + } + loss_inputs = [ + (SampleBatch.PREV_ACTIONS, self._prev_action_input), + (SampleBatch.PREV_REWARDS, self._prev_reward_input), + (SampleBatch.CUR_OBS, self._obs_input), + ] + + for k, v in postprocessed_batch.items(): + if k in unroll_tensors: + continue + elif v.dtype == np.object: + continue # can't handle arbitrary objects in TF + shape = (None,) + v.shape[1:] + placeholder = tf.placeholder(v.dtype, shape=shape, name=k) + unroll_tensors[k] = placeholder + loss_inputs.append((k, placeholder)) # TODO: prune to used only + + loss = self._build_loss(unroll_tensors, self.action_dist) + TFPolicyGraph._initialize_loss(self, loss, loss_inputs) + self._sess.run(tf.global_variables_initializer()) + + @override(PolicyGraph) + def compute_gradients(self, postprocessed_batch): + self._initialize_loss_if_needed(postprocessed_batch) + return TFPolicyGraph.compute_gradients(self, postprocessed_batch) + + @override(PolicyGraph) + def learn_on_batch(self, postprocessed_batch): + self._initialize_loss_if_needed(postprocessed_batch) + return TFPolicyGraph.learn_on_batch(self, postprocessed_batch) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 2b1eca9e8d5b..e5cf697ac32a 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -112,17 +112,38 @@ def __init__(self, self._prev_action_input = prev_action_input self._prev_reward_input = prev_reward_input self._sampler = action_sampler - self._loss_inputs = loss_inputs - self._loss_input_dict = dict(self._loss_inputs) self._is_training = self._get_is_training_placeholder() self._action_prob = action_prob self._state_inputs = state_inputs or [] self._state_outputs = state_outputs or [] - for i, ph in enumerate(self._state_inputs): - self._loss_input_dict["state_in_{}".format(i)] = ph self._seq_lens = seq_lens self._max_seq_len = max_seq_len self._batch_divisibility_req = batch_divisibility_req + self._update_ops = update_ops + + if loss is not None: + self._initialize_loss(loss, loss_inputs) + else: + self._loss = None + + if len(self._state_inputs) != len(self._state_outputs): + raise ValueError( + "Number of state input and output tensors must match, got: " + "{} vs {}".format(self._state_inputs, self._state_outputs)) + if len(self.get_initial_state()) != len(self._state_inputs): + raise ValueError( + "Length of initial state must match number of state inputs, " + "got: {} vs {}".format(self.get_initial_state(), + self._state_inputs)) + if self._state_inputs and self._seq_lens is None: + raise ValueError( + "seq_lens tensor must be given if state inputs are defined") + + def 
_initialize_loss(self, loss, loss_inputs): + self._loss_inputs = loss_inputs + self._loss_input_dict = dict(self._loss_inputs) + for i, ph in enumerate(self._state_inputs): + self._loss_input_dict["state_in_{}".format(i)] = ph if self.model: self._loss = self.model.custom_loss(loss, self._loss_input_dict) @@ -141,9 +162,7 @@ def __init__(self, self._loss, self._sess) # gather update ops for any batch norm layers - if update_ops: - self._update_ops = update_ops - else: + if not self._update_ops: self._update_ops = tf.get_collection( tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name) if self._update_ops: @@ -153,20 +172,7 @@ def __init__(self, self._apply_op = self.build_apply_op(self._optimizer, self._grads_and_vars) - if len(self._state_inputs) != len(self._state_outputs): - raise ValueError( - "Number of state input and output tensors must match, got: " - "{} vs {}".format(self._state_inputs, self._state_outputs)) - if len(self.get_initial_state()) != len(self._state_inputs): - raise ValueError( - "Length of initial state must match number of state inputs, " - "got: {} vs {}".format(self.get_initial_state(), - self._state_inputs)) - if self._state_inputs and self._seq_lens is None: - raise ValueError( - "seq_lens tensor must be given if state inputs are defined") - - logger.debug("Created {} with loss inputs: {}".format( + logger.debug("Initialized {} with loss inputs: {}".format( self, self._loss_input_dict)) @override(PolicyGraph) @@ -186,18 +192,21 @@ def compute_actions(self, @override(PolicyGraph) def compute_gradients(self, postprocessed_batch): + assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "compute_gradients") fetches = self._build_compute_gradients(builder, postprocessed_batch) return builder.get(fetches) @override(PolicyGraph) def apply_gradients(self, gradients): + assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "apply_gradients") fetches = self._build_apply_gradients(builder, gradients) builder.get(fetches) @override(PolicyGraph) def learn_on_batch(self, postprocessed_batch): + assert self._loss is not None, "Loss not initialized" builder = TFRunBuilder(self._sess, "learn_on_batch") fetches = self._build_learn_on_batch(builder, postprocessed_batch) return builder.get(fetches) From b742efec0ec06947bbbcf60d5e520ceabd0886cc Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 16:51:20 -0700 Subject: [PATCH 02/39] wip --- python/ray/rllib/agents/pg/pg.py | 57 +++++++------------ python/ray/rllib/agents/pg/pg_policy_graph.py | 1 - python/ray/rllib/agents/trainer_template.py | 57 +++++++++++++++++++ .../evaluation/dynamic_tf_policy_graph.py | 21 ++++--- 4 files changed, 89 insertions(+), 47 deletions(-) create mode 100644 python/ray/rllib/agents/trainer_template.py diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 8ca36647dcb6..9223fd8e0ceb 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -2,12 +2,12 @@ from __future__ import division from __future__ import print_function -from ray.rllib.agents.trainer import Trainer, with_common_config +from ray.rllib.agents.trainer import with_common_config +from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.agents.pg.pg_policy_graph import postprocess_advantages, \ policy_gradient_loss, make_optimizer from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph from ray.rllib.optimizers import SyncSamplesOptimizer -from 
ray.rllib.utils.annotations import override # yapf: disable # __sphinx_doc_begin__ @@ -22,49 +22,32 @@ # __sphinx_doc_end__ # yapf: enable - PGPolicyGraph = build_tf_graph( - name="PG", + name="PGPolicyGraph", default_config=DEFAULT_CONFIG, postprocess_fn=postprocess_advantages, loss_fn=policy_gradient_loss, make_optimizer=make_optimizer) -class PGTrainer(Trainer): - """Simple policy gradient agent. +def make_policy_optimizer(local_ev, remote_evs, config): + optimizer_config = dict(config["optimizer"], + **{"train_batch_size": config["train_batch_size"]}) + return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) - This is an example agent to show how to implement algorithms in RLlib. - In most cases, you will probably want to use the PPO agent instead. - """ - _name = "PG" - _default_config = DEFAULT_CONFIG - _policy_graph = PGPolicyGraph +def make_policy_graph(config): + if config["use_pytorch"]: + from ray.rllib.agents.pg.torch_pg_policy_graph import \ + PGTorchPolicyGraph + return PGTorchPolicyGraph + else: + return PGPolicyGraph - @override(Trainer) - def _init(self, config, env_creator): - if config["use_pytorch"]: - from ray.rllib.agents.pg.torch_pg_policy_graph import \ - PGTorchPolicyGraph - policy_cls = PGTorchPolicyGraph - else: - policy_cls = self._policy_graph - self.local_evaluator = self.make_local_evaluator( - env_creator, policy_cls) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, policy_cls, config["num_workers"]) - optimizer_config = dict( - config["optimizer"], - **{"train_batch_size": config["train_batch_size"]}) - self.optimizer = SyncSamplesOptimizer( - self.local_evaluator, self.remote_evaluators, **optimizer_config) - @override(Trainer) - def _train(self): - prev_steps = self.optimizer.num_steps_sampled - self.optimizer.step() - result = self.collect_metrics() - result.update(timesteps_this_iter=self.optimizer.num_steps_sampled - - prev_steps) - return result +PGTrainer = build_trainer( + "PG", + default_config=DEFAULT_CONFIG, + default_policy_graph=PGPolicyGraph, + make_policy_graph=make_policy_graph, + make_policy_optimizer=make_policy_optimizer) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index f3bef7c6c296..3058bcf1a412 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function -import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py new file mode 100644 index 000000000000..9ac15ec5fdc2 --- /dev/null +++ b/python/ray/rllib/agents/trainer_template.py @@ -0,0 +1,57 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.agents.trainer import Trainer +from ray.rllib.utils.annotations import override, DeveloperAPI + + +@DeveloperAPI +def build_trainer(name, + default_config, + default_policy_graph, + make_policy_optimizer, + validate_config=None, + make_policy_graph=None, + before_train_step=None, + after_optimizer_step=None, + after_train_result=None): + class trainer_cls(Trainer): + _name = name + _default_config = default_config + _policy_graph = default_policy_graph + + def _init(self, config, env_creator): + if validate_config: + validate_config(config) + 
if make_policy_graph is None: + policy_graph = default_policy_graph + else: + policy_graph = make_policy_graph(config) + self.local_evaluator = self.make_local_evaluator( + env_creator, policy_graph) + self.remote_evaluators = self.make_remote_evaluators( + env_creator, policy_graph, config["num_workers"]) + if make_policy_optimizer: + self.optimizer = make_policy_optimizer( + self.local_evaluator, self.remote_evaluators, config) + + @override(Trainer) + def _train(self): + if before_train_step: + before_train_step(self) + prev_steps = self.optimizer.num_steps_sampled + fetches = self.optimizer.step() + if after_optimizer_step: + after_optimizer_step(self, fetches) + res = self.collect_metrics() + res.update( + timesteps_this_iter=self.optimizer.num_steps_sampled - + prev_steps, + info=res.get("info", {})) + if after_train_result: + after_train_result(self, res) + return res + + trainer_cls.__name__ = name + return trainer_cls diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 2185ca8785eb..151fa3121955 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -14,22 +14,24 @@ tf = try_import_tf() -def build_tf_graph( - name, default_config, postprocess_fn, loss_fn, make_optimizer=None): - +def build_tf_graph(name, + default_config, + postprocess_fn, + loss_fn, + make_optimizer=None): class graph_cls(DynamicTFPolicyGraph): def __init__(self, obs_space, action_space, config): config = dict(default_config, **config) - DynamicTFPolicyGraph.__init__( - self, obs_space, action_space, config, loss_fn) + DynamicTFPolicyGraph.__init__(self, obs_space, action_space, + config, loss_fn) @override(PolicyGraph) def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None): - return postprocess_fn( - self, sample_batch, other_agent_batches, episode) + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) @override(TFPolicyGraph) def optimizer(self): @@ -108,10 +110,11 @@ def _initialize_loss_if_needed(self, postprocessed_batch): continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF - shape = (None,) + v.shape[1:] + shape = (None, ) + v.shape[1:] placeholder = tf.placeholder(v.dtype, shape=shape, name=k) unroll_tensors[k] = placeholder - loss_inputs.append((k, placeholder)) # TODO: prune to used only + loss_inputs.append((k, + placeholder)) # TODO: prune to used only loss = self._build_loss(unroll_tensors, self.action_dist) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) From d8a722a48b95fb8f82f3441d5542107afd8d7cd3 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 17:20:24 -0700 Subject: [PATCH 03/39] clean up --- python/ray/rllib/agents/pg/pg.py | 19 +--- python/ray/rllib/agents/pg/pg_policy_graph.py | 24 +++-- .../evaluation/dynamic_tf_policy_graph.py | 99 +++++++++++++++---- .../ray/rllib/evaluation/tf_policy_graph.py | 6 +- python/ray/rllib/utils/tracking_dict.py | 19 ++++ 5 files changed, 124 insertions(+), 43 deletions(-) create mode 100644 python/ray/rllib/utils/tracking_dict.py diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 9223fd8e0ceb..347d404cc42f 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -4,9 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.agents.pg.pg_policy_graph 
import postprocess_advantages, \ - policy_gradient_loss, make_optimizer -from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph +from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph from ray.rllib.optimizers import SyncSamplesOptimizer # yapf: disable @@ -22,21 +20,14 @@ # __sphinx_doc_end__ # yapf: enable -PGPolicyGraph = build_tf_graph( - name="PGPolicyGraph", - default_config=DEFAULT_CONFIG, - postprocess_fn=postprocess_advantages, - loss_fn=policy_gradient_loss, - make_optimizer=make_optimizer) - -def make_policy_optimizer(local_ev, remote_evs, config): +def _make_policy_optimizer(local_ev, remote_evs, config): optimizer_config = dict(config["optimizer"], **{"train_batch_size": config["train_batch_size"]}) return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def make_policy_graph(config): +def _make_policy_graph(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import \ PGTorchPolicyGraph @@ -49,5 +40,5 @@ def make_policy_graph(config): "PG", default_config=DEFAULT_CONFIG, default_policy_graph=PGPolicyGraph, - make_policy_graph=make_policy_graph, - make_policy_optimizer=make_policy_optimizer) + make_policy_graph=_make_policy_graph, + make_policy_optimizer=_make_policy_optimizer) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 3058bcf1a412..c045cbfe2274 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -2,8 +2,10 @@ from __future__ import division from __future__ import print_function +import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing +from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf @@ -11,20 +13,28 @@ # The basic policy gradients loss -def policy_gradient_loss(postprocessed_batch, action_dist): +def _policy_gradient_loss(graph, postprocessed_batch): actions = postprocessed_batch[SampleBatch.ACTIONS] advantages = postprocessed_batch[Postprocessing.ADVANTAGES] - return -tf.reduce_mean(action_dist.logp(actions) * advantages) + return -tf.reduce_mean(graph.action_dist.logp(actions) * advantages) # This adds the "advantages" column to the sample batch. 
-def postprocess_advantages(graph, - sample_batch, - other_agent_batches=None, - episode=None): +def _postprocess_advantages(graph, + sample_batch, + other_agent_batches=None, + episode=None): return compute_advantages( sample_batch, 0.0, graph.config["gamma"], use_gae=False) -def make_optimizer(graph): +def _make_optimizer(graph): return tf.train.AdamOptimizer(learning_rate=graph.config["lr"]) + + +PGPolicyGraph = build_tf_graph( + name="PGPolicyGraph", + get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, + postprocess_fn=_postprocess_advantages, + loss_fn=_policy_gradient_loss, + make_optimizer=_make_optimizer) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 151fa3121955..b2d07288efff 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -2,26 +2,49 @@ from __future__ import division from __future__ import print_function +import logging import numpy as np from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override +from ray.rllib.utils.annotations import override, DeveloperAPI from ray.rllib.utils import try_import_tf +from ray.rllib.utils.debug import log_once, summarize +from ray.rllib.utils.tracking_dict import UsageTrackingDict tf = try_import_tf() +logger = logging.getLogger(__name__) + +@DeveloperAPI def build_tf_graph(name, - default_config, - postprocess_fn, + get_default_config, loss_fn, + postprocess_fn=None, make_optimizer=None): + """Helper function for creating a dynamic tf policy graph at runtime. + + Arguments: + name (str): name of the graph (e.g., "PGPolicyGraph") + get_default_config (func): function that returns the default config + to merge with any overrides + loss_fn (func): function that returns a loss tensor the policy graph, + and dict of experience tensor placeholders + postprocess_fn (func): optional experience postprocessing function + that takes the same args as PolicyGraph.postprocess_trajectory() + make_optimizer (func): optional function that returns a tf.Optimizer + given the policy graph object + + Returns: + a DynamicTFPolicyGraph instance that uses the specified args + """ + class graph_cls(DynamicTFPolicyGraph): def __init__(self, obs_space, action_space, config): - config = dict(default_config, **config) + config = dict(get_default_config(), **config) DynamicTFPolicyGraph.__init__(self, obs_space, action_space, config, loss_fn) @@ -30,6 +53,8 @@ def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None): + if not postprocess_fn: + return sample_batch return postprocess_fn(self, sample_batch, other_agent_batches, episode) @@ -45,7 +70,22 @@ def optimizer(self): class DynamicTFPolicyGraph(TFPolicyGraph): - def __init__(self, obs_space, action_space, config, loss_fn): + """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. + + The loss function of this class is not initialized until the first batch + of experiences is collected from the environment. At that point we + dynamically generate TF placeholders based on the batch keys and values. + which are passed into the user-defined loss function. 
+ """ + + def __init__(self, + obs_space, + action_space, + config, + loss_fn, + autosetup_model=True, + action_sampler=None, + action_prob=None): self.config = config self._build_loss = loss_fn @@ -57,13 +97,23 @@ def __init__(self, obs_space, action_space, config, loss_fn): prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") # Create the model network and action outputs - self.model = ModelCatalog.get_model({ - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, obs_space, action_space, self.logit_dim, self.config["model"]) - self.action_dist = dist_class(self.model.outputs) + if autosetup_model: + self.model = ModelCatalog.get_model({ + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + }, obs_space, action_space, self.logit_dim, self.config["model"]) + self.action_dist = dist_class(self.model.outputs) + action_sampler = self.action_dist.sample() + action_prob = self.action_dist.sampled_action_prob() + else: + self.model = None + self.action_dist = None + if not action_sampler: + raise ValueError( + "When autosetup_model=False, action_sampler must be " + "passed in to the constructor.") sess = tf.get_default_session() TFPolicyGraph.__init__( @@ -72,8 +122,8 @@ def __init__(self, obs_space, action_space, config, loss_fn): action_space, sess, obs_input=obs, - action_sampler=self.action_dist.sample(), - action_prob=self.action_dist.sampled_action_prob(), + action_sampler=action_sampler, + action_prob=action_prob, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, @@ -87,18 +137,21 @@ def __init__(self, obs_space, action_space, config, loss_fn): @override(PolicyGraph) def get_initial_state(self): - return self.model.state_init + if self.model: + return self.model.state_init + else: + return [] def _initialize_loss_if_needed(self, postprocessed_batch): if self._loss is not None: return # already created with self._sess.graph.as_default(): - unroll_tensors = { + unroll_tensors = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, - } + }) loss_inputs = [ (SampleBatch.PREV_ACTIONS, self._prev_action_input), (SampleBatch.PREV_REWARDS, self._prev_reward_input), @@ -113,10 +166,16 @@ def _initialize_loss_if_needed(self, postprocessed_batch): shape = (None, ) + v.shape[1:] placeholder = tf.placeholder(v.dtype, shape=shape, name=k) unroll_tensors[k] = placeholder - loss_inputs.append((k, - placeholder)) # TODO: prune to used only - loss = self._build_loss(unroll_tensors, self.action_dist) + if log_once("loss_init"): + logger.info( + "Initializing loss function with inputs:\n\n{}\n".format( + summarize(unroll_tensors))) + + loss = self._build_loss(self, unroll_tensors) + for k in unroll_tensors.accessed_keys: + loss_inputs.append((k, unroll_tensors[k])) + TFPolicyGraph._initialize_loss(self, loss, loss_inputs) self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index e5cf697ac32a..8e07e61284b1 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -172,8 +172,10 @@ def _initialize_loss(self, loss, loss_inputs): self._apply_op = self.build_apply_op(self._optimizer, self._grads_and_vars) - logger.debug("Initialized {} with 
loss inputs: {}".format( - self, self._loss_input_dict)) + if log_once("loss_used"): + logger.info( + "These tensors were used in the loss_fn:\n\n{}\n".format( + summarize(self._loss_input_dict))) @override(PolicyGraph) def compute_actions(self, diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py new file mode 100644 index 000000000000..8b65a4708c2a --- /dev/null +++ b/python/ray/rllib/utils/tracking_dict.py @@ -0,0 +1,19 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class UsageTrackingDict(dict): + """Dict that tracks which keys have been accessed. + + We make the simplifying assumption only __getitem__ is used to access + values. + """ + + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + self.accessed_keys = set() + + def __getitem__(self, key): + self.accessed_keys.add(key) + return dict.__getitem__(self, key) From 169493c2d4605cca5de0bca651340efdbb07895b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 17:27:49 -0700 Subject: [PATCH 04/39] fix --- python/ray/rllib/utils/tracking_dict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py index 8b65a4708c2a..d0a04c4d059e 100644 --- a/python/ray/rllib/utils/tracking_dict.py +++ b/python/ray/rllib/utils/tracking_dict.py @@ -5,7 +5,7 @@ class UsageTrackingDict(dict): """Dict that tracks which keys have been accessed. - + We make the simplifying assumption only __getitem__ is used to access values. """ From fee8ec53443ff439ec5c62b174c2ccb8d84d48de Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 18:02:23 -0700 Subject: [PATCH 05/39] document trainer --- python/ray/rllib/agents/pg/pg.py | 4 +-- python/ray/rllib/agents/trainer_template.py | 31 +++++++++++++++++-- .../evaluation/dynamic_tf_policy_graph.py | 2 +- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 347d404cc42f..d9acebd91e80 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -27,7 +27,7 @@ def _make_policy_optimizer(local_ev, remote_evs, config): return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def _make_policy_graph(config): +def _get_policy_graph(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import \ PGTorchPolicyGraph @@ -40,5 +40,5 @@ def _make_policy_graph(config): "PG", default_config=DEFAULT_CONFIG, default_policy_graph=PGPolicyGraph, - make_policy_graph=_make_policy_graph, + get_policy_graph=_get_policy_graph, make_policy_optimizer=_make_policy_optimizer) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 9ac15ec5fdc2..6c99eb6d4acc 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -12,10 +12,35 @@ def build_trainer(name, default_policy_graph, make_policy_optimizer, validate_config=None, - make_policy_graph=None, + get_policy_graph=None, before_train_step=None, after_optimizer_step=None, after_train_result=None): + """Helper function for defining a custom trainer. 
+ + Arguments: + name (str): name of the trainer (e.g., "PPO") + default_config (dict): the default config dict of the algorithm + default_policy_graph (cls): the default PolicyGraph class to use + make_policy_optimizer (func): function that returns a PolicyOptimizer + instance given (local_evaluator, remote_evaluators, config) + validate_config (func): optional callback that checks a given config + for correctness. It may mutate the config as needed. + get_policy_graph (func): optional callback that takes a config and + returns the policy graph class to override the default with + before_train_step (func): optional callback to run before each train() + call. It takes the trainer instance as an argument. + after_optimizer_step (func): optional callback to run after each + step() call to the policy optimizer. It takes the trainer instance + and the policy gradient fetches as arguments. + after_train_result (func): optional callback to run at the end of each + train() call. It takes the trainer instance and result dict as + arguments, and may mutate the result dict as needed. + + Returns: + a Trainer instance that uses the specified args. + """ + class trainer_cls(Trainer): _name = name _default_config = default_config @@ -24,10 +49,10 @@ class trainer_cls(Trainer): def _init(self, config, env_creator): if validate_config: validate_config(config) - if make_policy_graph is None: + if get_policy_graph is None: policy_graph = default_policy_graph else: - policy_graph = make_policy_graph(config) + policy_graph = get_policy_graph(config) self.local_evaluator = self.make_local_evaluator( env_creator, policy_graph) self.remote_evaluators = self.make_remote_evaluators( diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index b2d07288efff..71e06d7f54c4 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -28,7 +28,7 @@ def build_tf_graph(name, """Helper function for creating a dynamic tf policy graph at runtime. 
Arguments: - name (str): name of the graph (e.g., "PGPolicyGraph") + name (str): name of the graph (e.g., "PPOPolicyGraph") get_default_config (func): function that returns the default config to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, From 03b602166b701a6cb8f9f4fe80f4aad7777190e9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 23:07:38 -0700 Subject: [PATCH 06/39] wip --- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 211 ++++++------------ .../evaluation/dynamic_tf_policy_graph.py | 64 +++++- .../ray/rllib/evaluation/tf_policy_graph.py | 5 + .../rllib/optimizers/multi_gpu_optimizer.py | 18 +- 4 files changed, 142 insertions(+), 156 deletions(-) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 61aced1db740..9a7c46e2854f 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -7,6 +7,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing +from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch @@ -107,6 +108,50 @@ def reduce_mean_valid(t): self.loss = loss +def loss_fn(graph, postprocessed_batch): + if graph.model.state_in: + max_seq_len = tf.reduce_max(graph.model.seq_lens) + mask = tf.sequence_mask(graph.model.seq_lens, max_seq_len) + mask = tf.reshape(mask, [-1]) + else: + mask = tf.ones_like( + postprocessed_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) + + loss_obj = PPOLoss( + graph.action_space, + postprocessed_batch[Postprocessing.VALUE_TARGETS], + postprocessed_batch[Postprocessing.ADVANTAGES], + postprocessed_batch[SampleBatch.ACTIONS], + postprocessed_batch[BEHAVIOUR_LOGITS], + postprocessed_batch[SampleBatch.VF_PREDS], + graph.action_dist, + graph.value_function, + graph.kl_coeff, + mask, + entropy_coeff=graph.config["entropy_coeff"], + clip_param=graph.config["clip_param"], + vf_clip_param=graph.config["vf_clip_param"], + vf_loss_coeff=graph.config["vf_loss_coeff"], + use_gae=graph.config["use_gae"]) + + graph.explained_variance = explained_variance( + postprocessed_batch[Postprocessing.VALUE_TARGETS], + graph.value_function) + + graph.stats_fetches = { + "cur_kl_coeff": graph.kl_coeff, + "cur_lr": tf.cast(graph.cur_lr, tf.float64), + "total_loss": loss_obj.loss, + "policy_loss": loss_obj.mean_policy_loss, + "vf_loss": loss_obj.mean_vf_loss, + "vf_explained_var": graph.explained_variance, + "kl": loss_obj.mean_kl, + "entropy": loss_obj.mean_entropy, + } + + return loss_obj.loss + + class PPOPostprocessing(object): """Adds the policy logits, VF preds, and advantages to the trajectory.""" @@ -115,7 +160,7 @@ def extra_compute_action_fetches(self): return dict( TFPolicyGraph.extra_compute_action_fetches(self), **{ SampleBatch.VF_PREDS: self.value_function, - BEHAVIOUR_LOGITS: self.logits + BEHAVIOUR_LOGITS: self.model.outputs, }) @override(PolicyGraph) @@ -143,83 +188,19 @@ def postprocess_trajectory(self, return batch -class PPOPolicyGraph(LearningRateSchedule, PPOPostprocessing, TFPolicyGraph): +class PPOPolicyGraph( + LearningRateSchedule, PPOPostprocessing, DynamicTFPolicyGraph): + def __init__(self, observation_space, action_space, config, existing_inputs=None): - """ - Arguments: - observation_space: Environment observation space 
specification. - action_space: Environment action space specification. - config (dict): Configuration values for PPO graph. - existing_inputs (list): Optional list of tuples that specify the - placeholders upon which the graph should be built upon. - """ config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) - self.sess = tf.get_default_session() - self.action_space = action_space - self.config = config - self.kl_coeff_val = self.config["kl_coeff"] - self.kl_target = self.config["kl_target"] - dist_cls, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - - if existing_inputs: - obs_ph, value_targets_ph, adv_ph, act_ph, \ - logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \ - existing_inputs[:8] - existing_state_in = existing_inputs[8:-1] - existing_seq_lens = existing_inputs[-1] - else: - obs_ph = tf.placeholder( - tf.float32, - name="obs", - shape=(None, ) + observation_space.shape) - adv_ph = tf.placeholder( - tf.float32, name="advantages", shape=(None, )) - act_ph = ModelCatalog.get_action_placeholder(action_space) - logits_ph = tf.placeholder( - tf.float32, name="logits", shape=(None, logit_dim)) - vf_preds_ph = tf.placeholder( - tf.float32, name="vf_preds", shape=(None, )) - value_targets_ph = tf.placeholder( - tf.float32, name="value_targets", shape=(None, )) - prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) - prev_rewards_ph = tf.placeholder( - tf.float32, [None], name="prev_reward") - existing_state_in = None - existing_seq_lens = None - self.observations = obs_ph - self.prev_actions = prev_actions_ph - self.prev_rewards = prev_rewards_ph - - self.loss_in = [ - (SampleBatch.CUR_OBS, obs_ph), - (Postprocessing.VALUE_TARGETS, value_targets_ph), - (Postprocessing.ADVANTAGES, adv_ph), - (SampleBatch.ACTIONS, act_ph), - (BEHAVIOUR_LOGITS, logits_ph), - (SampleBatch.VF_PREDS, vf_preds_ph), - (SampleBatch.PREV_ACTIONS, prev_actions_ph), - (SampleBatch.PREV_REWARDS, prev_rewards_ph), - ] - self.model = ModelCatalog.get_model( - { - "obs": obs_ph, - "prev_actions": prev_actions_ph, - "prev_rewards": prev_rewards_ph, - "is_training": self._get_is_training_placeholder(), - }, - observation_space, - action_space, - logit_dim, - self.config["model"], - state_in=existing_state_in, - seq_lens=existing_seq_lens) # KL Coefficient + self.kl_coeff_val = config["kl_coeff"] + self.kl_target = config["kl_target"] self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), name="kl_coeff", @@ -227,9 +208,10 @@ def __init__(self, trainable=False, dtype=tf.float32) - self.logits = self.model.outputs - curr_action_dist = dist_cls(self.logits) - self.sampler = curr_action_dist.sample() + DynamicTFPolicyGraph.__init__( + self, observation_space, action_space, config, loss_fn, + existing_inputs=existing_inputs) + if self.config["use_gae"]: if self.config["vf_share_layers"]: self.value_function = self.model.value_function() @@ -249,81 +231,18 @@ def __init__(self, "value_function() method.") with tf.variable_scope("value_function"): self.value_function = ModelCatalog.get_model({ - "obs": obs_ph, - "prev_actions": prev_actions_ph, - "prev_rewards": prev_rewards_ph, + "obs": self._obs_input, + "prev_actions": self._prev_action_input, + "prev_rewards": self._prev_reward_input, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: - self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) - - if 
self.model.state_in: - max_seq_len = tf.reduce_max(self.model.seq_lens) - mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) - mask = tf.reshape(mask, [-1]) - else: - mask = tf.ones_like(adv_ph, dtype=tf.bool) - - self.loss_obj = PPOLoss( - action_space, - value_targets_ph, - adv_ph, - act_ph, - logits_ph, - vf_preds_ph, - curr_action_dist, - self.value_function, - self.kl_coeff, - mask, - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"], - vf_clip_param=self.config["vf_clip_param"], - vf_loss_coeff=self.config["vf_loss_coeff"], - use_gae=self.config["use_gae"]) + self.value_function = tf.zeros(shape=tf.shape(self._obs_input)[:1]) LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) - TFPolicyGraph.__init__( - self, - observation_space, - action_space, - self.sess, - obs_input=obs_ph, - action_sampler=self.sampler, - action_prob=curr_action_dist.sampled_action_prob(), - loss=self.loss_obj.loss, - model=self.model, - loss_inputs=self.loss_in, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, - prev_action_input=prev_actions_ph, - prev_reward_input=prev_rewards_ph, - seq_lens=self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"]) - - self.sess.run(tf.global_variables_initializer()) - self.explained_variance = explained_variance(value_targets_ph, - self.value_function) - self.stats_fetches = { - "cur_kl_coeff": self.kl_coeff, - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "total_loss": self.loss_obj.loss, - "policy_loss": self.loss_obj.mean_policy_loss, - "vf_loss": self.loss_obj.mean_vf_loss, - "vf_explained_var": self.explained_variance, - "kl": self.loss_obj.mean_kl, - "entropy": self.loss_obj.mean_entropy - } - - @override(TFPolicyGraph) - def copy(self, existing_inputs): - """Creates a copy of self using existing input placeholders.""" - return PPOPolicyGraph( - self.observation_space, - self.action_space, - self.config, - existing_inputs=existing_inputs) + self._sess.run(tf.global_variables_initializer()) @override(TFPolicyGraph) def gradients(self, optimizer, loss): @@ -352,19 +271,19 @@ def update_kl(self, sampled_kl): self.kl_coeff_val *= 1.5 elif sampled_kl < 0.5 * self.kl_target: self.kl_coeff_val *= 0.5 - self.kl_coeff.load(self.kl_coeff_val, session=self.sess) + self.kl_coeff.load(self.kl_coeff_val, session=self._sess) return self.kl_coeff_val def _value(self, ob, prev_action, prev_reward, *args): feed_dict = { - self.observations: [ob], - self.prev_actions: [prev_action], - self.prev_rewards: [prev_reward], + self._obs_input: [ob], + self._prev_action_input: [prev_action], + self._prev_reward_input: [prev_reward], self.model.seq_lens: [1] } assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): feed_dict[k] = v - vf = self.sess.run(self.value_function, feed_dict) + vf = self._sess.run(self.value_function, feed_dict) return vf[0] diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 71e06d7f54c4..9141c51e7681 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -24,7 +24,8 @@ def build_tf_graph(name, get_default_config, loss_fn, postprocess_fn=None, - make_optimizer=None): + make_optimizer=None, + extra_action_fetches=None): """Helper function for creating a dynamic tf policy graph at runtime. 
Arguments: @@ -37,6 +38,8 @@ def build_tf_graph(name, that takes the same args as PolicyGraph.postprocess_trajectory() make_optimizer (func): optional function that returns a tf.Optimizer given the policy graph object + extra_action_fetches (func): optional function that returns + a dict of TF fetches given the policy graph object Returns: a DynamicTFPolicyGraph instance that uses the specified args @@ -47,6 +50,11 @@ def __init__(self, obs_space, action_space, config): config = dict(get_default_config(), **config) DynamicTFPolicyGraph.__init__(self, obs_space, action_space, config, loss_fn) + if build_extra_action_fetches is None: + self._extra_action_fetches = {} + else: + self._extra_action_fetches = ( + build_extra_action_fetches(self)) @override(PolicyGraph) def postprocess_trajectory(self, @@ -65,6 +73,12 @@ def optimizer(self): else: return TFPolicyGraph.optimizer(self) + @override(TFPolicyGraph) + def extra_compute_action_fetches(self): + return dict( + TFPolicyGraph.extra_compute_action_fetches(self), + **self._extra_action_fetches) + graph_cls.__name__ = name return graph_cls @@ -85,19 +99,28 @@ def __init__(self, loss_fn, autosetup_model=True, action_sampler=None, - action_prob=None): + action_prob=None, + existing_inputs=None): self.config = config + self.autosetup_model = autosetup_model self._build_loss = loss_fn # Setup standard placeholders - obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape)) - dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") + if existing_inputs is not None: + obs = existing_inputs[SampleBatch.CUR_OBS] + prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS] + prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS] + else: + obs = tf.placeholder( + tf.float32, shape=[None] + list(obs_space.shape), + name="observation") + prev_actions = ModelCatalog.get_action_placeholder(action_space) + prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") # Create the model network and action outputs if autosetup_model: + dist_class, self.logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) self.model = ModelCatalog.get_model({ "obs": obs, "prev_actions": prev_actions, @@ -108,6 +131,7 @@ def __init__(self, action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() else: + self.logit_dim = None self.model = None self.action_dist = None if not action_sampler: @@ -135,6 +159,31 @@ def __init__(self, max_seq_len=config["model"]["max_seq_len"]) sess.run(tf.global_variables_initializer()) + @override(TFPolicyGraph) + def copy(self, existing_inputs): + """Creates a copy of self using existing input placeholders.""" + assert self._loss is not None, "Cannot copy graph before loss init" + if len(self._loss_inputs) != len(existing_inputs): + raise ValueError( + "Tensor list mismatch", self._loss_inputs, + existing_inputs) + for i, (k, v) in enumerate(self._loss_inputs): + if v.shape.as_list() != existing_inputs[i].shape.as_list(): + raise ValueError( + "Tensor shape mismatch", i, k, + v.shape, existing_inputs[i].shape) + input_dict = { + k: existing_inputs[i] for i, (k, _) in enumerate(self._loss_inputs) + } + instance = self.__class__( + self.observation_space, self.action_space, self.config, + existing_inputs=input_dict) + loss = instance._build_loss(instance, input_dict) + 
TFPolicyGraph._initialize_loss( + instance, loss, + [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) + return instance + @override(PolicyGraph) def get_initial_state(self): if self.model: @@ -177,7 +226,6 @@ def _initialize_loss_if_needed(self, postprocessed_batch): loss_inputs.append((k, unroll_tensors[k])) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) - self._sess.run(tf.global_variables_initializer()) @override(PolicyGraph) def compute_gradients(self, postprocessed_batch): diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 8e07e61284b1..d988a4be7e4f 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -125,6 +125,9 @@ def __init__(self, self._initialize_loss(loss, loss_inputs) else: self._loss = None + # TODO(ekl) what is the right way to handle pre-init vars? + self._variables = ray.experimental.tf_utils.TensorFlowVariables( + self._sampler, self._sess) if len(self._state_inputs) != len(self._state_outputs): raise ValueError( @@ -177,6 +180,8 @@ def _initialize_loss(self, loss, loss_inputs): "These tensors were used in the loss_fn:\n\n{}\n".format( summarize(self._loss_input_dict))) + self._sess.run(tf.global_variables_initializer()) + @override(PolicyGraph) def compute_actions(self, obs_batch, diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 45df865e43ff..79340769f284 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -8,6 +8,7 @@ from collections import defaultdict import ray +from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer @@ -92,9 +93,18 @@ def __init__(self, # reuse is set to AUTO_REUSE because Adam nodes are created after # all of the device copies are created. 
self.optimizers = {} + + def _initialize_optimizers_as_needed(self, samples): with self.local_evaluator.tf_sess.graph.as_default(): with self.local_evaluator.tf_sess.as_default(): - for policy_id, policy in self.policies.items(): + for policy_id, sample_batch in samples.policy_batches.items(): + if policy_id in self.optimizers: + continue # already initialized + + policy = self.policies[policy_id] + if isinstance(policy, DynamicTFPolicyGraph): + policy._initialize_loss_if_needed(sample_batch) + with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE): if policy._state_inputs: rnn_inputs = policy._state_inputs + [ @@ -110,7 +120,9 @@ def __init__(self, self.per_device_batch_size, policy.copy)) self.sess = self.local_evaluator.tf_sess - self.sess.run(tf.global_variables_initializer()) + self.sess.run(tf.global_variables_initializer()) # TODO(ekl) how to deal with this + + self.optimizers_initialized = True @override(PolicyOptimizer) def step(self): @@ -148,6 +160,8 @@ def step(self): DEFAULT_POLICY_ID: samples }, samples.count) + self._initialize_optimizers_as_needed(samples) + for policy_id, policy in self.policies.items(): if policy_id not in samples.policy_batches: continue From 48bdcf4a2f2f389327937ad250ce0dd1332c7d83 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 23:24:36 -0700 Subject: [PATCH 07/39] initialize the graph using a fake batch --- .../evaluation/dynamic_tf_policy_graph.py | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 71e06d7f54c4..2dab7c7f8c2b 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -72,10 +72,11 @@ def optimizer(self): class DynamicTFPolicyGraph(TFPolicyGraph): """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. - The loss function of this class is not initialized until the first batch - of experiences is collected from the environment. At that point we - dynamically generate TF placeholders based on the batch keys and values. - which are passed into the user-defined loss function. + Initialization of this class occurs in two phases. + * Phase 1: the model is created and model variables are initialized. + * Phase 2: a fake batch of data is created, sent to the trajectory + postprocessor, and then used to create placeholders for the loss + function. The loss function is initialiezd with these placeholders. 
""" def __init__(self, @@ -133,6 +134,7 @@ def __init__(self, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) + self._initialize_loss_if_needed() sess.run(tf.global_variables_initializer()) @override(PolicyGraph) @@ -142,10 +144,28 @@ def get_initial_state(self): else: return [] - def _initialize_loss_if_needed(self, postprocessed_batch): + def _initialize_loss_if_needed(self): if self._loss is not None: return # already created + def fake_array(tensor): + shape = tensor.shape.as_list() + shape[0] = 1 + return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype) + + fake_batch = { + SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), + SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), + SampleBatch.CUR_OBS: fake_array(self._obs_input), + SampleBatch.ACTIONS: fake_array(self._sampler), + SampleBatch.REWARDS: np.array([0], dtype=np.int32), + } + for k, v in self.extra_compute_action_fetches().items(): + fake_batch[k] = fake_array(v) + + postprocessed_batch = self.postprocess_trajectory( + SampleBatch(fake_batch)) + with self._sess.graph.as_default(): unroll_tensors = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, @@ -177,14 +197,3 @@ def _initialize_loss_if_needed(self, postprocessed_batch): loss_inputs.append((k, unroll_tensors[k])) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) - self._sess.run(tf.global_variables_initializer()) - - @override(PolicyGraph) - def compute_gradients(self, postprocessed_batch): - self._initialize_loss_if_needed(postprocessed_batch) - return TFPolicyGraph.compute_gradients(self, postprocessed_batch) - - @override(PolicyGraph) - def learn_on_batch(self, postprocessed_batch): - self._initialize_loss_if_needed(postprocessed_batch) - return TFPolicyGraph.learn_on_batch(self, postprocessed_batch) From 18b290bff8aa6d02aea89b629ff0089a0292315b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 15 May 2019 23:30:01 -0700 Subject: [PATCH 08/39] clean up dynamic init --- .../ray/rllib/evaluation/dynamic_tf_policy_graph.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 2dab7c7f8c2b..14964348a2e8 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -116,6 +116,7 @@ def __init__(self, "When autosetup_model=False, action_sampler must be " "passed in to the constructor.") + # Phase 1 init sess = tf.get_default_session() TFPolicyGraph.__init__( self, @@ -134,7 +135,9 @@ def __init__(self, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) - self._initialize_loss_if_needed() + + # Phase 2 init + self._initialize_loss_dynamic() sess.run(tf.global_variables_initializer()) @override(PolicyGraph) @@ -144,10 +147,7 @@ def get_initial_state(self): else: return [] - def _initialize_loss_if_needed(self): - if self._loss is not None: - return # already created - + def _initialize_loss_dynamic(self): def fake_array(tensor): shape = tensor.shape.as_list() shape[0] = 1 @@ -157,8 +157,10 @@ def fake_array(tensor): SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input), SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), SampleBatch.CUR_OBS: fake_array(self._obs_input), + SampleBatch.NEXT_OBS: fake_array(self._obs_input), SampleBatch.ACTIONS: fake_array(self._sampler), 
SampleBatch.REWARDS: np.array([0], dtype=np.int32), + SampleBatch.DONES: np.array([False], dtype=np.bool), } for k, v in self.extra_compute_action_fetches().items(): fake_batch[k] = fake_array(v) From 26367032e64652a905c5a3aa4b4111d80a43c8a8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 00:32:55 -0700 Subject: [PATCH 09/39] wip --- python/ray/rllib/optimizers/multi_gpu_optimizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 23ee1833b9f0..45df865e43ff 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -6,7 +6,6 @@ import math import numpy as np from collections import defaultdict -import tensorflow as tf import ray from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY @@ -19,6 +18,9 @@ from ray.rllib.utils.timer import TimerStat from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \ MultiAgentBatch +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() logger = logging.getLogger(__name__) From d239a79efb0a49ec97ecef8c69297a990f600b34 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 00:34:29 -0700 Subject: [PATCH 10/39] spelling --- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 7f298d3abb09..f678df9cd34c 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -90,7 +90,7 @@ class DynamicTFPolicyGraph(TFPolicyGraph): * Phase 1: the model is created and model variables are initialized. * Phase 2: a fake batch of data is created, sent to the trajectory postprocessor, and then used to create placeholders for the loss - function. The loss function is initialiezd with these placeholders. + function. The loss function is initialized with these placeholders. 
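The placeholder-tracing idea behind phase 2 can be sketched outside of RLlib in a few lines of TF 1.x. Everything below (TrackingDict, make_loss_inputs, the toy loss) is illustrative and not part of this patch; the point is simply that one placeholder is created per batch column, and only the columns the loss function actually reads become loss inputs.

import numpy as np
import tensorflow as tf  # TF 1.x API


class TrackingDict(dict):
    """Toy stand-in for UsageTrackingDict: records which keys were read."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed_keys = set()

    def __getitem__(self, key):
        self.accessed_keys.add(key)
        return dict.__getitem__(self, key)


def make_loss_inputs(fake_batch, loss_fn):
    # One placeholder per column of the (postprocessed) fake batch.
    tensors = TrackingDict({
        k: tf.placeholder(tf.as_dtype(v.dtype), (None, ) + v.shape[1:], name=k)
        for k, v in fake_batch.items()
    })
    loss = loss_fn(tensors)
    # Only the columns the loss actually touched need to be fed at train time.
    return loss, [(k, tensors[k]) for k in sorted(tensors.accessed_keys)]


fake_batch = {
    "obs": np.zeros((1, 4), dtype=np.float32),
    "actions": np.zeros((1, ), dtype=np.int64),
    "advantages": np.zeros((1, ), dtype=np.float32),
}
loss, loss_inputs = make_loss_inputs(
    fake_batch, lambda t: -tf.reduce_mean(t["advantages"]))
print([k for k, _ in loss_inputs])  # ['advantages'] -- 'obs' and 'actions' were never read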
""" def __init__(self, From 3ff0d0883b8ccdb9a2363e0323f348a2acccdfde Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 12:17:18 -0700 Subject: [PATCH 11/39] use builder for ppo pol graph --- python/ray/rllib/agents/pg/pg_policy_graph.py | 20 +- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 252 +++++++++--------- .../evaluation/dynamic_tf_policy_graph.py | 95 ++----- .../ray/rllib/evaluation/tf_policy_graph.py | 4 +- .../evaluation/tf_policy_graph_template.py | 120 +++++++++ python/ray/rllib/optimizers/multi_gpu_impl.py | 2 +- .../rllib/optimizers/multi_gpu_optimizer.py | 2 +- 7 files changed, 276 insertions(+), 219 deletions(-) create mode 100644 python/ray/rllib/evaluation/tf_policy_graph_template.py diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index c045cbfe2274..4858d4a5e87f 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.dynamic_tf_policy_graph import build_tf_graph +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf @@ -13,23 +13,23 @@ # The basic policy gradients loss -def _policy_gradient_loss(graph, postprocessed_batch): - actions = postprocessed_batch[SampleBatch.ACTIONS] - advantages = postprocessed_batch[Postprocessing.ADVANTAGES] - return -tf.reduce_mean(graph.action_dist.logp(actions) * advantages) +def _policy_gradient_loss(policy, batch_tensors): + actions = batch_tensors[SampleBatch.ACTIONS] + advantages = batch_tensors[Postprocessing.ADVANTAGES] + return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) # This adds the "advantages" column to the sample batch. 
-def _postprocess_advantages(graph, +def _postprocess_advantages(policy, sample_batch, other_agent_batches=None, episode=None): return compute_advantages( - sample_batch, 0.0, graph.config["gamma"], use_gae=False) + sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def _make_optimizer(graph): - return tf.train.AdamOptimizer(learning_rate=graph.config["lr"]) +def _make_optimizer(policy): + return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) PGPolicyGraph = build_tf_graph( @@ -37,4 +37,4 @@ def _make_optimizer(graph): get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=_postprocess_advantages, loss_fn=_policy_gradient_loss, - make_optimizer=_make_optimizer) + optimizer_fn=_make_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index b34ebc45482a..b322dba122b0 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -7,14 +7,10 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ - LearningRateSchedule +from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -108,95 +104,103 @@ def reduce_mean_valid(t): self.loss = loss -def _build_ppo_loss(graph, postprocessed_batch): - if graph.model.state_in: - max_seq_len = tf.reduce_max(graph.model.seq_lens) - mask = tf.sequence_mask(graph.model.seq_lens, max_seq_len) +def _build_ppo_loss(policy, batch_tensors): + if policy.model.state_in: + max_seq_len = tf.reduce_max(policy.model.seq_lens) + mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like( - postprocessed_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) - - loss_obj = PPOLoss( - graph.action_space, - postprocessed_batch[Postprocessing.VALUE_TARGETS], - postprocessed_batch[Postprocessing.ADVANTAGES], - postprocessed_batch[SampleBatch.ACTIONS], - postprocessed_batch[BEHAVIOUR_LOGITS], - postprocessed_batch[SampleBatch.VF_PREDS], - graph.action_dist, - graph.value_function, - graph.kl_coeff, + batch_tensors[Postprocessing.ADVANTAGES], dtype=tf.bool) + + policy.loss_obj = PPOLoss( + policy.action_space, + batch_tensors[Postprocessing.VALUE_TARGETS], + batch_tensors[Postprocessing.ADVANTAGES], + batch_tensors[SampleBatch.ACTIONS], + batch_tensors[BEHAVIOUR_LOGITS], + batch_tensors[SampleBatch.VF_PREDS], + policy.action_dist, + policy.value_function, + policy.kl_coeff, mask, - entropy_coeff=graph.config["entropy_coeff"], - clip_param=graph.config["clip_param"], - vf_clip_param=graph.config["vf_clip_param"], - vf_loss_coeff=graph.config["vf_loss_coeff"], - use_gae=graph.config["use_gae"]) - - graph.explained_variance = explained_variance( - postprocessed_batch[Postprocessing.VALUE_TARGETS], - graph.value_function) - - graph.stats_fetches = { - "cur_kl_coeff": graph.kl_coeff, - "cur_lr": 
tf.cast(graph.cur_lr, tf.float64), - "total_loss": loss_obj.loss, - "policy_loss": loss_obj.mean_policy_loss, - "vf_loss": loss_obj.mean_vf_loss, - "vf_explained_var": graph.explained_variance, - "kl": loss_obj.mean_kl, - "entropy": loss_obj.mean_entropy, + entropy_coeff=policy.config["entropy_coeff"], + clip_param=policy.config["clip_param"], + vf_clip_param=policy.config["vf_clip_param"], + vf_loss_coeff=policy.config["vf_loss_coeff"], + use_gae=policy.config["use_gae"]) + + return policy.loss_obj.loss + + +def _build_ppo_stats(policy, batch_tensors): + policy.explained_variance = explained_variance( + batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) + + stats_fetches = { + "cur_kl_coeff": policy.kl_coeff, + "cur_lr": tf.cast(policy.cur_lr, tf.float64), + "total_loss": policy.loss_obj.loss, + "policy_loss": policy.loss_obj.mean_policy_loss, + "vf_loss": policy.loss_obj.mean_vf_loss, + "vf_explained_var": policy.explained_variance, + "kl": policy.loss_obj.mean_kl, + "entropy": policy.loss_obj.mean_entropy, } - return loss_obj.loss + return stats_fetches -class PPOPostprocessing(object): +def _build_ppo_action_fetches(policy): + """Adds value function and logits outputs to experience batches.""" + return { + SampleBatch.VF_PREDS: policy.value_function, + BEHAVIOUR_LOGITS: policy.model.outputs, + } + + +def _postprocess_ppo_gae(policy, + sample_batch, + other_agent_batches=None, + episode=None): """Adds the policy logits, VF preds, and advantages to the trajectory.""" - @override(TFPolicyGraph) - def extra_compute_action_fetches(self): - return dict( - TFPolicyGraph.extra_compute_action_fetches(self), **{ - SampleBatch.VF_PREDS: self.value_function, - BEHAVIOUR_LOGITS: self.model.outputs, - }) - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - completed = sample_batch["dones"][-1] - if completed: - last_r = 0.0 - else: - next_state = [] - for i in range(len(self.model.state_in)): - next_state.append([sample_batch["state_out_{}".format(i)][-1]]) - last_r = self._value(sample_batch[SampleBatch.NEXT_OBS][-1], - sample_batch[SampleBatch.ACTIONS][-1], - sample_batch[SampleBatch.REWARDS][-1], - *next_state) - batch = compute_advantages( - sample_batch, - last_r, - self.config["gamma"], - self.config["lambda"], - use_gae=self.config["use_gae"]) - return batch - - -class PPOPolicyGraph(LearningRateSchedule, PPOPostprocessing, - DynamicTFPolicyGraph): - def __init__(self, - observation_space, - action_space, - config, - existing_inputs=None): - config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config) + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + next_state = [] + for i in range(len(policy.model.state_in)): + next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1], + sample_batch[SampleBatch.ACTIONS][-1], + sample_batch[SampleBatch.REWARDS][-1], + *next_state) + batch = compute_advantages( + sample_batch, + last_r, + policy.config["gamma"], + policy.config["lambda"], + use_gae=policy.config["use_gae"]) + return batch + + +def _build_ppo_gradients(policy, optimizer, loss): + if policy.config["grad_clip"] is not None: + policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + tf.get_variable_scope().name) + grads = tf.gradients(loss, policy.var_list) + policy.grads, _ = tf.clip_by_global_norm(grads, + policy.config["grad_clip"]) + clipped_grads = list(zip(policy.grads, 
policy.var_list)) + return clipped_grads + else: + return optimizer.compute_gradients( + loss, colocate_gradients_with_ops=True) + +class KLCoeffMixin(object): + def __init__(self, config): # KL Coefficient self.kl_coeff_val = config["kl_coeff"] self.kl_target = config["kl_target"] @@ -207,20 +211,22 @@ def __init__(self, trainable=False, dtype=tf.float32) - DynamicTFPolicyGraph.__init__( - self, - observation_space, - action_space, - config, - _build_ppo_loss, - existing_inputs=existing_inputs, - autoinit_loss=False) - - if self.config["use_gae"]: - if self.config["vf_share_layers"]: + def update_kl(self, sampled_kl): + if sampled_kl > 2.0 * self.kl_target: + self.kl_coeff_val *= 1.5 + elif sampled_kl < 0.5 * self.kl_target: + self.kl_coeff_val *= 0.5 + self.kl_coeff.load(self.kl_coeff_val, session=self._sess) + return self.kl_coeff_val + + +class ValueNetworkMixin(object): + def __init__(self, obs_space, action_space, config): + if config["use_gae"]: + if config["vf_share_layers"]: self.value_function = self.model.value_function() else: - vf_config = self.config["model"].copy() + vf_config = config["model"].copy() # Do not split the last layer of the value function into # mean parameters and standard deviation parameters and # do not make the standard deviations free variables. @@ -239,45 +245,11 @@ def __init__(self, "prev_actions": self._prev_action_input, "prev_rewards": self._prev_reward_input, "is_training": self._get_is_training_placeholder(), - }, observation_space, action_space, 1, vf_config).outputs + }, obs_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.zeros(shape=tf.shape(self._obs_input)[:1]) - LearningRateSchedule.__init__(self, self.config["lr"], - self.config["lr_schedule"]) - self._initialize_loss() - - @override(TFPolicyGraph) - def gradients(self, optimizer, loss): - if self.config["grad_clip"] is not None: - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - grads = tf.gradients(loss, self.var_list) - self.grads, _ = tf.clip_by_global_norm(grads, - self.config["grad_clip"]) - clipped_grads = list(zip(self.grads, self.var_list)) - return clipped_grads - else: - return optimizer.compute_gradients( - loss, colocate_gradients_with_ops=True) - - @override(PolicyGraph) - def get_initial_state(self): - return self.model.state_init - - @override(TFPolicyGraph) - def extra_compute_grad_fetches(self): - return {LEARNER_STATS_KEY: self.stats_fetches} - - def update_kl(self, sampled_kl): - if sampled_kl > 2.0 * self.kl_target: - self.kl_coeff_val *= 1.5 - elif sampled_kl < 0.5 * self.kl_target: - self.kl_coeff_val *= 0.5 - self.kl_coeff.load(self.kl_coeff_val, session=self._sess) - return self.kl_coeff_val - def _value(self, ob, prev_action, prev_reward, *args): feed_dict = { self._obs_input: [ob], @@ -291,3 +263,21 @@ def _value(self, ob, prev_action, prev_reward, *args): feed_dict[k] = v vf = self._sess.run(self.value_function, feed_dict) return vf[0] + + +def _setup_mixins(policy, obs_space, action_space, config): + ValueNetworkMixin.__init__(policy, obs_space, action_space, config) + KLCoeffMixin.__init__(policy, config) + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + + +PPOPolicyGraph = build_tf_graph( + name="PPOPolicyGraph", + get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, + loss_fn=_build_ppo_loss, + stats_fn=_build_ppo_stats, + extra_action_fetches_fn=_build_ppo_action_fetches, + 
postprocess_fn=_postprocess_ppo_gae, + gradients_fn=_build_ppo_gradients, + pre_loss_init_fn=_setup_mixins, + mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index f678df9cd34c..49da52a25636 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -21,68 +21,6 @@ @DeveloperAPI -def build_tf_graph(name, - get_default_config, - loss_fn, - postprocess_fn=None, - make_optimizer=None, - make_extra_action_fetches=None): - """Helper function for creating a dynamic tf policy graph at runtime. - - Arguments: - name (str): name of the graph (e.g., "PPOPolicyGraph") - get_default_config (func): function that returns the default config - to merge with any overrides - loss_fn (func): function that returns a loss tensor the policy graph, - and dict of experience tensor placeholders - postprocess_fn (func): optional experience postprocessing function - that takes the same args as PolicyGraph.postprocess_trajectory() - make_optimizer (func): optional function that returns a tf.Optimizer - given the policy graph object - make_extra_action_fetches (func): optional function that returns - a dict of TF fetches given the policy graph object - - Returns: - a DynamicTFPolicyGraph instance that uses the specified args - """ - - class graph_cls(DynamicTFPolicyGraph): - def __init__(self, obs_space, action_space, config): - config = dict(get_default_config(), **config) - if make_extra_action_fetches is None: - self._extra_action_fetches = {} - else: - self._extra_action_fetches = make_extra_action_fetches(self) - DynamicTFPolicyGraph.__init__(self, obs_space, action_space, - config, loss_fn) - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - if not postprocess_fn: - return sample_batch - return postprocess_fn(self, sample_batch, other_agent_batches, - episode) - - @override(TFPolicyGraph) - def optimizer(self): - if make_optimizer: - return make_optimizer(self) - else: - return TFPolicyGraph.optimizer(self) - - @override(TFPolicyGraph) - def extra_compute_action_fetches(self): - return dict( - TFPolicyGraph.extra_compute_action_fetches(self), - **self._extra_action_fetches) - - graph_cls.__name__ = name - return graph_cls - - class DynamicTFPolicyGraph(TFPolicyGraph): """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. 
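A note on the mixins=[...] argument used in the PPO conversion above: the template file added later in this patch builds the generated class as class graph_cls(*mixins, DynamicTFPolicyGraph), so the listed mixins sit ahead of the dynamic graph in the MRO and their method overrides take precedence. A tiny pure-Python sketch of that ordering (all names below are made up for illustration):

class BasePolicyGraph(object):
    def optimizer(self):
        return "plain Adam"


class LRScheduleMixin(object):
    def optimizer(self):
        return "Adam with schedule"


# Mirrors `class graph_cls(*mixins, DynamicTFPolicyGraph)` in the template.
GeneratedGraph = type("GeneratedPolicyGraph",
                      (LRScheduleMixin, BasePolicyGraph), {})

assert GeneratedGraph().optimizer() == "Adam with schedule"
assert [c.__name__ for c in GeneratedGraph.__mro__] == [
    "GeneratedPolicyGraph", "LRScheduleMixin", "BasePolicyGraph", "object"]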
@@ -98,14 +36,16 @@ def __init__(self, action_space, config, loss_fn, + stats_fn=None, autosetup_model=True, - autoinit_loss=True, + pre_loss_init_fn=None, action_sampler=None, action_prob=None, existing_inputs=None): self.config = config self.autosetup_model = autosetup_model - self._build_loss = loss_fn + self._loss_fn = loss_fn + self._stats_fn = stats_fn # Setup standard placeholders if existing_inputs is not None: @@ -182,7 +122,9 @@ def __init__(self, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) - if autoinit_loss: + # Phase 2 init + pre_loss_init_fn(self, obs_space, action_space, config) + if not existing_inputs: self._initialize_loss() @override(TFPolicyGraph) @@ -201,7 +143,7 @@ def copy(self, existing_inputs): if v.shape.as_list() != existing_inputs[i].shape.as_list(): raise ValueError("Tensor shape mismatch", i, k, v.shape, existing_inputs[i].shape) - # by convention, the loss inputs are followed by state inputs and then + # By convention, the loss inputs are followed by state inputs and then # the seq len tensor rnn_inputs = [] for i in range(len(self._state_inputs)): @@ -217,7 +159,10 @@ def copy(self, existing_inputs): self.action_space, self.config, existing_inputs=input_dict) - loss = instance._build_loss(instance, input_dict) + loss = instance._loss_fn(instance, input_dict) + if instance._stats_fn: + instance._stats_fetches.update( + instance._stats_fn(instance, input_dict)) TFPolicyGraph._initialize_loss( instance, loss, [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) @@ -259,7 +204,7 @@ def fake_array(tensor): postprocessed_batch = self.postprocess_trajectory( SampleBatch(dummy_batch)) - loss_input = UsageTrackingDict({ + batch_tensors = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, @@ -271,22 +216,24 @@ def fake_array(tensor): ] for k, v in postprocessed_batch.items(): - if k in loss_input: + if k in batch_tensors: continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF shape = (None, ) + v.shape[1:] placeholder = tf.placeholder(v.dtype, shape=shape, name=k) - loss_input[k] = placeholder + batch_tensors[k] = placeholder if log_once("loss_init"): logger.info( "Initializing loss function with dummy input:\n\n{}\n".format( - summarize(loss_input))) + summarize(batch_tensors))) - loss = self._build_loss(self, loss_input) - for k in sorted(loss_input.accessed_keys): - loss_inputs.append((k, loss_input[k])) + loss = self._loss_fn(self, batch_tensors) + if self._stats_fn: + self._stats_fetches.update(self._stats_fn(self, batch_tensors)) + for k in sorted(batch_tensors.accessed_keys): + loss_inputs.append((k, batch_tensors[k])) TFPolicyGraph._initialize_loss(self, loss, loss_inputs) self._sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index bc15dd35db9b..a33626b2df52 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -120,6 +120,7 @@ def __init__(self, self._max_seq_len = max_seq_len self._batch_divisibility_req = batch_divisibility_req self._update_ops = update_ops + self._stats_fetches = {} if loss is not None: self._initialize_loss(loss, loss_inputs) @@ -147,10 +148,9 @@ def _initialize_loss(self, loss, loss_inputs): if self.model: self._loss = self.model.custom_loss(loss, self._loss_input_dict) - 
self._stats_fetches = {"model": self.model.custom_stats()} + self._stats_fetches.update({"model": self.model.custom_stats()}) else: self._loss = loss - self._stats_fetches = {} self._optimizer = self.optimizer() self._grads_and_vars = [ diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py new file mode 100644 index 000000000000..549b508e791e --- /dev/null +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -0,0 +1,120 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.evaluation.dynamic_tf_policy_graph import DynamicTFPolicyGraph +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph +from ray.rllib.utils.annotations import override, DeveloperAPI + + +@DeveloperAPI +def build_tf_graph(name, + get_default_config, + loss_fn, + stats_fn=None, + extra_action_fetches_fn=None, + postprocess_fn=None, + optimizer_fn=None, + gradients_fn=None, + pre_init_fn=None, + pre_loss_init_fn=None, + post_init_fn=None, + mixins=None): + """Helper function for creating a dynamic tf policy graph at runtime. + + Arguments: + name (str): name of the graph (e.g., "PPOPolicyGraph") + get_default_config (func): function that returns the default config + to merge with any overrides + loss_fn (func): function that returns a loss tensor the policy graph, + and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and batch input tensors + extra_action_fetches_fn (func): optional function that returns + a dict of TF fetches given the policy graph object + postprocess_fn (func): optional experience postprocessing function + that takes the same args as PolicyGraph.postprocess_trajectory() + optimizer_fn (func): optional function that returns a tf.Optimizer + given the policy graph object + gradients_fn (func): optional function that returns a list of gradients + given a tf optimizer and loss tensor. 
If not specified, this + defaults to optimizer.compute_gradients(loss) + pre_init_fn (func): optional function to run at the beginning of + __init__ that takes the same arguments as __init__ + pre_loss_init_fn (func): optional function to run prior to loss + init that takes the same arguments as __init__ + post_init_fn (func): optional function to run at the end of __init__ + that takes the same arguments as __init__ + mixins (list): list of any class mixins for the returned policy class + + Returns: + a DynamicTFPolicyGraph instance that uses the specified args + """ + + if mixins is None: + mixins = [] + + class graph_cls(*mixins, DynamicTFPolicyGraph): + def __init__(self, + obs_space, + action_space, + config, + existing_inputs=None): + config = dict(get_default_config(), **config) + + if pre_init_fn: + pre_init_fn(self, obs_space, action_space, config) + + def before_loss_init(policy, obs_space, action_space, config): + if pre_loss_init_fn: + pre_loss_init_fn(policy, obs_space, action_space, config) + if extra_action_fetches_fn is None: + self._extra_action_fetches = {} + else: + self._extra_action_fetches = extra_action_fetches_fn(self) + + DynamicTFPolicyGraph.__init__( + self, + obs_space, + action_space, + config, + loss_fn, + stats_fn, + pre_loss_init_fn=before_loss_init, + existing_inputs=existing_inputs) + if post_init_fn: + post_init_fn(self, obs_space, action_space, config) + + @override(PolicyGraph) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not postprocess_fn: + return sample_batch + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) + + @override(TFPolicyGraph) + def optimizer(self): + if optimizer_fn: + return optimizer_fn(self) + else: + return TFPolicyGraph.optimizer(self) + + @override(TFPolicyGraph) + def gradients(self, optimizer, loss): + if gradients_fn: + return gradients_fn(self, optimizer, loss) + else: + return TFPolicyGraph.gradients(self, optimizer, loss) + + @override(TFPolicyGraph) + def extra_compute_action_fetches(self): + return dict( + TFPolicyGraph.extra_compute_action_fetches(self), + **self._extra_action_fetches) + + graph_cls.__name__ = name + return graph_cls diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py index d892dbe7dbac..8d1bbd4fb54d 100644 --- a/python/ray/rllib/optimizers/multi_gpu_impl.py +++ b/python/ray/rllib/optimizers/multi_gpu_impl.py @@ -255,7 +255,7 @@ def optimize(self, sess, batch_index): fetches = {"train": self._train_op} for tower in self._towers: - fetches.update(tower.loss_graph.extra_compute_grad_fetches()) + fetches.update(tower.loss_graph._get_grad_and_stats_fetches()) return sess.run(fetches, feed_dict=feed_dict) diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 45df865e43ff..de2671e6a932 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -222,6 +222,6 @@ def stats(self): def _averaged(kv): out = {} for k, v in kv.items(): - if v[0] is not None: + if v[0] is not None and not isinstance(v[0], dict): out[k] = np.mean(v) return out From e218d2c59ad35cd09648e10bfcaa09c541610cef Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:37:37 -0700 Subject: [PATCH 12/39] add ppo graph --- python/ray/rllib/agents/ppo/ppo.py | 212 ++++++++++++++--------------- 1 file changed, 103 insertions(+), 109 deletions(-) diff --git 
a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 8f69c91149e7..54f7a579fcc4 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -4,10 +4,10 @@ import logging -from ray.rllib.agents import Trainer, with_common_config +from ray.rllib.agents import with_common_config from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer -from ray.rllib.utils.annotations import override logger = logging.getLogger(__name__) @@ -63,110 +63,104 @@ # yapf: enable -class PPOTrainer(Trainer): - """Multi-GPU optimized implementation of PPO in TensorFlow.""" - - _name = "PPO" - _default_config = DEFAULT_CONFIG - _policy_graph = PPOPolicyGraph - - @override(Trainer) - def _init(self, config, env_creator): - self._validate_config() - self.local_evaluator = self.make_local_evaluator( - env_creator, self._policy_graph) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, self._policy_graph, config["num_workers"]) - if config["simple_optimizer"]: - self.optimizer = SyncSamplesOptimizer( - self.local_evaluator, - self.remote_evaluators, - num_sgd_iter=config["num_sgd_iter"], - train_batch_size=config["train_batch_size"]) - else: - self.optimizer = LocalMultiGPUOptimizer( - self.local_evaluator, - self.remote_evaluators, - sgd_batch_size=config["sgd_minibatch_size"], - num_sgd_iter=config["num_sgd_iter"], - num_gpus=config["num_gpus"], - sample_batch_size=config["sample_batch_size"], - num_envs_per_worker=config["num_envs_per_worker"], - train_batch_size=config["train_batch_size"], - standardize_fields=["advantages"], - straggler_mitigation=config["straggler_mitigation"]) - - @override(Trainer) - def _train(self): - if "observation_filter" not in self.raw_user_config: - # TODO(ekl) remove this message after a few releases - logger.info( - "Important! Since 0.7.0, observation normalization is no " - "longer enabled by default. To enable running-mean " - "normalization, set 'observation_filter': 'MeanStdFilter'. " - "You can ignore this message if your environment doesn't " - "require observation normalization.") - prev_steps = self.optimizer.num_steps_sampled - fetches = self.optimizer.step() - if "kl" in fetches: - # single-agent - self.local_evaluator.for_policy( - lambda pi: pi.update_kl(fetches["kl"])) - else: - - def update(pi, pi_id): - if pi_id in fetches: - pi.update_kl(fetches[pi_id]["kl"]) - else: - logger.debug( - "No data for {}, not updating kl".format(pi_id)) - - # multi-agent - self.local_evaluator.foreach_trainable_policy(update) - res = self.collect_metrics() - res.update( - timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps, - info=res.get("info", {})) - - # Warn about bad clipping configs - if self.config["vf_clip_param"] <= 0: - rew_scale = float("inf") - elif res["policy_reward_mean"]: - rew_scale = 0 # punt on handling multiagent case - else: - rew_scale = round( - abs(res["episode_reward_mean"]) / self.config["vf_clip_param"], - 0) - if rew_scale > 200: - logger.warning( - "The magnitude of your environment rewards are more than " - "{}x the scale of `vf_clip_param`. ".format(rew_scale) + - "This means that it will take more than " - "{} iterations for your value ".format(rew_scale) + - "function to converge. 
If this is not intended, consider " - "increasing `vf_clip_param`.") - return res - - def _validate_config(self): - if self.config["entropy_coeff"] < 0: - raise DeprecationWarning("entropy_coeff must be >= 0") - if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]: - raise ValueError( - "Minibatch size {} must be <= train batch size {}.".format( - self.config["sgd_minibatch_size"], - self.config["train_batch_size"])) - if (self.config["batch_mode"] == "truncate_episodes" - and not self.config["use_gae"]): - raise ValueError( - "Episode truncation is not supported without a value " - "function. Consider setting batch_mode=complete_episodes.") - if (self.config["multiagent"]["policy_graphs"] - and not self.config["simple_optimizer"]): - logger.info( - "In multi-agent mode, policies will be optimized sequentially " - "by the multi-GPU optimizer. Consider setting " - "simple_optimizer=True if this doesn't work for you.") - if not self.config["vf_share_layers"]: - logger.warning( - "FYI: By default, the value function will not share layers " - "with the policy model ('vf_share_layers': False).") +def _make_optimizer(local_evaluator, remote_evaluators, config): + if config["simple_optimizer"]: + return SyncSamplesOptimizer( + local_evaluator, + remote_evaluators, + num_sgd_iter=config["num_sgd_iter"], + train_batch_size=config["train_batch_size"]) + + return LocalMultiGPUOptimizer( + local_evaluator, + remote_evaluators, + sgd_batch_size=config["sgd_minibatch_size"], + num_sgd_iter=config["num_sgd_iter"], + num_gpus=config["num_gpus"], + sample_batch_size=config["sample_batch_size"], + num_envs_per_worker=config["num_envs_per_worker"], + train_batch_size=config["train_batch_size"], + standardize_fields=["advantages"], + straggler_mitigation=config["straggler_mitigation"]) + + +def _update_kl(trainer, fetches): + if "kl" in fetches: + # single-agent + trainer.local_evaluator.for_policy( + lambda pi: pi.update_kl(fetches["kl"])) + else: + + def update(pi, pi_id): + if pi_id in fetches: + pi.update_kl(fetches[pi_id]["kl"]) + else: + logger.debug("No data for {}, not updating kl".format(pi_id)) + + # multi-agent + trainer.local_evaluator.foreach_trainable_policy(update) + + +def _warn_about_obs_filter(trainer): + if "observation_filter" not in trainer.raw_user_config: + # TODO(ekl) remove this message after a few releases + logger.info( + "Important! Since 0.7.0, observation normalization is no " + "longer enabled by default. To enable running-mean " + "normalization, set 'observation_filter': 'MeanStdFilter'. " + "You can ignore this message if your environment doesn't " + "require observation normalization.") + + +def _warn_about_bad_reward_scales(trainer, result): + # Warn about bad clipping configs + if trainer.config["vf_clip_param"] <= 0: + rew_scale = float("inf") + elif result["policy_reward_mean"]: + rew_scale = 0 # punt on handling multiagent case + else: + rew_scale = round( + abs(result["episode_reward_mean"]) / + trainer.config["vf_clip_param"], 0) + if rew_scale > 200: + logger.warning( + "The magnitude of your environment rewards are more than " + "{}x the scale of `vf_clip_param`. ".format(rew_scale) + + "This means that it will take more than " + "{} iterations for your value ".format(rew_scale) + + "function to converge. 
If this is not intended, consider " + "increasing `vf_clip_param`.") + + +def _validate_config(config): + if config["entropy_coeff"] < 0: + raise DeprecationWarning("entropy_coeff must be >= 0") + if config["sgd_minibatch_size"] > config["train_batch_size"]: + raise ValueError( + "Minibatch size {} must be <= train batch size {}.".format( + config["sgd_minibatch_size"], config["train_batch_size"])) + if (config["batch_mode"] == "truncate_episodes" and not config["use_gae"]): + raise ValueError( + "Episode truncation is not supported without a value " + "function. Consider setting batch_mode=complete_episodes.") + if (config["multiagent"]["policy_graphs"] + and not config["simple_optimizer"]): + logger.info( + "In multi-agent mode, policies will be optimized sequentially " + "by the multi-GPU optimizer. Consider setting " + "simple_optimizer=True if this doesn't work for you.") + if not config["vf_share_layers"]: + logger.warning( + "FYI: By default, the value function will not share layers " + "with the policy model ('vf_share_layers': False).") + + +PPOTrainer = build_trainer( + "PPO", + default_config=DEFAULT_CONFIG, + default_policy_graph=PPOPolicyGraph, + make_policy_optimizer=_make_optimizer, + validate_config=_validate_config, + after_optimizer_step=_update_kl, + before_train_step=_warn_about_obs_filter, + after_train_result=_warn_about_bad_reward_scales) From 9d9fd97211e9d1d4822c6cab2bf904f251146c8f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:44:12 -0700 Subject: [PATCH 13/39] fix naming --- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 2 +- .../evaluation/tf_policy_graph_template.py | 30 ++++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index b322dba122b0..774fd7c583a3 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -279,5 +279,5 @@ def _setup_mixins(policy, obs_space, action_space, config): extra_action_fetches_fn=_build_ppo_action_fetches, postprocess_fn=_postprocess_ppo_gae, gradients_fn=_build_ppo_gradients, - pre_loss_init_fn=_setup_mixins, + before_loss_init=_setup_mixins, mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 549b508e791e..338ab587ce4a 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -17,9 +17,9 @@ def build_tf_graph(name, postprocess_fn=None, optimizer_fn=None, gradients_fn=None, - pre_init_fn=None, - pre_loss_init_fn=None, - post_init_fn=None, + before_init=None, + before_loss_init=None, + after_init=None, mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. @@ -40,11 +40,11 @@ def build_tf_graph(name, gradients_fn (func): optional function that returns a list of gradients given a tf optimizer and loss tensor. 
If not specified, this defaults to optimizer.compute_gradients(loss) - pre_init_fn (func): optional function to run at the beginning of + before_init (func): optional function to run at the beginning of __init__ that takes the same arguments as __init__ - pre_loss_init_fn (func): optional function to run prior to loss + before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ - post_init_fn (func): optional function to run at the end of __init__ + after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ mixins (list): list of any class mixins for the returned policy class @@ -63,12 +63,13 @@ def __init__(self, existing_inputs=None): config = dict(get_default_config(), **config) - if pre_init_fn: - pre_init_fn(self, obs_space, action_space, config) + if before_init: + before_init(self, obs_space, action_space, config) - def before_loss_init(policy, obs_space, action_space, config): - if pre_loss_init_fn: - pre_loss_init_fn(policy, obs_space, action_space, config) + def before_loss_init_wrapper(policy, obs_space, action_space, + config): + if before_loss_init: + before_loss_init(policy, obs_space, action_space, config) if extra_action_fetches_fn is None: self._extra_action_fetches = {} else: @@ -81,10 +82,11 @@ def before_loss_init(policy, obs_space, action_space, config): config, loss_fn, stats_fn, - pre_loss_init_fn=before_loss_init, + before_loss_init=before_loss_init_wrapper, existing_inputs=existing_inputs) - if post_init_fn: - post_init_fn(self, obs_space, action_space, config) + + if after_init: + after_init(self, obs_space, action_space, config) @override(PolicyGraph) def postprocess_trajectory(self, From 3093391674ecdc5f53c3ade5a8afa04a2eb3a4a4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:45:32 -0700 Subject: [PATCH 14/39] order --- python/ray/rllib/evaluation/tf_policy_graph_template.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 338ab587ce4a..00c3a25ab753 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -46,7 +46,9 @@ def build_tf_graph(name, init that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ - mixins (list): list of any class mixins for the returned policy class + mixins (list): list of any class mixins for the returned policy class. + These mixins will be applied in order and will have higher + precedence than the DynamicTFPolicyGraph class Returns: a DynamicTFPolicyGraph instance that uses the specified args From e670abd218b7b1c99bd3018168089a9a154a4aa0 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:51:13 -0700 Subject: [PATCH 15/39] docs --- .../evaluation/dynamic_tf_policy_graph.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 49da52a25636..5b7daa0a3eae 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -28,7 +28,8 @@ class DynamicTFPolicyGraph(TFPolicyGraph): * Phase 1: the model is created and model variables are initialized. 
* Phase 2: a fake batch of data is created, sent to the trajectory postprocessor, and then used to create placeholders for the loss - function. The loss function is initialized with these placeholders. + function. The loss and stats functions are initialized with these + placeholders. """ def __init__(self, @@ -38,10 +39,33 @@ def __init__(self, loss_fn, stats_fn=None, autosetup_model=True, - pre_loss_init_fn=None, + before_loss_init=None, action_sampler=None, action_prob=None, existing_inputs=None): + """Initialize a dynamic TF policy graph. + + Arguments: + observation_space (gym.Space): Observation space of the policy. + action_space (gym.Space): Action space of the policy. + config (dict): Policy-specific configuration data. + loss_fn (func): function that returns a loss tensor the policy + graph, and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and batch input tensors + autosetup_model (bool): whether to create a model and action dist + using catalog defaults. These will be available as self.model + and self.action_dist + before_loss_init (func): optional function to run prior to loss + init that takes the same arguments as __init__ + action_sampler (Tensor): if autosetup_model is False, this must be + specified to define how the policy computes actions + action_prob (Tensor): if autosetup_model is False, this can be + specified to define the chosen action probability + existing_inputs (OrderedDict): when copying a policy graph, this + specifies an existing dict of placeholders to use instead of + defining new ones + """ self.config = config self.autosetup_model = autosetup_model self._loss_fn = loss_fn @@ -123,7 +147,7 @@ def __init__(self, max_seq_len=config["model"]["max_seq_len"]) # Phase 2 init - pre_loss_init_fn(self, obs_space, action_space, config) + before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss() From 0aba2f38f652c23e7144b5868f5dd0c8beaa8216 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 14:58:21 -0700 Subject: [PATCH 16/39] set class name correctly --- python/ray/rllib/agents/trainer_template.py | 7 ++++++- python/ray/rllib/evaluation/tf_policy_graph_template.py | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 6c99eb6d4acc..85ed16b56044 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -41,6 +41,10 @@ def build_trainer(name, a Trainer instance that uses the specified args. 
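For orientation, this is roughly how build_trainer is meant to be called once this patch lands, mirroring the PPO conversion earlier in the series. The MyPG name and the reuse of the PG graph and config are hypothetical, and the sketch assumes the hooks not passed here (validate_config, after_optimizer_step, before_train_step, after_train_result) are optional, which this hunk does not show:

from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg import DEFAULT_CONFIG
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer


def _make_optimizer(local_evaluator, remote_evaluators, config):
    # Same hook signature as in the PPO conversion above.
    return SyncSamplesOptimizer(
        local_evaluator,
        remote_evaluators,
        train_batch_size=config["train_batch_size"])


MyPGTrainer = build_trainer(
    name="MyPG",  # no "Trainer" suffix; the template appends it below
    default_config=DEFAULT_CONFIG,
    default_policy_graph=PGPolicyGraph,
    make_policy_optimizer=_make_optimizer)

assert MyPGTrainer.__name__ == "MyPGTrainer"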
""" + if name.endswith("Trainer"): + raise ValueError("Algorithm name should not include *Trainer suffix", + name) + class trainer_cls(Trainer): _name = name _default_config = default_config @@ -78,5 +82,6 @@ def _train(self): after_train_result(self, res) return res - trainer_cls.__name__ = name + trainer_cls.__name__ = name + "Trainer" + trainer_cls.__qualname__ = name + "Trainer" return trainer_cls diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 00c3a25ab753..16aac69bd8ba 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -57,6 +57,9 @@ def build_tf_graph(name, if mixins is None: mixins = [] + if not name.endswith("PolicyGraph"): + raise ValueError("Name should match *PolicyGraph", name) + class graph_cls(*mixins, DynamicTFPolicyGraph): def __init__(self, obs_space, @@ -121,4 +124,5 @@ def extra_compute_action_fetches(self): **self._extra_action_fetches) graph_cls.__name__ = name + graph_cls.__qualname__ = name return graph_cls From 298fcd041174a9bd24c511fc4b516f6fd760d23d Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 15:35:22 -0700 Subject: [PATCH 17/39] add torch builder --- python/ray/rllib/agents/pg/pg_policy_graph.py | 4 +- .../rllib/agents/pg/torch_pg_policy_graph.py | 92 +++++---------- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 4 +- .../evaluation/tf_policy_graph_template.py | 24 ++-- .../rllib/evaluation/torch_policy_graph.py | 36 +++--- .../evaluation/torch_policy_graph_template.py | 108 ++++++++++++++++++ python/ray/rllib/utils/tracking_dict.py | 12 +- 7 files changed, 187 insertions(+), 93 deletions(-) create mode 100644 python/ray/rllib/evaluation/torch_policy_graph_template.py diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 4858d4a5e87f..510662be6f00 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf @@ -32,7 +32,7 @@ def _make_optimizer(policy): return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) -PGPolicyGraph = build_tf_graph( +PGPolicyGraph = build_tf_policy( name="PGPolicyGraph", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=_postprocess_advantages, diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 746ef1bca42f..063cc0610c3e 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -3,81 +3,47 @@ from __future__ import print_function import torch -from torch import nn import ray -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph -from ray.rllib.utils.annotations import override +from ray.rllib.evaluation.torch_policy_graph_template import 
build_torch_policy -class PGLoss(nn.Module): - def __init__(self, dist_class): - nn.Module.__init__(self) - self.dist_class = dist_class +def _pg_torch_loss(policy, batch_tensors): + logits, _, values, _ = policy.model({ + SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] + }, []) + action_dist = policy.dist_class(logits) + log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) + # save the error in the policy object + policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( + log_probs.reshape(-1)) + return policy.pi_err - def forward(self, policy_model, observations, actions, advantages): - logits, _, values, _ = policy_model({ - SampleBatch.CUR_OBS: observations - }, []) - dist = self.dist_class(logits) - log_probs = dist.logp(actions) - self.pi_err = -advantages.dot(log_probs.reshape(-1)) - return self.pi_err +def _postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): + return compute_advantages( + sample_batch, 0.0, policy.config["gamma"], use_gae=False) -class PGPostprocessing(object): - """Adds the value func output and advantages field to the trajectory.""" - @override(TorchPolicyGraph) - def extra_action_out(self, model_out): - return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} +def _pg_loss_stats(policy, batch_tensors): + # the error is recorded when computing the loss + return {"policy_loss": policy.pi_err.item()} - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - return compute_advantages( - sample_batch, 0.0, self.config["gamma"], use_gae=False) +def _make_optimizer(policy): + return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) -class PGTorchPolicyGraph(PGPostprocessing, TorchPolicyGraph): - def __init__(self, obs_space, action_space, config): - config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) - self.config = config - dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - model = ModelCatalog.get_torch_model(obs_space, self.logit_dim, - self.config["model"]) - loss = PGLoss(dist_class) - TorchPolicyGraph.__init__( - self, - obs_space, - action_space, - model, - loss, - loss_inputs=[ - SampleBatch.CUR_OBS, SampleBatch.ACTIONS, - Postprocessing.ADVANTAGES - ], - action_distribution_cls=dist_class) - - @override(TorchPolicyGraph) - def optimizer(self): - return torch.optim.Adam(self._model.parameters(), lr=self.config["lr"]) - - @override(TorchPolicyGraph) - def extra_grad_info(self): - return {"policy_loss": self._loss.pi_err.item()} - - def _value(self, obs): - with self.lock: - obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device) - _, _, vf, _ = self.model({"obs": obs}, []) - return vf.detach().cpu().numpy().squeeze() +PGTorchPolicyGraph = build_torch_policy( + name="PGTorchPolicyGraph", + get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, + loss_fn=_pg_torch_loss, + stats_fn=_pg_loss_stats, + postprocess_fn=_postprocess_advantages, + optimizer_fn=_make_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 774fd7c583a3..6d5e7c971919 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -9,7 +9,7 @@ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule -from 
ray.rllib.evaluation.tf_policy_graph_template import build_tf_graph +from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf @@ -271,7 +271,7 @@ def _setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) -PPOPolicyGraph = build_tf_graph( +PPOPolicyGraph = build_tf_policy( name="PPOPolicyGraph", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=_build_ppo_loss, diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 16aac69bd8ba..2b69c1a49bd7 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -9,18 +9,18 @@ @DeveloperAPI -def build_tf_graph(name, - get_default_config, - loss_fn, - stats_fn=None, - extra_action_fetches_fn=None, - postprocess_fn=None, - optimizer_fn=None, - gradients_fn=None, - before_init=None, - before_loss_init=None, - after_init=None, - mixins=None): +def build_tf_policy(name, + get_default_config, + loss_fn, + stats_fn=None, + extra_action_fetches_fn=None, + postprocess_fn=None, + optimizer_fn=None, + gradients_fn=None, + before_init=None, + before_loss_init=None, + after_init=None, + mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. Arguments: diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index fb5c879a1ab8..4a4e79a15242 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -15,6 +15,7 @@ from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.utils.annotations import override +from ray.rllib.utils.tracking_dict import UsageTrackingDict class TorchPolicyGraph(PolicyGraph): @@ -87,30 +88,26 @@ def compute_actions(self, @override(PolicyGraph) def learn_on_batch(self, postprocessed_batch): + batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + with self.lock: - loss_in = [] - for key in self._loss_inputs: - loss_in.append( - torch.from_numpy(postprocessed_batch[key]).to(self.device)) - loss_out = self._loss(self._model, *loss_in) + loss_out = self._compute_loss(batch_tensors) self._optimizer.zero_grad() loss_out.backward() grad_process_info = self.extra_grad_process() self._optimizer.step() - grad_info = self.extra_grad_info() + grad_info = self.extra_grad_info(batch_tensors) grad_info.update(grad_process_info) return {LEARNER_STATS_KEY: grad_info} @override(PolicyGraph) def compute_gradients(self, postprocessed_batch): + batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + with self.lock: - loss_in = [] - for key in self._loss_inputs: - loss_in.append( - torch.from_numpy(postprocessed_batch[key]).to(self.device)) - loss_out = self._loss(self._model, *loss_in) + loss_out = self._compute_loss(batch_tensors) self._optimizer.zero_grad() loss_out.backward() @@ -125,7 +122,7 @@ def compute_gradients(self, postprocessed_batch): else: grads.append(None) - grad_info = self.extra_grad_info() + grad_info = self.extra_grad_info(batch_tensors) grad_info.update(grad_process_info) return grads, {LEARNER_STATS_KEY: grad_info} @@ -163,7 +160,7 @@ def extra_action_out(self, model_out): model_out 
(list): Outputs of the policy model module.""" return {} - def extra_grad_info(self): + def extra_grad_info(self, batch_tensors): """Return dict of extra grad info.""" return {} @@ -171,3 +168,16 @@ def extra_grad_info(self): def optimizer(self): """Custom PyTorch optimizer to use.""" return torch.optim.Adam(self._model.parameters()) + + def _compute_loss(self, batch_tensors): + loss_in = [] + for key in self._loss_inputs: + loss_in.append(batch_tensors[key]) + loss_out = self._loss(self._model, *loss_in) + return loss_out + + def _lazy_tensor_dict(self, postprocessed_batch): + batch_tensors = UsageTrackingDict(postprocessed_batch) + batch_tensors.set_get_interceptor( + lambda arr: torch.from_numpy(arr).to(self.device)) + return batch_tensors diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py new file mode 100644 index 000000000000..66fb51032797 --- /dev/null +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -0,0 +1,108 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.evaluation.policy_graph import PolicyGraph +from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.annotations import override, DeveloperAPI + + +@DeveloperAPI +def build_torch_policy(name, + get_default_config, + loss_fn, + stats_fn=None, + postprocess_fn=None, + optimizer_fn=None, + before_init=None, + after_init=None, + mixins=None): + """Helper function for creating a dynamic tf policy graph at runtime. + + Arguments: + name (str): name of the graph (e.g., "PPOPolicyGraph") + get_default_config (func): function that returns the default config + to merge with any overrides + loss_fn (func): function that returns a loss tensor the policy graph, + and dict of experience tensor placeholders + stats_fn (func): optional function that returns a dict of + values given the policy graph and batch input tensors + postprocess_fn (func): optional experience postprocessing function + that takes the same args as PolicyGraph.postprocess_trajectory() + optimizer_fn (func): optional function that returns a torch optimizer + given the policy graph object + before_init (func): optional function to run at the beginning of + __init__ that takes the same arguments as __init__ + after_init (func): optional function to run at the end of __init__ + that takes the same arguments as __init__ + mixins (list): list of any class mixins for the returned policy class. 
+ These mixins will be applied in order and will have higher + precedence than the TorchPolicyGraph class + + Returns: + a TorchPolicyGraph instance that uses the specified args + """ + + if mixins is None: + mixins = [] + + if not name.endswith("TorchPolicyGraph"): + raise ValueError("Name should match *TorchPolicyGraph", name) + + class graph_cls(*mixins, TorchPolicyGraph): + def __init__(self, obs_space, action_space, config): + config = dict(get_default_config(), **config) + self.config = config + + if before_init: + before_init(self, obs_space, action_space, config) + + self.dist_class, self.logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], torch=True) + self.model = ModelCatalog.get_torch_model( + obs_space, self.logit_dim, self.config["model"]) + + TorchPolicyGraph.__init__( + self, + obs_space, + action_space, + self.model, + None, # loss fn is None since we override _compute_loss + [], # TODO(ekl) clean up torch loss handling + self.dist_class) + + if after_init: + after_init(self, obs_space, action_space, config) + + @override(PolicyGraph) + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not postprocess_fn: + return sample_batch + return postprocess_fn(self, sample_batch, other_agent_batches, + episode) + + @override(TorchPolicyGraph) + def optimizer(self): + if optimizer_fn: + return optimizer_fn(self) + else: + return TorchPolicyGraph.optimizer(self) + + @override(TorchPolicyGraph) + def extra_grad_info(self, batch_tensors): + if stats_fn: + return stats_fn(self, batch_tensors) + else: + return TorchPolicyGraph.extra_grad_info(self, batch_tensors) + + @override(TorchPolicyGraph) + def _compute_loss(self, batch_tensors): + return loss_fn(self, batch_tensors) + + graph_cls.__name__ = name + graph_cls.__qualname__ = name + return graph_cls diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py index d0a04c4d059e..d43f6e87b0f7 100644 --- a/python/ray/rllib/utils/tracking_dict.py +++ b/python/ray/rllib/utils/tracking_dict.py @@ -6,6 +6,9 @@ class UsageTrackingDict(dict): """Dict that tracks which keys have been accessed. + It can also intercept gets and allow an arbitrary callback to be applied + (i.e., to lazily convert numpy arrays to Tensors). + We make the simplifying assumption only __getitem__ is used to access values. 
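A short usage sketch of the new interceptor hook, matching how _lazy_tensor_dict in torch_policy_graph.py wires it up (assumes this ray commit plus numpy and torch are importable):

import numpy as np
import torch

from ray.rllib.utils.tracking_dict import UsageTrackingDict

batch = UsageTrackingDict({"obs": np.zeros((2, 4), dtype=np.float32)})
batch.set_get_interceptor(lambda arr: torch.from_numpy(arr))

obs = batch["obs"]            # converted to a torch.Tensor lazily, on access
assert isinstance(obs, torch.Tensor)
assert batch.accessed_keys == {"obs"}  # key tracking still works as before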
""" @@ -13,7 +16,14 @@ class UsageTrackingDict(dict): def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) self.accessed_keys = set() + self.get_interceptor = None + + def set_get_interceptor(self, fn): + self.get_interceptor = fn def __getitem__(self, key): self.accessed_keys.add(key) - return dict.__getitem__(self, key) + value = dict.__getitem__(self, key) + if self.get_interceptor: + value = self.get_interceptor(value) + return value From 5269fe0f2a463e11d5edce017fb583def7f4249a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 16:12:00 -0700 Subject: [PATCH 18/39] add custom model support in builder --- .../agents/a3c/a3c_torch_policy_graph.py | 173 ++++++++---------- python/ray/rllib/agents/pg/pg_policy_graph.py | 18 +- .../rllib/agents/pg/torch_pg_policy_graph.py | 22 +-- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 30 +-- .../evaluation/dynamic_tf_policy_graph.py | 60 +++--- .../evaluation/tf_policy_graph_template.py | 5 + .../rllib/evaluation/torch_policy_graph.py | 23 +-- .../evaluation/torch_policy_graph_template.py | 51 ++++-- 8 files changed, 181 insertions(+), 201 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index d35aabe0d667..13704f74e938 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -7,109 +7,84 @@ from torch import nn import ray -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph -from ray.rllib.utils.annotations import override - - -class A3CLoss(nn.Module): - def __init__(self, dist_class, vf_loss_coeff=0.5, entropy_coeff=0.01): - nn.Module.__init__(self) - self.dist_class = dist_class - self.vf_loss_coeff = vf_loss_coeff - self.entropy_coeff = entropy_coeff - - def forward(self, policy_model, observations, actions, advantages, - value_targets): - logits, _, values, _ = policy_model({ - SampleBatch.CUR_OBS: observations - }, []) - dist = self.dist_class(logits) - log_probs = dist.logp(actions) - self.entropy = dist.entropy().mean() - self.pi_err = -advantages.dot(log_probs.reshape(-1)) - self.value_err = F.mse_loss(values.reshape(-1), value_targets) - overall_err = sum([ - self.pi_err, - self.vf_loss_coeff * self.value_err, - -self.entropy_coeff * self.entropy, - ]) - - return overall_err - - -class A3CPostprocessing(object): - """Adds the VF preds and advantages fields to the trajectory.""" - - @override(TorchPolicyGraph) - def extra_action_out(self, model_out): - return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - completed = sample_batch[SampleBatch.DONES][-1] - if completed: - last_r = 0.0 - else: - last_r = self._value(sample_batch[SampleBatch.NEXT_OBS][-1]) - return compute_advantages(sample_batch, last_r, self.config["gamma"], - self.config["lambda"]) - - -class A3CTorchPolicyGraph(A3CPostprocessing, TorchPolicyGraph): - """A simple, non-recurrent PyTorch policy example.""" - - def __init__(self, obs_space, action_space, config): - config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) - self.config = config - dist_class, self.logit_dim = 
ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - model = ModelCatalog.get_torch_model(obs_space, self.logit_dim, - self.config["model"]) - loss = A3CLoss(dist_class, self.config["vf_loss_coeff"], - self.config["entropy_coeff"]) - TorchPolicyGraph.__init__( - self, - obs_space, - action_space, - model, - loss, - loss_inputs=[ - SampleBatch.CUR_OBS, SampleBatch.ACTIONS, - Postprocessing.ADVANTAGES, Postprocessing.VALUE_TARGETS - ], - action_distribution_cls=dist_class) - - @override(TorchPolicyGraph) - def optimizer(self): - return torch.optim.Adam(self._model.parameters(), lr=self.config["lr"]) - - @override(TorchPolicyGraph) - def extra_grad_process(self): - info = {} - if self.config["grad_clip"]: - total_norm = nn.utils.clip_grad_norm_(self._model.parameters(), - self.config["grad_clip"]) - info["grad_gnorm"] = total_norm - return info - - @override(TorchPolicyGraph) - def extra_grad_info(self): - return { - "policy_entropy": self._loss.entropy.item(), - "policy_loss": self._loss.pi_err.item(), - "vf_loss": self._loss.value_err.item() - } - +from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy + + +def a3c_torch_loss(policy, batch_tensors): + logits, _, values, _ = policy.model({ + SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] + }, []) + dist = policy.dist_class(logits) + log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS]) + policy.entropy = dist.entropy().mean() + policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( + log_probs.reshape(-1)) + policy.value_err = F.mse_loss( + values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS]) + overall_err = sum([ + policy.pi_err, + policy.config["vf_loss_coeff"] * policy.value_err, + -policy.config["entropy_coeff"] * policy.entropy, + ]) + return overall_err + + +def a3c_torch_stats(policy, batch_tensors): + return { + "policy_entropy": policy.entropy.item(), + "policy_loss": policy.pi_err.item(), + "vf_loss": policy.value_err.item(), + } + + +def a3c_extra_action_out(policy, model_out): + return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} + + +def a3c_extra_grad_process(policy): + info = {} + if policy.config["grad_clip"]: + total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(), + policy.config["grad_clip"]) + info["grad_gnorm"] = total_norm + return info + + +def optimizer(policy): + return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) + + +def postprocess_torch_a3c(policy, + sample_batch, + other_agent_batches=None, + episode=None): + completed = sample_batch[SampleBatch.DONES][-1] + if completed: + last_r = 0.0 + else: + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) + return compute_advantages(sample_batch, last_r, policy.config["gamma"], + policy.config["lambda"]) + + +class ValueNetworkMixin(object): def _value(self, obs): with self.lock: obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device) - _, _, vf, _ = self._model({"obs": obs}, []) + _, _, vf, _ = self.model({"obs": obs}, []) return vf.detach().cpu().numpy().squeeze() + + +A3CTorchPolicyGraph = build_torch_policy( + name="A3CTorchPolicyGraph", + get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, + loss_fn=a3c_torch_loss, + stats_fn=a3c_torch_stats, + postprocess_fn=postprocess_torch_a3c, + extra_action_out_fn=a3c_extra_action_out, + extra_grad_process_fn=a3c_extra_grad_process, + optimizer_fn=optimizer, + mixins=[ValueNetworkMixin]) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py 
b/python/ray/rllib/agents/pg/pg_policy_graph.py index 510662be6f00..4bdcb3e1fed5 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -13,28 +13,28 @@ # The basic policy gradients loss -def _policy_gradient_loss(policy, batch_tensors): +def policy_gradient_loss(policy, batch_tensors): actions = batch_tensors[SampleBatch.ACTIONS] advantages = batch_tensors[Postprocessing.ADVANTAGES] return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) # This adds the "advantages" column to the sample batch. -def _postprocess_advantages(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): return compute_advantages( sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def _make_optimizer(policy): +def make_optimizer(policy): return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) PGPolicyGraph = build_tf_policy( name="PGPolicyGraph", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, - postprocess_fn=_postprocess_advantages, - loss_fn=_policy_gradient_loss, - optimizer_fn=_make_optimizer) + postprocess_fn=postprocess_advantages, + loss_fn=policy_gradient_loss, + optimizer_fn=make_optimizer) diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 063cc0610c3e..040dcadc9742 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -11,7 +11,7 @@ from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy -def _pg_torch_loss(policy, batch_tensors): +def pg_torch_loss(policy, batch_tensors): logits, _, values, _ = policy.model({ SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }, []) @@ -23,27 +23,27 @@ def _pg_torch_loss(policy, batch_tensors): return policy.pi_err -def _postprocess_advantages(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def postprocess_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): return compute_advantages( sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def _pg_loss_stats(policy, batch_tensors): +def pg_loss_stats(policy, batch_tensors): # the error is recorded when computing the loss return {"policy_loss": policy.pi_err.item()} -def _make_optimizer(policy): +def make_optimizer(policy): return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) PGTorchPolicyGraph = build_torch_policy( name="PGTorchPolicyGraph", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, - loss_fn=_pg_torch_loss, - stats_fn=_pg_loss_stats, - postprocess_fn=_postprocess_advantages, - optimizer_fn=_make_optimizer) + loss_fn=pg_torch_loss, + stats_fn=pg_loss_stats, + postprocess_fn=postprocess_advantages, + optimizer_fn=make_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 6d5e7c971919..6e05d7069f90 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -104,7 +104,7 @@ def reduce_mean_valid(t): self.loss = loss -def _build_ppo_loss(policy, batch_tensors): +def build_ppo_loss(policy, batch_tensors): if policy.model.state_in: max_seq_len = tf.reduce_max(policy.model.seq_lens) mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) @@ -133,7 +133,7 @@ def _build_ppo_loss(policy, batch_tensors): return 
policy.loss_obj.loss -def _build_ppo_stats(policy, batch_tensors): +def build_ppo_stats(policy, batch_tensors): policy.explained_variance = explained_variance( batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) @@ -151,7 +151,7 @@ def _build_ppo_stats(policy, batch_tensors): return stats_fetches -def _build_ppo_action_fetches(policy): +def build_ppo_action_fetches(policy): """Adds value function and logits outputs to experience batches.""" return { SampleBatch.VF_PREDS: policy.value_function, @@ -159,10 +159,10 @@ def _build_ppo_action_fetches(policy): } -def _postprocess_ppo_gae(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def postprocess_ppo_gae(policy, + sample_batch, + other_agent_batches=None, + episode=None): """Adds the policy logits, VF preds, and advantages to the trajectory.""" completed = sample_batch["dones"][-1] @@ -185,7 +185,7 @@ def _postprocess_ppo_gae(policy, return batch -def _build_ppo_gradients(policy, optimizer, loss): +def build_ppo_gradients(policy, optimizer, loss): if policy.config["grad_clip"] is not None: policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) @@ -265,7 +265,7 @@ def _value(self, ob, prev_action, prev_reward, *args): return vf[0] -def _setup_mixins(policy, obs_space, action_space, config): +def setup_mixins(policy, obs_space, action_space, config): ValueNetworkMixin.__init__(policy, obs_space, action_space, config) KLCoeffMixin.__init__(policy, config) LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) @@ -274,10 +274,10 @@ def _setup_mixins(policy, obs_space, action_space, config): PPOPolicyGraph = build_tf_policy( name="PPOPolicyGraph", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, - loss_fn=_build_ppo_loss, - stats_fn=_build_ppo_stats, - extra_action_fetches_fn=_build_ppo_action_fetches, - postprocess_fn=_postprocess_ppo_gae, - gradients_fn=_build_ppo_gradients, - before_loss_init=_setup_mixins, + loss_fn=build_ppo_loss, + stats_fn=build_ppo_stats, + extra_action_fetches_fn=build_ppo_action_fetches, + postprocess_fn=postprocess_ppo_gae, + gradients_fn=build_ppo_gradients, + before_loss_init=setup_mixins, mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 5b7daa0a3eae..d61ecc80ef5f 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -10,7 +10,7 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf from ray.rllib.utils.debug import log_once, summarize from ray.rllib.utils.tracking_dict import UsageTrackingDict @@ -20,7 +20,6 @@ logger = logging.getLogger(__name__) -@DeveloperAPI class DynamicTFPolicyGraph(TFPolicyGraph): """A TFPolicyGraph that auto-defines placeholders dynamically at runtime. @@ -38,10 +37,8 @@ def __init__(self, config, loss_fn, stats_fn=None, - autosetup_model=True, before_loss_init=None, - action_sampler=None, - action_prob=None, + make_action_sampler=None, existing_inputs=None): """Initialize a dynamic TF policy graph. 
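Note on the hunk above: it collapses the autosetup_model / action_sampler / action_prob arguments into a single make_action_sampler callback, whose (policy, input_dict, obs_space, action_space, config) signature is documented in the next hunk. A minimal sketch of such a callback for a Discrete action space follows; it is illustrative only and not part of this patch, and the function and variable names are hypothetical:

from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def make_uniform_action_sampler(policy, input_dict, obs_space, action_space,
                                config):
    # Returns the (action_sampler, action_prob) tensor pair expected by
    # DynamicTFPolicyGraph: here, actions drawn uniformly at random from a
    # Discrete space, each with probability 1 / n.
    batch_size = tf.shape(input_dict["obs"])[0]
    action_sampler = tf.random_uniform(
        [batch_size], minval=0, maxval=action_space.n, dtype=tf.int64)
    action_prob = tf.ones([batch_size]) / action_space.n
    return action_sampler, action_prob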
@@ -53,21 +50,17 @@ def __init__(self, graph, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors - autosetup_model (bool): whether to create a model and action dist - using catalog defaults. These will be available as self.model - and self.action_dist before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ - action_sampler (Tensor): if autosetup_model is False, this must be - specified to define how the policy computes actions - action_prob (Tensor): if autosetup_model is False, this can be - specified to define the chosen action probability + make_action_sampler (func): optional function that returns a + tuple of action and action prob tensors. The function takes + (policy, input_dict, obs_space, action_space, config) as its + arguments existing_inputs (OrderedDict): when copying a policy graph, this specifies an existing dict of placeholders to use instead of defining new ones """ self.config = config - self.autosetup_model = autosetup_model self._loss_fn = loss_fn self._stats_fn = stats_fn @@ -85,9 +78,23 @@ def __init__(self, prev_rewards = tf.placeholder( tf.float32, [None], name="prev_reward") + input_dict = { + "obs": obs, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + "is_training": self._get_is_training_placeholder(), + } + # Create the model network and action outputs - if autosetup_model: - dist_class, self.logit_dim = ModelCatalog.get_action_dist( + if make_action_sampler: + assert not existing_inputs, \ + "Cloning not supported with custom action sampler" + self.model = None + self.action_dist = None + action_sampler, action_prob = make_action_sampler( + self, input_dict, obs_space, action_space, config) + else: + dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if existing_inputs: existing_state_in = [ @@ -102,29 +109,16 @@ def __init__(self, existing_state_in = [] existing_seq_lens = None self.model = ModelCatalog.get_model( - { - "obs": obs, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, + input_dict, obs_space, action_space, - self.logit_dim, + logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) self.action_dist = dist_class(self.model.outputs) action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() - else: - self.logit_dim = None - self.model = None - self.action_dist = None - if not action_sampler: - raise ValueError( - "When autosetup_model=False, action_sampler must be " - "passed in to the constructor.") # Phase 1 init sess = tf.get_default_session() @@ -139,11 +133,11 @@ def __init__(self, loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, + state_inputs=self.model and self.model.state_in, + state_outputs=self.model and self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, - seq_lens=self.model.seq_lens, + seq_lens=self.model and self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) # Phase 2 init diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 2b69c1a49bd7..4c708de35e18 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ 
b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -20,6 +20,7 @@ def build_tf_policy(name, before_init=None, before_loss_init=None, after_init=None, + make_action_sampler=None, mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. @@ -46,6 +47,10 @@ def build_tf_policy(name, init that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ + make_action_sampler (func): optional function that returns a + tuple of action and action prob tensors. The function takes + (policy, input_dict, obs_space, action_space, config) as its + arguments mixins (list): list of any class mixins for the returned policy class. These mixins will be applied in order and will have higher precedence than the DynamicTFPolicyGraph class diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index 4a4e79a15242..3a1464606612 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -31,7 +31,7 @@ class TorchPolicyGraph(PolicyGraph): """ def __init__(self, observation_space, action_space, model, loss, - loss_inputs, action_distribution_cls): + action_distribution_cls): """Build a policy graph from policy and loss torch modules. Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES @@ -43,13 +43,8 @@ def __init__(self, observation_space, action_space, model, loss, model (nn.Module): PyTorch policy module. Given observations as input, this module must return a list of outputs where the first item is action logits, and the rest can be any value. - loss (nn.Module): Loss defined as a PyTorch module. The inputs for - this module are defined by the `loss_inputs` param. This module - returns a single scalar loss. Note that this module should - internally be using the model module. - loss_inputs (list): List of SampleBatch columns that will be - passed to the loss module's forward() function when computing - the loss. For example, ["obs", "action", "advantages"]. + loss (func): Function that takes (policy_graph, batch_tensors) + and returns a single scalar loss. action_distribution_cls (ActionDistribution): Class for action distribution. 
""" @@ -61,7 +56,6 @@ def __init__(self, observation_space, action_space, model, loss, else torch.device("cpu")) self._model = model.to(self.device) self._loss = loss - self._loss_inputs = loss_inputs self._optimizer = self.optimizer() self._action_dist_cls = action_distribution_cls @@ -91,7 +85,7 @@ def learn_on_batch(self, postprocessed_batch): batch_tensors = self._lazy_tensor_dict(postprocessed_batch) with self.lock: - loss_out = self._compute_loss(batch_tensors) + loss_out = self._loss(self, batch_tensors) self._optimizer.zero_grad() loss_out.backward() @@ -107,7 +101,7 @@ def compute_gradients(self, postprocessed_batch): batch_tensors = self._lazy_tensor_dict(postprocessed_batch) with self.lock: - loss_out = self._compute_loss(batch_tensors) + loss_out = self._loss(self, batch_tensors) self._optimizer.zero_grad() loss_out.backward() @@ -169,13 +163,6 @@ def optimizer(self): """Custom PyTorch optimizer to use.""" return torch.optim.Adam(self._model.parameters()) - def _compute_loss(self, batch_tensors): - loss_in = [] - for key in self._loss_inputs: - loss_in.append(batch_tensors[key]) - loss_out = self._loss(self._model, *loss_in) - return loss_out - def _lazy_tensor_dict(self, postprocessed_batch): batch_tensors = UsageTrackingDict(postprocessed_batch) batch_tensors.set_get_interceptor( diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py index 66fb51032797..50685ce5bd5f 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph_template.py +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -14,9 +14,12 @@ def build_torch_policy(name, loss_fn, stats_fn=None, postprocess_fn=None, + extra_action_out_fn=None, + extra_grad_process_fn=None, optimizer_fn=None, before_init=None, after_init=None, + make_model_and_action_dist=None, mixins=None): """Helper function for creating a dynamic tf policy graph at runtime. @@ -30,12 +33,20 @@ def build_torch_policy(name, values given the policy graph and batch input tensors postprocess_fn (func): optional experience postprocessing function that takes the same args as PolicyGraph.postprocess_trajectory() + extra_action_out_fn (func): optional function that returns + a dict of extra values to include in experiences + extra_grad_process_fn (func): optional function that is called after + gradients are computed and returns processing info optimizer_fn (func): optional function that returns a torch optimizer given the policy graph object before_init (func): optional function to run at the beginning of __init__ that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ that takes the same arguments as __init__ + make_model_and_action_dist (func): optional func that takes the same + arguments as __init__ and returns a tuple of model instance and + torch action distribution class. If not specified, the default + model and action dist from the catalog will be used mixins (list): list of any class mixins for the returned policy class. 
These mixins will be applied in order and will have higher precedence than the TorchPolicyGraph class @@ -58,19 +69,17 @@ def __init__(self, obs_space, action_space, config): if before_init: before_init(self, obs_space, action_space, config) - self.dist_class, self.logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"], torch=True) - self.model = ModelCatalog.get_torch_model( - obs_space, self.logit_dim, self.config["model"]) + if make_model_and_action_dist: + self.model, self.dist_class = make_model_and_action_dist( + self, obs_space, action_space, config) + else: + self.dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"], torch=True) + self.model = ModelCatalog.get_torch_model( + obs_space, logit_dim, self.config["model"]) - TorchPolicyGraph.__init__( - self, - obs_space, - action_space, - self.model, - None, # loss fn is None since we override _compute_loss - [], # TODO(ekl) clean up torch loss handling - self.dist_class) + TorchPolicyGraph.__init__(self, obs_space, action_space, + self.model, loss_fn, self.dist_class) if after_init: after_init(self, obs_space, action_space, config) @@ -85,6 +94,20 @@ def postprocess_trajectory(self, return postprocess_fn(self, sample_batch, other_agent_batches, episode) + @override(TorchPolicyGraph) + def extra_grad_process(self): + if extra_grad_process_fn: + return extra_grad_process_fn(self) + else: + return TorchPolicyGraph.extra_grad_process(self) + + @override(TorchPolicyGraph) + def extra_action_out(self, model_out): + if extra_action_out_fn: + return extra_action_out_fn(self, model_out) + else: + return TorchPolicyGraph.extra_action_out_fn(self, model_out) + @override(TorchPolicyGraph) def optimizer(self): if optimizer_fn: @@ -99,10 +122,6 @@ def extra_grad_info(self, batch_tensors): else: return TorchPolicyGraph.extra_grad_info(self, batch_tensors) - @override(TorchPolicyGraph) - def _compute_loss(self, batch_tensors): - return loss_fn(self, batch_tensors) - graph_cls.__name__ = name graph_cls.__qualname__ = name return graph_cls From ac108dde2710ba574956f390e9dad8b34a3896bf Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 16:13:09 -0700 Subject: [PATCH 19/39] cleanup --- .../agents/a3c/a3c_torch_policy_graph.py | 26 +++++++++---------- .../evaluation/torch_policy_graph_template.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 13704f74e938..930bd78094be 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -40,6 +40,19 @@ def a3c_torch_stats(policy, batch_tensors): } +def postprocess_torch_a3c(policy, + sample_batch, + other_agent_batches=None, + episode=None): + completed = sample_batch[SampleBatch.DONES][-1] + if completed: + last_r = 0.0 + else: + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) + return compute_advantages(sample_batch, last_r, policy.config["gamma"], + policy.config["lambda"]) + + def a3c_extra_action_out(policy, model_out): return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} @@ -57,19 +70,6 @@ def optimizer(policy): return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) -def postprocess_torch_a3c(policy, - sample_batch, - other_agent_batches=None, - episode=None): - completed = sample_batch[SampleBatch.DONES][-1] - if completed: - last_r = 0.0 - else: - last_r = 
policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]) - return compute_advantages(sample_batch, last_r, policy.config["gamma"], - policy.config["lambda"]) - - class ValueNetworkMixin(object): def _value(self, obs): with self.lock: diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py index 50685ce5bd5f..745ba893a93f 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph_template.py +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -21,7 +21,7 @@ def build_torch_policy(name, after_init=None, make_model_and_action_dist=None, mixins=None): - """Helper function for creating a dynamic tf policy graph at runtime. + """Helper function for creating a torch policy graph at runtime. Arguments: name (str): name of the graph (e.g., "PPOPolicyGraph") From 707acf827347783852ba106b1d1530034a1e1637 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 16:20:28 -0700 Subject: [PATCH 20/39] remove underscores --- python/ray/rllib/agents/pg/pg.py | 8 ++++---- python/ray/rllib/agents/ppo/ppo.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index d9acebd91e80..a2e81be5ebec 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -21,13 +21,13 @@ # yapf: enable -def _make_policy_optimizer(local_ev, remote_evs, config): +def make_policy_optimizer(local_ev, remote_evs, config): optimizer_config = dict(config["optimizer"], **{"train_batch_size": config["train_batch_size"]}) return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def _get_policy_graph(config): +def get_policy_graph(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import \ PGTorchPolicyGraph @@ -40,5 +40,5 @@ def _get_policy_graph(config): "PG", default_config=DEFAULT_CONFIG, default_policy_graph=PGPolicyGraph, - get_policy_graph=_get_policy_graph, - make_policy_optimizer=_make_policy_optimizer) + get_policy_graph=get_policy_graph, + make_policy_optimizer=make_policy_optimizer) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 54f7a579fcc4..6bf6269a6a80 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -63,7 +63,7 @@ # yapf: enable -def _make_optimizer(local_evaluator, remote_evaluators, config): +def make_optimizer(local_evaluator, remote_evaluators, config): if config["simple_optimizer"]: return SyncSamplesOptimizer( local_evaluator, @@ -84,7 +84,7 @@ def _make_optimizer(local_evaluator, remote_evaluators, config): straggler_mitigation=config["straggler_mitigation"]) -def _update_kl(trainer, fetches): +def update_kl(trainer, fetches): if "kl" in fetches: # single-agent trainer.local_evaluator.for_policy( @@ -101,7 +101,7 @@ def update(pi, pi_id): trainer.local_evaluator.foreach_trainable_policy(update) -def _warn_about_obs_filter(trainer): +def warn_about_obs_filter(trainer): if "observation_filter" not in trainer.raw_user_config: # TODO(ekl) remove this message after a few releases logger.info( @@ -112,7 +112,7 @@ def _warn_about_obs_filter(trainer): "require observation normalization.") -def _warn_about_bad_reward_scales(trainer, result): +def warn_about_bad_reward_scales(trainer, result): # Warn about bad clipping configs if trainer.config["vf_clip_param"] <= 0: rew_scale = float("inf") @@ -132,7 +132,7 @@ def _warn_about_bad_reward_scales(trainer, result): 
"increasing `vf_clip_param`.") -def _validate_config(config): +def validate_config(config): if config["entropy_coeff"] < 0: raise DeprecationWarning("entropy_coeff must be >= 0") if config["sgd_minibatch_size"] > config["train_batch_size"]: @@ -159,8 +159,8 @@ def _validate_config(config): "PPO", default_config=DEFAULT_CONFIG, default_policy_graph=PPOPolicyGraph, - make_policy_optimizer=_make_optimizer, - validate_config=_validate_config, - after_optimizer_step=_update_kl, - before_train_step=_warn_about_obs_filter, - after_train_result=_warn_about_bad_reward_scales) + make_policy_optimizer=make_optimizer, + validate_config=validate_config, + after_optimizer_step=update_kl, + before_train_step=warn_about_obs_filter, + after_train_result=warn_about_bad_reward_scales) From a4a92600af018d94983eead2898213d76b49ec1c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 17:07:00 -0700 Subject: [PATCH 21/39] fix py2 compat --- python/ray/rllib/agents/ddpg/ddpg_policy_graph.py | 2 +- .../rllib/evaluation/tf_policy_graph_template.py | 13 +++++++++---- .../evaluation/torch_policy_graph_template.py | 15 ++++++++++----- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 6c4917ad853f..6ac8f1ef8ab2 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -507,7 +507,7 @@ def make_noisy_actions(): def make_uniform_random_actions(): # pure random exploration option - uniform_random_actions = tf.random.uniform( + uniform_random_actions = tf.random_uniform( tf.shape(deterministic_actions)) # rescale uniform random actions according to action range tf_range = tf.constant(action_range[None], dtype="float32") diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_graph_template.py index 4c708de35e18..57981b326673 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph_template.py +++ b/python/ray/rllib/evaluation/tf_policy_graph_template.py @@ -59,13 +59,18 @@ def build_tf_policy(name, a DynamicTFPolicyGraph instance that uses the specified args """ - if mixins is None: - mixins = [] - if not name.endswith("PolicyGraph"): raise ValueError("Name should match *PolicyGraph", name) - class graph_cls(*mixins, DynamicTFPolicyGraph): + base = DynamicTFPolicyGraph + while mixins: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + class graph_cls(base): def __init__(self, obs_space, action_space, diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_graph_template.py index 745ba893a93f..71756f11dc85 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph_template.py +++ b/python/ray/rllib/evaluation/torch_policy_graph_template.py @@ -55,13 +55,18 @@ def build_torch_policy(name, a TorchPolicyGraph instance that uses the specified args """ - if mixins is None: - mixins = [] - if not name.endswith("TorchPolicyGraph"): raise ValueError("Name should match *TorchPolicyGraph", name) - class graph_cls(*mixins, TorchPolicyGraph): + base = TorchPolicyGraph + while mixins: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + class graph_cls(base): def __init__(self, obs_space, action_space, config): config = dict(get_default_config(), **config) self.config = config @@ -106,7 +111,7 @@ def extra_action_out(self, model_out): if extra_action_out_fn: return extra_action_out_fn(self, 
model_out) else: - return TorchPolicyGraph.extra_action_out_fn(self, model_out) + return TorchPolicyGraph.extra_action_out(self, model_out) @override(TorchPolicyGraph) def optimizer(self): From a2281da49f563e92fa0ca5798f8b06d23fa398f2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 22:17:55 -0700 Subject: [PATCH 22/39] Update dynamic_tf_policy_graph.py --- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index d61ecc80ef5f..feba391e3574 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -239,6 +239,7 @@ def fake_array(tensor): elif v.dtype == np.object: continue # can't handle arbitrary objects in TF shape = (None, ) + v.shape[1:] + dtype = np.float32 if v.dtype == np.float64 else v.dtype placeholder = tf.placeholder(v.dtype, shape=shape, name=k) batch_tensors[k] = placeholder From 817a1f9423c27c7409d1aa527214efabfc5c6ca4 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 16 May 2019 23:01:33 -0700 Subject: [PATCH 23/39] Update tracking_dict.py --- python/ray/rllib/utils/tracking_dict.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/utils/tracking_dict.py b/python/ray/rllib/utils/tracking_dict.py index d43f6e87b0f7..c0f145734e78 100644 --- a/python/ray/rllib/utils/tracking_dict.py +++ b/python/ray/rllib/utils/tracking_dict.py @@ -16,6 +16,7 @@ class UsageTrackingDict(dict): def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) self.accessed_keys = set() + self.intercepted_values = {} self.get_interceptor = None def set_get_interceptor(self, fn): @@ -25,5 +26,7 @@ def __getitem__(self, key): self.accessed_keys.add(key) value = dict.__getitem__(self, key) if self.get_interceptor: - value = self.get_interceptor(value) + if key not in self.intercepted_values: + self.intercepted_values[key] = self.get_interceptor(value) + value = self.intercepted_values[key] return value From a7229cec0b8bff3afe9dfe0dbd5918910289490c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:03:42 -0700 Subject: [PATCH 24/39] wip --- .../agents/a3c/a3c_torch_policy_graph.py | 30 +++++++++---------- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 16 +++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 930bd78094be..f241e25ef4d3 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -13,7 +13,7 @@ from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy -def a3c_torch_loss(policy, batch_tensors): +def actor_critic_loss(policy, batch_tensors): logits, _, values, _ = policy.model({ SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] }, []) @@ -32,7 +32,7 @@ def a3c_torch_loss(policy, batch_tensors): return overall_err -def a3c_torch_stats(policy, batch_tensors): +def loss_and_entropy_stats(policy, batch_tensors): return { "policy_entropy": policy.entropy.item(), "policy_loss": policy.pi_err.item(), @@ -40,10 +40,10 @@ def a3c_torch_stats(policy, batch_tensors): } -def postprocess_torch_a3c(policy, - sample_batch, - other_agent_batches=None, - episode=None): +def add_advantages(policy, + sample_batch, + other_agent_batches=None, + episode=None): 
completed = sample_batch[SampleBatch.DONES][-1] if completed: last_r = 0.0 @@ -53,11 +53,11 @@ def postprocess_torch_a3c(policy, policy.config["lambda"]) -def a3c_extra_action_out(policy, model_out): +def model_value_predictions(policy, model_out): return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} -def a3c_extra_grad_process(policy): +def apply_grad_clipping(policy): info = {} if policy.config["grad_clip"]: total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(), @@ -66,7 +66,7 @@ def a3c_extra_grad_process(policy): return info -def optimizer(policy): +def torch_optimizer(policy): return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) @@ -81,10 +81,10 @@ def _value(self, obs): A3CTorchPolicyGraph = build_torch_policy( name="A3CTorchPolicyGraph", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, - loss_fn=a3c_torch_loss, - stats_fn=a3c_torch_stats, - postprocess_fn=postprocess_torch_a3c, - extra_action_out_fn=a3c_extra_action_out, - extra_grad_process_fn=a3c_extra_grad_process, - optimizer_fn=optimizer, + loss_fn=actor_critic_loss, + stats_fn=loss_and_entropy_stats, + postprocess_fn=add_advantages, + extra_action_out_fn=model_value_predictions, + extra_grad_process_fn=apply_grad_clipping, + optimizer_fn=torch_optimizer, mixins=[ValueNetworkMixin]) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 6e05d7069f90..aab90a034a36 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -104,7 +104,7 @@ def reduce_mean_valid(t): self.loss = loss -def build_ppo_loss(policy, batch_tensors): +def ppo_surrogate_loss(policy, batch_tensors): if policy.model.state_in: max_seq_len = tf.reduce_max(policy.model.seq_lens) mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) @@ -133,7 +133,7 @@ def build_ppo_loss(policy, batch_tensors): return policy.loss_obj.loss -def build_ppo_stats(policy, batch_tensors): +def kl_and_loss_stats(policy, batch_tensors): policy.explained_variance = explained_variance( batch_tensors[Postprocessing.VALUE_TARGETS], policy.value_function) @@ -151,7 +151,7 @@ def build_ppo_stats(policy, batch_tensors): return stats_fetches -def build_ppo_action_fetches(policy): +def vf_preds_and_logits_fetches(policy): """Adds value function and logits outputs to experience batches.""" return { SampleBatch.VF_PREDS: policy.value_function, @@ -185,7 +185,7 @@ def postprocess_ppo_gae(policy, return batch -def build_ppo_gradients(policy, optimizer, loss): +def clip_gradients(policy, optimizer, loss): if policy.config["grad_clip"] is not None: policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) @@ -274,10 +274,10 @@ def setup_mixins(policy, obs_space, action_space, config): PPOPolicyGraph = build_tf_policy( name="PPOPolicyGraph", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, - loss_fn=build_ppo_loss, - stats_fn=build_ppo_stats, - extra_action_fetches_fn=build_ppo_action_fetches, + loss_fn=ppo_surrogate_loss, + stats_fn=kl_and_loss_stats, + extra_action_fetches_fn=vf_preds_and_logits_fetches, postprocess_fn=postprocess_ppo_gae, - gradients_fn=build_ppo_gradients, + gradients_fn=clip_gradients, before_loss_init=setup_mixins, mixins=[LearningRateSchedule, KLCoeffMixin, ValueNetworkMixin]) From 4b9eb6df1138801181a637dd4bc614ffdef9aa97 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:06:05 -0700 Subject: [PATCH 25/39] rename --- 
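For context on the batch_tensors argument that the renamed torch loss and stats functions above receive: it is the UsageTrackingDict built by TorchPolicyGraph._lazy_tensor_dict, which converts numpy columns to torch tensors on first access and, with the tracking_dict change earlier in this series, caches the converted value. A standalone sketch of that behavior, illustrative only and not part of this patch:

import numpy as np
import torch

from ray.rllib.utils.tracking_dict import UsageTrackingDict

batch_tensors = UsageTrackingDict({"obs": np.zeros((4, 3), dtype=np.float32)})
batch_tensors.set_get_interceptor(lambda arr: torch.from_numpy(arr))

first = batch_tensors["obs"]   # numpy array converted to a torch.Tensor here
second = batch_tensors["obs"]  # served from intercepted_values; same object
assert first is second
assert "obs" in batch_tensors.accessed_keys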
python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py | 2 +- python/ray/rllib/agents/pg/pg_policy_graph.py | 2 +- python/ray/rllib/agents/pg/torch_pg_policy_graph.py | 2 +- python/ray/rllib/agents/ppo/ppo_policy_graph.py | 2 +- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 2 +- .../{tf_policy_graph_template.py => tf_policy_template.py} | 0 ...{torch_policy_graph_template.py => torch_policy_template.py} | 0 7 files changed, 5 insertions(+), 5 deletions(-) rename python/ray/rllib/evaluation/{tf_policy_graph_template.py => tf_policy_template.py} (100%) rename python/ray/rllib/evaluation/{torch_policy_graph_template.py => torch_policy_template.py} (100%) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index f241e25ef4d3..807c2327d77e 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -10,7 +10,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy +from ray.rllib.evaluation.torch_policy_template import build_torch_policy def actor_critic_loss(policy, batch_tensors): diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 4bdcb3e1fed5..84e330adccfd 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -5,7 +5,7 @@ import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy +from ray.rllib.evaluation.tf_policy_template import build_tf_policy from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 040dcadc9742..cbdfc4f5ea98 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch -from ray.rllib.evaluation.torch_policy_graph_template import build_torch_policy +from ray.rllib.evaluation.torch_policy_template import build_torch_policy def pg_torch_loss(policy, batch_tensors): diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index aab90a034a36..5984eee545ab 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -9,7 +9,7 @@ Postprocessing from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule -from ray.rllib.evaluation.tf_policy_graph_template import build_tf_policy +from ray.rllib.evaluation.tf_policy_template import build_tf_policy from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance from ray.rllib.utils import try_import_tf diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index feba391e3574..d425770048e3 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ 
b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -240,7 +240,7 @@ def fake_array(tensor): continue # can't handle arbitrary objects in TF shape = (None, ) + v.shape[1:] dtype = np.float32 if v.dtype == np.float64 else v.dtype - placeholder = tf.placeholder(v.dtype, shape=shape, name=k) + placeholder = tf.placeholder(dtype, shape=shape, name=k) batch_tensors[k] = placeholder if log_once("loss_init"): diff --git a/python/ray/rllib/evaluation/tf_policy_graph_template.py b/python/ray/rllib/evaluation/tf_policy_template.py similarity index 100% rename from python/ray/rllib/evaluation/tf_policy_graph_template.py rename to python/ray/rllib/evaluation/tf_policy_template.py diff --git a/python/ray/rllib/evaluation/torch_policy_graph_template.py b/python/ray/rllib/evaluation/torch_policy_template.py similarity index 100% rename from python/ray/rllib/evaluation/torch_policy_graph_template.py rename to python/ray/rllib/evaluation/torch_policy_template.py From 6a1011e65988a0ae5b8b0803e0ad6f9ea654a3c1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:08:12 -0700 Subject: [PATCH 26/39] debug level --- python/ray/rllib/evaluation/tf_policy_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index a33626b2df52..853a6110eaf1 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -173,7 +173,7 @@ def _initialize_loss(self, loss, loss_inputs): self._grads_and_vars) if log_once("loss_used"): - logger.info( + logger.debug( "These tensors were used in the loss_fn:\n\n{}\n".format( summarize(self._loss_input_dict))) From b1cecbeac8b7315fa28cd290689a70215376327b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 00:17:32 -0700 Subject: [PATCH 27/39] rename policy_graph -> policy in new classes --- .../rllib/agents/a3c/a3c_torch_policy_graph.py | 4 ++-- python/ray/rllib/agents/pg/pg.py | 15 +++++++-------- python/ray/rllib/agents/pg/pg_policy_graph.py | 4 ++-- .../ray/rllib/agents/pg/torch_pg_policy_graph.py | 4 ++-- python/ray/rllib/agents/ppo/ppo.py | 4 ++-- python/ray/rllib/agents/ppo/ppo_policy_graph.py | 4 ++-- python/ray/rllib/agents/trainer_template.py | 16 ++++++++-------- .../ray/rllib/evaluation/tf_policy_template.py | 6 +++--- .../rllib/evaluation/torch_policy_template.py | 6 +++--- 9 files changed, 31 insertions(+), 32 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 807c2327d77e..15301f85b53e 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -78,8 +78,8 @@ def _value(self, obs): return vf.detach().cpu().numpy().squeeze() -A3CTorchPolicyGraph = build_torch_policy( - name="A3CTorchPolicyGraph", +A3CTorchPolicy = build_torch_policy( + name="A3CTorchPolicy", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=actor_critic_loss, stats_fn=loss_and_entropy_stats, diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index a2e81be5ebec..84cb9b9d8917 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -4,7 +4,7 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import 
PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer # yapf: disable @@ -27,18 +27,17 @@ def make_policy_optimizer(local_ev, remote_evs, config): return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) -def get_policy_graph(config): +def get_policy_class(config): if config["use_pytorch"]: - from ray.rllib.agents.pg.torch_pg_policy_graph import \ - PGTorchPolicyGraph - return PGTorchPolicyGraph + from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy + return PGTorchPolicy else: - return PGPolicyGraph + return PGTFPolicy PGTrainer = build_trainer( "PG", default_config=DEFAULT_CONFIG, - default_policy_graph=PGPolicyGraph, - get_policy_graph=get_policy_graph, + default_policy=PGTFPolicy, + get_policy_class=get_policy_class, make_policy_optimizer=make_policy_optimizer) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 84e330adccfd..d4f97605e2e5 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -32,8 +32,8 @@ def make_optimizer(policy): return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) -PGPolicyGraph = build_tf_policy( - name="PGPolicyGraph", +PGTFPolicy = build_tf_policy( + name="PGTFPolicy", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=postprocess_advantages, loss_fn=policy_gradient_loss, diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index cbdfc4f5ea98..ea280f3f6c08 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -40,8 +40,8 @@ def make_optimizer(policy): return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) -PGTorchPolicyGraph = build_torch_policy( - name="PGTorchPolicyGraph", +PGTorchPolicy = build_torch_policy( + name="PGTorchPolicy", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=pg_torch_loss, stats_fn=pg_loss_stats, diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 6bf6269a6a80..99a5240b00ea 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -5,7 +5,7 @@ import logging from ray.rllib.agents import with_common_config -from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer @@ -158,7 +158,7 @@ def validate_config(config): PPOTrainer = build_trainer( "PPO", default_config=DEFAULT_CONFIG, - default_policy_graph=PPOPolicyGraph, + default_policy=PPOTFPolicy, make_policy_optimizer=make_optimizer, validate_config=validate_config, after_optimizer_step=update_kl, diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 5984eee545ab..334ca788c936 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -271,8 +271,8 @@ def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) -PPOPolicyGraph = build_tf_policy( - name="PPOPolicyGraph", +PPOTFPolicy = build_tf_policy( + name="PPOTFPolicy", get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, loss_fn=ppo_surrogate_loss, 
stats_fn=kl_and_loss_stats, diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 85ed16b56044..643835fc2890 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -9,10 +9,10 @@ @DeveloperAPI def build_trainer(name, default_config, - default_policy_graph, + default_policy, make_policy_optimizer, validate_config=None, - get_policy_graph=None, + get_policy_class=None, before_train_step=None, after_optimizer_step=None, after_train_result=None): @@ -21,12 +21,12 @@ def build_trainer(name, Arguments: name (str): name of the trainer (e.g., "PPO") default_config (dict): the default config dict of the algorithm - default_policy_graph (cls): the default PolicyGraph class to use + default_policy (cls): the default PolicyGraph class to use make_policy_optimizer (func): function that returns a PolicyOptimizer instance given (local_evaluator, remote_evaluators, config) validate_config (func): optional callback that checks a given config for correctness. It may mutate the config as needed. - get_policy_graph (func): optional callback that takes a config and + get_policy_class (func): optional callback that takes a config and returns the policy graph class to override the default with before_train_step (func): optional callback to run before each train() call. It takes the trainer instance as an argument. @@ -48,15 +48,15 @@ def build_trainer(name, class trainer_cls(Trainer): _name = name _default_config = default_config - _policy_graph = default_policy_graph + _policy_graph = default_policy def _init(self, config, env_creator): if validate_config: validate_config(config) - if get_policy_graph is None: - policy_graph = default_policy_graph + if get_policy_class is None: + policy_graph = default_policy else: - policy_graph = get_policy_graph(config) + policy_graph = get_policy_class(config) self.local_evaluator = self.make_local_evaluator( env_creator, policy_graph) self.remote_evaluators = self.make_remote_evaluators( diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 57981b326673..40294ef4139d 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -25,7 +25,7 @@ def build_tf_policy(name, """Helper function for creating a dynamic tf policy graph at runtime. Arguments: - name (str): name of the graph (e.g., "PPOPolicyGraph") + name (str): name of the graph (e.g., "PPOPolicy") get_default_config (func): function that returns the default config to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, @@ -59,8 +59,8 @@ def build_tf_policy(name, a DynamicTFPolicyGraph instance that uses the specified args """ - if not name.endswith("PolicyGraph"): - raise ValueError("Name should match *PolicyGraph", name) + if not name.endswith("TFPolicy"): + raise ValueError("Name should match *TFPolicy", name) base = DynamicTFPolicyGraph while mixins: diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index 71756f11dc85..374ed7395b2a 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -24,7 +24,7 @@ def build_torch_policy(name, """Helper function for creating a torch policy graph at runtime. 
Arguments: - name (str): name of the graph (e.g., "PPOPolicyGraph") + name (str): name of the graph (e.g., "PPOPolicy") get_default_config (func): function that returns the default config to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, @@ -55,8 +55,8 @@ def build_torch_policy(name, a TorchPolicyGraph instance that uses the specified args """ - if not name.endswith("TorchPolicyGraph"): - raise ValueError("Name should match *TorchPolicyGraph", name) + if not name.endswith("TorchPolicy"): + raise ValueError("Name should match *TorchPolicy", name) base = TorchPolicyGraph while mixins: From c857285f5c9f6a8d7f0c81d0a3997fffcda7270a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 11:26:04 -0700 Subject: [PATCH 28/39] fix test --- python/ray/rllib/agents/a3c/a3c.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index 836d9f074999..eb384058de80 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -49,8 +49,8 @@ class A3CTrainer(Trainer): def _init(self, config, env_creator): if config["use_pytorch"]: from ray.rllib.agents.a3c.a3c_torch_policy_graph import \ - A3CTorchPolicyGraph - policy_cls = A3CTorchPolicyGraph + A3CTorchPolicy + policy_cls = A3CTorchPolicy else: policy_cls = self._policy_graph From 64b267e29c3286fb5e0b4b1e4f3cea2cf7d0cc18 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 14:39:22 -0700 Subject: [PATCH 29/39] rename ppo tf policy --- python/ray/rllib/evaluation/policy_evaluator.py | 10 +++++----- python/ray/rllib/examples/multiagent_two_trainers.py | 4 ++-- .../ray/rllib/tests/test_external_multi_agent_env.py | 4 ++-- python/ray/rllib/tests/test_io.py | 4 ++-- python/ray/rllib/tests/test_multi_agent_env.py | 8 ++++---- python/ray/rllib/tests/test_nested_spaces.py | 6 +++--- python/ray/rllib/tests/test_optimizers.py | 6 +++--- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index f6761122156e..48e19dfcb96e 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -65,7 +65,7 @@ class PolicyEvaluator(EvaluatorInterface): >>> # Create a policy evaluator and using it to collect experiences. >>> evaluator = PolicyEvaluator( ... env_creator=lambda _: gym.make("CartPole-v0"), - ... policy_graph=PGPolicyGraph) + ... policy_graph=PGTFPolicy) >>> print(evaluator.sample()) SampleBatch({ "obs": [[...]], "actions": [[...]], "rewards": [[...]], @@ -76,7 +76,7 @@ class PolicyEvaluator(EvaluatorInterface): ... evaluator_cls=PolicyEvaluator, ... evaluator_args={ ... "env_creator": lambda _: gym.make("CartPole-v0"), - ... "policy_graph": PGPolicyGraph, + ... "policy_graph": PGTFPolicy, ... }, ... num_workers=10) >>> for _ in range(10): optimizer.step() @@ -87,12 +87,12 @@ class PolicyEvaluator(EvaluatorInterface): ... policy_graphs={ ... # Use an ensemble of two policies for car agents ... "car_policy1": - ... (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.99}), + ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}), ... "car_policy2": - ... (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.95}), + ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}), ... # Use a single shared policy for all traffic lights ... "traffic_light_policy": - ... (PGPolicyGraph, Box(...), Discrete(...), {}), + ... 
(PGTFPolicy, Box(...), Discrete(...), {}), ... }, ... policy_mapping_fn=lambda agent_id: ... random.choice(["car_policy1", "car_policy2"]) diff --git a/python/ray/rllib/examples/multiagent_two_trainers.py b/python/ray/rllib/examples/multiagent_two_trainers.py index 2c18f2bf4b96..1d4257e4eb9d 100644 --- a/python/ray/rllib/examples/multiagent_two_trainers.py +++ b/python/ray/rllib/examples/multiagent_two_trainers.py @@ -18,7 +18,7 @@ from ray.rllib.agents.dqn.dqn import DQNTrainer from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph from ray.rllib.agents.ppo.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy from ray.rllib.tests.test_multi_agent_env import MultiCartpole from ray.tune.logger import pretty_print from ray.tune.registry import register_env @@ -39,7 +39,7 @@ # You can also have multiple policy graphs per trainer, but here we just # show one each for PPO and DQN. policy_graphs = { - "ppo_policy": (PPOPolicyGraph, obs_space, act_space, {}), + "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}), "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}), } diff --git a/python/ray/rllib/tests/test_external_multi_agent_env.py b/python/ray/rllib/tests/test_external_multi_agent_env.py index e5e182b38655..c01e6fa0b7ae 100644 --- a/python/ray/rllib/tests/test_external_multi_agent_env.py +++ b/python/ray/rllib/tests/test_external_multi_agent_env.py @@ -8,7 +8,7 @@ import unittest import ray -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv @@ -67,7 +67,7 @@ def testTrainExternalMultiCartpoleManyPolicies(self): obs_space = single_env.observation_space policies = {} for i in range(20): - policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space, + policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {}) policy_ids = list(policies.keys()) ev = PolicyEvaluator( diff --git a/python/ray/rllib/tests/test_io.py b/python/ray/rllib/tests/test_io.py index 9f92c9107c4e..0706be1019cc 100644 --- a/python/ray/rllib/tests/test_io.py +++ b/python/ray/rllib/tests/test_io.py @@ -15,7 +15,7 @@ import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.offline import IOContext, JsonWriter, JsonReader from ray.rllib.offline.json_writer import _to_json @@ -159,7 +159,7 @@ def testMultiAgent(self): def gen_policy(): obs_space = single_env.observation_space act_space = single_env.action_space - return (PGPolicyGraph, obs_space, act_space, {}) + return (PGTFPolicy, obs_space, act_space, {}) pg = PGTrainer( env="multi_cartpole", diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index eccb9aa82fb8..72130712d555 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -8,7 +8,7 @@ import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph from 
ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) @@ -470,7 +470,7 @@ def get_initial_state(self): self.assertEqual(batch["state_out_0"][1], h) def testReturningModelBasedRolloutsData(self): - class ModelBasedPolicyGraph(PGPolicyGraph): + class ModelBasedPolicyGraph(PGTFPolicy): def compute_actions(self, obs_batch, state_batches, @@ -584,7 +584,7 @@ def _testWithOptimizer(self, optimizer_cls): } else: policies = { - "p1": (PGPolicyGraph, obs_space, act_space, {}), + "p1": (PGTFPolicy, obs_space, act_space, {}), "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config), } ev = PolicyEvaluator( @@ -640,7 +640,7 @@ def testTrainMultiCartpoleManyPolicies(self): obs_space = env.observation_space policies = {} for i in range(20): - policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space, + policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {}) policy_ids = list(policies.keys()) ev = PolicyEvaluator( diff --git a/python/ray/rllib/tests/test_nested_spaces.py b/python/ray/rllib/tests/test_nested_spaces.py index e4285e42287c..b70bd9a2908e 100644 --- a/python/ray/rllib/tests/test_nested_spaces.py +++ b/python/ray/rllib/tests/test_nested_spaces.py @@ -12,7 +12,7 @@ import ray from ray.rllib.agents.a3c import A2CTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph +from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.vector_env import VectorEnv @@ -333,10 +333,10 @@ def testMultiAgentComplexSpaces(self): "multiagent": { "policy_graphs": { "tuple_policy": ( - PGPolicyGraph, TUPLE_SPACE, act_space, + PGTFPolicy, TUPLE_SPACE, act_space, {"model": {"custom_model": "tuple_spy"}}), "dict_policy": ( - PGPolicyGraph, DICT_SPACE, act_space, + PGTFPolicy, DICT_SPACE, act_space, {"model": {"custom_model": "dict_spy"}}), }, "policy_mapping_fn": lambda a: { diff --git a/python/ray/rllib/tests/test_optimizers.py b/python/ray/rllib/tests/test_optimizers.py index 9c9e6b56b426..5436baeafa90 100644 --- a/python/ray/rllib/tests/test_optimizers.py +++ b/python/ray/rllib/tests/test_optimizers.py @@ -9,7 +9,7 @@ import ray from ray.rllib.agents.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph +from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.optimizers import AsyncGradientsOptimizer, AsyncSamplesOptimizer @@ -240,12 +240,12 @@ def make_sess(): local = PolicyEvaluator( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=PPOPolicyGraph, + policy_graph=PPOTFPolicy, tf_session_creator=make_sess) remotes = [ PolicyEvaluator.as_remote().remote( env_creator=lambda _: gym.make("CartPole-v0"), - policy_graph=PPOPolicyGraph, + policy_graph=PPOTFPolicy, tf_session_creator=make_sess) ] return local, remotes From 52f06e9dce3612c830fe99a59fd1cebcfbe83680 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 16:23:22 -0700 Subject: [PATCH 30/39] port appo too --- python/ray/rllib/agents/ppo/appo.py | 6 +- .../ray/rllib/agents/ppo/appo_policy_graph.py | 549 +++++++----------- .../evaluation/dynamic_tf_policy_graph.py | 29 +- .../rllib/evaluation/tf_policy_template.py | 11 +- 4 files changed, 259 insertions(+), 336 deletions(-) diff --git a/python/ray/rllib/agents/ppo/appo.py 
b/python/ray/rllib/agents/ppo/appo.py index ac3251775d52..b32531dd7d5c 100644 --- a/python/ray/rllib/agents/ppo/appo.py +++ b/python/ray/rllib/agents/ppo/appo.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function -from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOPolicyGraph +from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOTFPolicy from ray.rllib.agents.trainer import with_base_config from ray.rllib.agents import impala from ray.rllib.utils.annotations import override @@ -57,8 +57,8 @@ class APPOTrainer(impala.ImpalaTrainer): _name = "APPO" _default_config = DEFAULT_CONFIG - _policy_graph = AsyncPPOPolicyGraph + _policy_graph = AsyncPPOTFPolicy @override(impala.ImpalaTrainer) def _get_policy_graph(self): - return AsyncPPOPolicyGraph + return AsyncPPOTFPolicy diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index caaaf512bcb1..b2ff83ee2e85 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -12,14 +12,11 @@ import ray from ray.rllib.agents.impala import vtrace -from ray.rllib.evaluation.policy_graph import PolicyGraph -from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY -from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ - LearningRateSchedule -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.annotations import override +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.evaluation.sample_batch import SampleBatch +from ray.rllib.evaluation.tf_policy_template import build_tf_policy +from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule from ray.rllib.utils.explained_variance import explained_variance -from ray.rllib.models.action_dist import MultiCategorical from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.utils import try_import_tf @@ -27,6 +24,8 @@ logger = logging.getLogger(__name__) +BEHAVIOUR_LOGITS = "behaviour_logits" + class PPOSurrogateLoss(object): """Loss used when V-trace is disabled. 
@@ -163,333 +162,233 @@ def __init__(self, self.entropy * entropy_coeff) -class APPOPostprocessing(object): - """Adds the policy logits, VF preds, and advantages to the trajectory.""" - - @override(TFPolicyGraph) - def extra_compute_action_fetches(self): - out = {"behaviour_logits": self.model.outputs} - if not self.config["vtrace"]: - out["vf_preds"] = self.value_function - return dict(TFPolicyGraph.extra_compute_action_fetches(self), **out) - - @override(PolicyGraph) - def postprocess_trajectory(self, - sample_batch, - other_agent_batches=None, - episode=None): - if not self.config["vtrace"]: - completed = sample_batch["dones"][-1] - if completed: - last_r = 0.0 - else: - next_state = [] - for i in range(len(self.model.state_in)): - next_state.append( - [sample_batch["state_out_{}".format(i)][-1]]) - last_r = self.value(sample_batch["new_obs"][-1], *next_state) - batch = compute_advantages( - sample_batch, - last_r, - self.config["gamma"], - self.config["lambda"], - use_gae=self.config["use_gae"]) - else: - batch = sample_batch - del batch.data["new_obs"] # not used, so save some bandwidth - return batch - - -class AsyncPPOPolicyGraph(LearningRateSchedule, APPOPostprocessing, - TFPolicyGraph): - def __init__(self, - observation_space, - action_space, - config, - existing_inputs=None): - config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) - assert config["batch_mode"] == "truncate_episodes", \ - "Must use `truncate_episodes` batch mode with V-trace." - self.config = config - self.sess = tf.get_default_session() - self.grads = None - - if isinstance(action_space, gym.spaces.Discrete): - is_multidiscrete = False - output_hidden_shape = [action_space.n] - elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): - is_multidiscrete = True - output_hidden_shape = action_space.nvec.astype(np.int32) - else: - is_multidiscrete = False - output_hidden_shape = 1 - - # Policy network model - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - - # Create input placeholders - if existing_inputs: - if self.config["vtrace"]: - actions, dones, behaviour_logits, rewards, observations, \ - prev_actions, prev_rewards = existing_inputs[:7] - existing_state_in = existing_inputs[7:-1] - existing_seq_lens = existing_inputs[-1] - else: - actions, dones, behaviour_logits, rewards, observations, \ - prev_actions, prev_rewards, adv_ph, value_targets = \ - existing_inputs[:9] - existing_state_in = existing_inputs[9:-1] - existing_seq_lens = existing_inputs[-1] +def _make_time_major(policy, tensor, drop_last=False): + """Swaps batch and trajectory axis. + Args: + policy: Policy reference + tensor: A tensor or list of tensors to reshape. + drop_last: A bool indicating whether to drop the last + trajectory item. + Returns: + res: A tensor with swapped axes or a list of tensors with + swapped axes. + """ + if isinstance(tensor, list): + return [_make_time_major(policy, t, drop_last) for t in tensor] + + if policy.model.state_init: + B = tf.shape(policy.model.seq_lens)[0] + T = tf.shape(tensor)[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. 
TODO(ekl) this is kind of a hack + T = policy.config["sample_batch_size"] + B = tf.shape(tensor)[0] // T + rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) + + # swap B and T axes + res = tf.transpose( + rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) + + if drop_last: + return res[:-1] + return res + + +def build_appo_surrogate_loss(policy, batch_tensors): + if isinstance(policy.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [policy.action_space.n] + elif isinstance(policy.action_space, + gym.spaces.multi_discrete.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = policy.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def make_time_major(*args, **kw): + return _make_time_major(policy, *args, **kw) + + actions = batch_tensors[SampleBatch.ACTIONS] + dones = batch_tensors[SampleBatch.DONES] + rewards = batch_tensors[SampleBatch.REWARDS] + behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] + unpacked_behaviour_logits = tf.split( + behaviour_logits, output_hidden_shape, axis=1) + unpacked_outputs = tf.split( + policy.model.outputs, output_hidden_shape, axis=1) + prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ + behaviour_logits + action_dist = policy.action_dist + prev_action_dist = policy.dist_class(prev_dist_inputs) + values = policy.value_function + + if policy.model.state_in: + max_seq_len = tf.reduce_max(policy.model.seq_lens) - 1 + mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) + mask = tf.reshape(mask, [-1]) + else: + mask = tf.ones_like(rewards) + + if policy.config["vtrace"]: + logger.info("Using V-Trace surrogate loss (vtrace=True)") + + # Prepare actions for loss + loss_actions = actions if is_multidiscrete else tf.expand_dims( + actions, axis=1) + + policy.loss = VTraceSurrogateLoss( + actions=make_time_major(loss_actions, drop_last=True), + prev_actions_logp=make_time_major( + prev_action_dist.logp(actions), drop_last=True), + actions_logp=make_time_major( + action_dist.logp(actions), drop_last=True), + action_kl=prev_action_dist.kl(action_dist), + actions_entropy=make_time_major( + action_dist.entropy(), drop_last=True), + dones=make_time_major(dones, drop_last=True), + behaviour_logits=make_time_major( + unpacked_behaviour_logits, drop_last=True), + target_logits=make_time_major(unpacked_outputs, drop_last=True), + discount=policy.config["gamma"], + rewards=make_time_major(rewards, drop_last=True), + values=make_time_major(values, drop_last=True), + bootstrap_value=make_time_major(values)[-1], + dist_class=policy.dist_class, + valid_mask=make_time_major(mask, drop_last=True), + vf_loss_coeff=policy.config["vf_loss_coeff"], + entropy_coeff=policy.config["entropy_coeff"], + clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"], + clip_pg_rho_threshold=policy.config[ + "vtrace_clip_pg_rho_threshold"], + clip_param=policy.config["clip_param"]) + else: + logger.info("Using PPO surrogate loss (vtrace=False)") + policy.loss = PPOSurrogateLoss( + prev_actions_logp=make_time_major(prev_action_dist.logp(actions)), + actions_logp=make_time_major(action_dist.logp(actions)), + action_kl=prev_action_dist.kl(action_dist), + actions_entropy=make_time_major(action_dist.entropy()), + values=make_time_major(values), + valid_mask=make_time_major(mask), + advantages=make_time_major( + batch_tensors[Postprocessing.ADVANTAGES]), + value_targets=make_time_major( + batch_tensors[Postprocessing.VALUE_TARGETS]), + 
vf_loss_coeff=policy.config["vf_loss_coeff"], + entropy_coeff=policy.config["entropy_coeff"], + clip_param=policy.config["clip_param"]) + + return policy.loss.total_loss + + +def stats(policy, batch_tensors): + values_batched = _make_time_major( + policy, policy.value_function, drop_last=policy.config["vtrace"]) + + return { + "cur_lr": tf.cast(policy.cur_lr, tf.float64), + "policy_loss": policy.loss.pi_loss, + "entropy": policy.loss.entropy, + "var_gnorm": tf.global_norm(policy.var_list), + "vf_loss": policy.loss.vf_loss, + "vf_explained_var": explained_variance( + tf.reshape(policy.loss.value_targets, [-1]), + tf.reshape(values_batched, [-1])), + } + + +def grad_stats(policy, grads): + return { + "grad_gnorm": tf.global_norm(grads), + } + + +def postprocess_trajectory(policy, + sample_batch, + other_agent_batches=None, + episode=None): + if not policy.config["vtrace"]: + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 else: - actions = ModelCatalog.get_action_placeholder(action_space) - dones = tf.placeholder(tf.bool, [None], name="dones") - rewards = tf.placeholder(tf.float32, [None], name="rewards") - behaviour_logits = tf.placeholder( - tf.float32, [None, logit_dim], name="behaviour_logits") - observations = tf.placeholder( - tf.float32, [None] + list(observation_space.shape)) - existing_state_in = None - existing_seq_lens = None - - if not self.config["vtrace"]: - adv_ph = tf.placeholder( - tf.float32, name="advantages", shape=(None, )) - value_targets = tf.placeholder( - tf.float32, name="value_targets", shape=(None, )) - self.observations = observations - - # Unpack behaviour logits - unpacked_behaviour_logits = tf.split( - behaviour_logits, output_hidden_shape, axis=1) - - # Setup the policy - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") - self.model = ModelCatalog.get_model( - { - "obs": observations, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), - }, - observation_space, - action_space, - logit_dim, - self.config["model"], - state_in=existing_state_in, - seq_lens=existing_seq_lens) - unpacked_outputs = tf.split( - self.model.outputs, output_hidden_shape, axis=1) - - dist_inputs = unpacked_outputs if is_multidiscrete else \ - self.model.outputs - prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ - behaviour_logits - - action_dist = dist_class(dist_inputs) - prev_action_dist = dist_class(prev_dist_inputs) - - values = self.model.value_function() - self.value_function = values + next_state = [] + for i in range(len(policy.model.state_in)): + next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + last_r = policy.value(sample_batch["new_obs"][-1], *next_state) + batch = compute_advantages( + sample_batch, + last_r, + policy.config["gamma"], + policy.config["lambda"], + use_gae=policy.config["use_gae"]) + else: + batch = sample_batch + del batch.data["new_obs"] # not used, so save some bandwidth + return batch + + +def add_values_and_logits(policy): + out = {BEHAVIOUR_LOGITS: policy.model.outputs} + if not policy.config["vtrace"]: + out[SampleBatch.VF_PREDS] = policy.value_function + return out + + +def validate_config(policy, obs_space, action_space, config): + assert config["batch_mode"] == "truncate_episodes", \ + "Must use `truncate_episodes` batch mode with V-trace." 
+ + +def optimizer(policy): + if policy.config["opt_type"] == "adam": + return tf.train.AdamOptimizer(policy.cur_lr) + else: + return tf.train.RMSPropOptimizer(policy.cur_lr, policy.config["decay"], + policy.config["momentum"], + policy.config["epsilon"]) + + +def gradients(policy, optimizer, loss): + grads = tf.gradients(loss, policy.var_list) + policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) + clipped_grads = list(zip(policy.grads, policy.var_list)) + return clipped_grads + + +class ValueNetworkMixin(object): + def __init__(self): + self.value_function = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) - def make_time_major(tensor, drop_last=False): - """Swaps batch and trajectory axis. - Args: - tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last - trajectory item. - Returns: - res: A tensor with swapped axes or a list of tensors with - swapped axes. - """ - if isinstance(tensor, list): - return [make_time_major(t, drop_last) for t in tensor] - - if self.model.state_init: - B = tf.shape(self.model.seq_lens)[0] - T = tf.shape(tensor)[0] // B - else: - # Important: chop the tensor into batches at known episode cut - # boundaries. TODO(ekl) this is kind of a hack - T = self.config["sample_batch_size"] - B = tf.shape(tensor)[0] // T - rs = tf.reshape(tensor, - tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) - - # swap B and T axes - res = tf.transpose( - rs, - [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - - if drop_last: - return res[:-1] - return res - - if self.model.state_in: - max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 - mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) - mask = tf.reshape(mask, [-1]) - else: - mask = tf.ones_like(rewards) - - # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
- if self.config["vtrace"]: - logger.info("Using V-Trace surrogate loss (vtrace=True)") - - # Prepare actions for loss - loss_actions = actions if is_multidiscrete else tf.expand_dims( - actions, axis=1) - - self.loss = VTraceSurrogateLoss( - actions=make_time_major(loss_actions, drop_last=True), - prev_actions_logp=make_time_major( - prev_action_dist.logp(actions), drop_last=True), - actions_logp=make_time_major( - action_dist.logp(actions), drop_last=True), - action_kl=prev_action_dist.kl(action_dist), - actions_entropy=make_time_major( - action_dist.entropy(), drop_last=True), - dones=make_time_major(dones, drop_last=True), - behaviour_logits=make_time_major( - unpacked_behaviour_logits, drop_last=True), - target_logits=make_time_major( - unpacked_outputs, drop_last=True), - discount=config["gamma"], - rewards=make_time_major(rewards, drop_last=True), - values=make_time_major(values, drop_last=True), - bootstrap_value=make_time_major(values)[-1], - dist_class=dist_class, - valid_mask=make_time_major(mask, drop_last=True), - vf_loss_coeff=self.config["vf_loss_coeff"], - entropy_coeff=self.config["entropy_coeff"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - clip_pg_rho_threshold=self.config[ - "vtrace_clip_pg_rho_threshold"], - clip_param=self.config["clip_param"]) - else: - logger.info("Using PPO surrogate loss (vtrace=False)") - self.loss = PPOSurrogateLoss( - prev_actions_logp=make_time_major( - prev_action_dist.logp(actions)), - actions_logp=make_time_major(action_dist.logp(actions)), - action_kl=prev_action_dist.kl(action_dist), - actions_entropy=make_time_major(action_dist.entropy()), - values=make_time_major(values), - valid_mask=make_time_major(mask), - advantages=make_time_major(adv_ph), - value_targets=make_time_major(value_targets), - vf_loss_coeff=self.config["vf_loss_coeff"], - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"]) - - # KL divergence between worker and learner logits for debugging - model_dist = MultiCategorical(unpacked_outputs) - behaviour_dist = MultiCategorical(unpacked_behaviour_logits) - - kls = model_dist.kl(behaviour_dist) - if len(kls) > 1: - self.KL_stats = {} - - for i, kl in enumerate(kls): - self.KL_stats.update({ - "mean_KL_{}".format(i): tf.reduce_mean(kl), - "max_KL_{}".format(i): tf.reduce_max(kl), - }) - else: - self.KL_stats = { - "mean_KL": tf.reduce_mean(kls[0]), - "max_KL": tf.reduce_max(kls[0]), - } - - # Initialize TFPolicyGraph - loss_in = [ - ("actions", actions), - ("dones", dones), - ("behaviour_logits", behaviour_logits), - ("rewards", rewards), - ("obs", observations), - ("prev_actions", prev_actions), - ("prev_rewards", prev_rewards), - ] - if not self.config["vtrace"]: - loss_in.append(("advantages", adv_ph)) - loss_in.append(("value_targets", value_targets)) - LearningRateSchedule.__init__(self, self.config["lr"], - self.config["lr_schedule"]) - TFPolicyGraph.__init__( - self, - observation_space, - action_space, - self.sess, - obs_input=observations, - action_sampler=action_dist.sample(), - action_prob=action_dist.sampled_action_prob(), - loss=self.loss.total_loss, - model=self.model, - loss_inputs=loss_in, - state_inputs=self.model.state_in, - state_outputs=self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, - seq_lens=self.model.seq_lens, - max_seq_len=self.config["model"]["max_seq_len"], - batch_divisibility_req=self.config["sample_batch_size"]) - - self.sess.run(tf.global_variables_initializer()) - - values_batched = make_time_major( 
- values, drop_last=self.config["vtrace"]) - self.stats_fetches = { - LEARNER_STATS_KEY: dict({ - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "policy_loss": self.loss.pi_loss, - "entropy": self.loss.entropy, - "grad_gnorm": tf.global_norm(self._grads), - "var_gnorm": tf.global_norm(self.var_list), - "vf_loss": self.loss.vf_loss, - "vf_explained_var": explained_variance( - tf.reshape(self.loss.value_targets, [-1]), - tf.reshape(values_batched, [-1])), - }, **self.KL_stats), - } - - def optimizer(self): - if self.config["opt_type"] == "adam": - return tf.train.AdamOptimizer(self.cur_lr) - else: - return tf.train.RMSPropOptimizer(self.cur_lr, self.config["decay"], - self.config["momentum"], - self.config["epsilon"]) - - def gradients(self, optimizer, loss): - grads = tf.gradients(loss, self.var_list) - self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) - clipped_grads = list(zip(self.grads, self.var_list)) - return clipped_grads - - def extra_compute_grad_fetches(self): - return self.stats_fetches - def value(self, ob, *args): - feed_dict = {self.observations: [ob], self.model.seq_lens: [1]} + feed_dict = {self._obs_input: [ob], self.model.seq_lens: [1]} assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): feed_dict[k] = v - vf = self.sess.run(self.value_function, feed_dict) + vf = self._sess.run(self.value_function, feed_dict) return vf[0] - def get_initial_state(self): - return self.model.state_init - def copy(self, existing_inputs): - return AsyncPPOPolicyGraph( - self.observation_space, - self.action_space, - self.config, - existing_inputs=existing_inputs) +def setup_mixins(policy, obs_space, action_space, config): + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + ValueNetworkMixin.__init__(policy) + + +AsyncPPOTFPolicy = build_tf_policy( + name="AsyncPPOTFPolicy", + get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG, + loss_fn=build_appo_surrogate_loss, + stats_fn=stats, + grad_stats_fn=grad_stats, + postprocess_fn=postprocess_trajectory, + optimizer_fn=optimizer, + extra_action_fetches_fn=add_values_and_logits, + before_init=validate_config, + before_loss_init=setup_mixins, + mixins=[LearningRateSchedule, ValueNetworkMixin], + get_batch_divisibility_req=lambda p: p.config["sample_batch_size"]) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index d425770048e3..64f8f747d984 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -37,9 +37,11 @@ def __init__(self, config, loss_fn, stats_fn=None, + grad_stats_fn=None, before_loss_init=None, make_action_sampler=None, - existing_inputs=None): + existing_inputs=None, + get_batch_divisibility_req=None): """Initialize a dynamic TF policy graph. 
Arguments: @@ -50,6 +52,8 @@ def __init__(self, graph, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors + grad_stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and loss gradient tensors before_loss_init (func): optional function to run prior to loss init that takes the same arguments as __init__ make_action_sampler (func): optional function that returns a @@ -59,10 +63,13 @@ def __init__(self, existing_inputs (OrderedDict): when copying a policy graph, this specifies an existing dict of placeholders to use instead of defining new ones + get_batch_divisibility_req (func): optional function that returns + the divisibility requirement for sample batches """ self.config = config self._loss_fn = loss_fn self._stats_fn = stats_fn + self._grad_stats_fn = grad_stats_fn # Setup standard placeholders if existing_inputs is not None: @@ -90,11 +97,12 @@ def __init__(self, assert not existing_inputs, \ "Cloning not supported with custom action sampler" self.model = None + self.dist_class = None self.action_dist = None action_sampler, action_prob = make_action_sampler( self, input_dict, obs_space, action_space, config) else: - dist_class, logit_dim = ModelCatalog.get_action_dist( + self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if existing_inputs: existing_state_in = [ @@ -116,12 +124,16 @@ def __init__(self, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) - self.action_dist = dist_class(self.model.outputs) + self.action_dist = self.dist_class(self.model.outputs) action_sampler = self.action_dist.sample() action_prob = self.action_dist.sampled_action_prob() # Phase 1 init sess = tf.get_default_session() + if get_batch_divisibility_req: + batch_divisibility_req = get_batch_divisibility_req(self) + else: + batch_divisibility_req = 1 TFPolicyGraph.__init__( self, obs_space, @@ -138,7 +150,8 @@ def __init__(self, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model and self.model.seq_lens, - max_seq_len=config["model"]["max_seq_len"]) + max_seq_len=config["model"]["max_seq_len"], + batch_divisibility_req=batch_divisibility_req) # Phase 2 init before_loss_init(self, obs_space, action_space, config) @@ -184,6 +197,9 @@ def copy(self, existing_inputs): TFPolicyGraph._initialize_loss( instance, loss, [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)]) + if instance._grad_stats_fn: + instance._stats_fetches.update( + instance._grad_stats_fn(instance, instance._grads)) return instance @override(PolicyGraph) @@ -205,7 +221,7 @@ def fake_array(tensor): SampleBatch.CUR_OBS: fake_array(self._obs_input), SampleBatch.NEXT_OBS: fake_array(self._obs_input), SampleBatch.ACTIONS: fake_array(self._sampler), - SampleBatch.REWARDS: np.array([0], dtype=np.int32), + SampleBatch.REWARDS: np.array([0], dtype=np.float32), SampleBatch.DONES: np.array([False], dtype=np.bool), } state_init = self.get_initial_state() @@ -253,6 +269,7 @@ def fake_array(tensor): self._stats_fetches.update(self._stats_fn(self, batch_tensors)) for k in sorted(batch_tensors.accessed_keys): loss_inputs.append((k, batch_tensors[k])) - TFPolicyGraph._initialize_loss(self, loss, loss_inputs) + if self._grad_stats_fn: + self._stats_fetches.update(self._grad_stats_fn(self, self._grads)) self._sess.run(tf.global_variables_initializer()) diff --git 
a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 40294ef4139d..888e82b316eb 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -13,6 +13,7 @@ def build_tf_policy(name, get_default_config, loss_fn, stats_fn=None, + grad_stats_fn=None, extra_action_fetches_fn=None, postprocess_fn=None, optimizer_fn=None, @@ -21,7 +22,8 @@ def build_tf_policy(name, before_loss_init=None, after_init=None, make_action_sampler=None, - mixins=None): + mixins=None, + get_batch_divisibility_req=None): """Helper function for creating a dynamic tf policy graph at runtime. Arguments: @@ -32,6 +34,8 @@ def build_tf_policy(name, and dict of experience tensor placeholders stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors + grad_stats_fn (func): optional function that returns a dict of + TF fetches given the policy graph and loss gradient tensors extra_action_fetches_fn (func): optional function that returns a dict of TF fetches given the policy graph object postprocess_fn (func): optional experience postprocessing function @@ -54,6 +58,8 @@ def build_tf_policy(name, mixins (list): list of any class mixins for the returned policy class. These mixins will be applied in order and will have higher precedence than the DynamicTFPolicyGraph class + get_batch_divisibility_req (func): optional function that returns + the divisibility requirement for sample batches Returns: a DynamicTFPolicyGraph instance that uses the specified args @@ -96,7 +102,8 @@ def before_loss_init_wrapper(policy, obs_space, action_space, action_space, config, loss_fn, - stats_fn, + stats_fn=stats_fn, + grad_stats_fn=grad_stats_fn, before_loss_init=before_loss_init_wrapper, existing_inputs=existing_inputs) From 3f64d4faa70ecbbca49522b5e7b0a377c98e1ceb Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 16:52:00 -0700 Subject: [PATCH 31/39] forgot grads --- python/ray/rllib/agents/ppo/appo_policy_graph.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index b2ff83ee2e85..e78eac98662b 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -342,7 +342,7 @@ def validate_config(policy, obs_space, action_space, config): "Must use `truncate_episodes` batch mode with V-trace." 
-def optimizer(policy): +def choose_optimizer(policy): if policy.config["opt_type"] == "adam": return tf.train.AdamOptimizer(policy.cur_lr) else: @@ -351,7 +351,7 @@ def optimizer(policy): policy.config["epsilon"]) -def gradients(policy, optimizer, loss): +def clip_gradients(policy, optimizer, loss): grads = tf.gradients(loss, policy.var_list) policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) clipped_grads = list(zip(policy.grads, policy.var_list)) @@ -386,7 +386,8 @@ def setup_mixins(policy, obs_space, action_space, config): stats_fn=stats, grad_stats_fn=grad_stats, postprocess_fn=postprocess_trajectory, - optimizer_fn=optimizer, + optimizer_fn=choose_optimizer, + gradients_fn=clip_gradients, extra_action_fetches_fn=add_values_and_logits, before_init=validate_config, before_loss_init=setup_mixins, From 0c6a22c84ed7a805db8285eb2b493e7cf0388fa9 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:22:29 -0700 Subject: [PATCH 32/39] default policy optimizer --- python/ray/rllib/agents/pg/pg.py | 10 +--------- python/ray/rllib/agents/trainer_template.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 84cb9b9d8917..a28c7f73e3a0 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -5,7 +5,6 @@ from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy -from ray.rllib.optimizers import SyncSamplesOptimizer # yapf: disable # __sphinx_doc_begin__ @@ -21,12 +20,6 @@ # yapf: enable -def make_policy_optimizer(local_ev, remote_evs, config): - optimizer_config = dict(config["optimizer"], - **{"train_batch_size": config["train_batch_size"]}) - return SyncSamplesOptimizer(local_ev, remote_evs, **optimizer_config) - - def get_policy_class(config): if config["use_pytorch"]: from ray.rllib.agents.pg.torch_pg_policy_graph import PGTorchPolicy @@ -39,5 +32,4 @@ def get_policy_class(config): "PG", default_config=DEFAULT_CONFIG, default_policy=PGTFPolicy, - get_policy_class=get_policy_class, - make_policy_optimizer=make_policy_optimizer) + get_policy_class=get_policy_class) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index 643835fc2890..d27cf63f43f8 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -3,6 +3,7 @@ from __future__ import print_function from ray.rllib.agents.trainer import Trainer +from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils.annotations import override, DeveloperAPI @@ -10,7 +11,7 @@ def build_trainer(name, default_config, default_policy, - make_policy_optimizer, + make_policy_optimizer=None, validate_config=None, get_policy_class=None, before_train_step=None, @@ -22,8 +23,9 @@ def build_trainer(name, name (str): name of the trainer (e.g., "PPO") default_config (dict): the default config dict of the algorithm default_policy (cls): the default PolicyGraph class to use - make_policy_optimizer (func): function that returns a PolicyOptimizer - instance given (local_evaluator, remote_evaluators, config) + make_policy_optimizer (func): optional function that returns a + PolicyOptimizer instance given + (local_evaluator, remote_evaluators, config) validate_config (func): optional callback that checks a given config for correctness. 
It may mutate the config as needed. get_policy_class (func): optional callback that takes a config and @@ -64,6 +66,13 @@ def _init(self, config, env_creator): if make_policy_optimizer: self.optimizer = make_policy_optimizer( self.local_evaluator, self.remote_evaluators, config) + else: + optimizer_config = dict( + config["optimizer"], + **{"train_batch_size": config["train_batch_size"]}) + self.optimizer = SyncSamplesOptimizer(self.local_evaluator, + self.remote_evaluators, + **optimizer_config) @override(Trainer) def _train(self): From c7e0320af9cf84772930c2110862a3580e2b196b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:25:43 -0700 Subject: [PATCH 33/39] make default config optional --- python/ray/rllib/agents/pg/pg.py | 2 +- python/ray/rllib/agents/ppo/ppo.py | 2 +- python/ray/rllib/agents/trainer_template.py | 7 ++++--- python/ray/rllib/evaluation/tf_policy_template.py | 9 +++++---- python/ray/rllib/evaluation/torch_policy_template.py | 9 +++++---- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index a28c7f73e3a0..ffbb899d1b9e 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -29,7 +29,7 @@ def get_policy_class(config): PGTrainer = build_trainer( - "PG", + name="PG", default_config=DEFAULT_CONFIG, default_policy=PGTFPolicy, get_policy_class=get_policy_class) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 99a5240b00ea..d3f5abdaa95c 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -156,7 +156,7 @@ def validate_config(config): PPOTrainer = build_trainer( - "PPO", + name="PPO", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, make_policy_optimizer=make_optimizer, diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index d27cf63f43f8..618bc3b30ace 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -9,8 +9,8 @@ @DeveloperAPI def build_trainer(name, - default_config, default_policy, + default_config=None, make_policy_optimizer=None, validate_config=None, get_policy_class=None, @@ -21,8 +21,9 @@ def build_trainer(name, Arguments: name (str): name of the trainer (e.g., "PPO") - default_config (dict): the default config dict of the algorithm default_policy (cls): the default PolicyGraph class to use + default_config (dict): the default config dict of the algorithm, + otherwises uses the Trainer default config make_policy_optimizer (func): optional function that returns a PolicyOptimizer instance given (local_evaluator, remote_evaluators, config) @@ -49,7 +50,7 @@ def build_trainer(name, class trainer_cls(Trainer): _name = name - _default_config = default_config + _default_config = default_config or Trainer.COMMON_CONFIG _policy_graph = default_policy def _init(self, config, env_creator): diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 888e82b316eb..138533d589d0 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -10,8 +10,8 @@ @DeveloperAPI def build_tf_policy(name, - get_default_config, loss_fn, + get_default_config=None, stats_fn=None, grad_stats_fn=None, extra_action_fetches_fn=None, @@ -28,10 +28,10 @@ def build_tf_policy(name, Arguments: name (str): name of the graph (e.g., "PPOPolicy") - 
get_default_config (func): function that returns the default config - to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, and dict of experience tensor placeholders + get_default_config (func): optional function that returns the default + config to merge with any overrides stats_fn (func): optional function that returns a dict of TF fetches given the policy graph and batch input tensors grad_stats_fn (func): optional function that returns a dict of @@ -82,7 +82,8 @@ def __init__(self, action_space, config, existing_inputs=None): - config = dict(get_default_config(), **config) + if get_default_config: + config = dict(get_default_config(), **config) if before_init: before_init(self, obs_space, action_space, config) diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index 374ed7395b2a..a666ed83d752 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -10,8 +10,8 @@ @DeveloperAPI def build_torch_policy(name, - get_default_config, loss_fn, + get_default_config=None, stats_fn=None, postprocess_fn=None, extra_action_out_fn=None, @@ -25,10 +25,10 @@ def build_torch_policy(name, Arguments: name (str): name of the graph (e.g., "PPOPolicy") - get_default_config (func): function that returns the default config - to merge with any overrides loss_fn (func): function that returns a loss tensor the policy graph, and dict of experience tensor placeholders + get_default_config (func): optional function that returns the default + config to merge with any overrides stats_fn (func): optional function that returns a dict of values given the policy graph and batch input tensors postprocess_fn (func): optional experience postprocessing function @@ -68,7 +68,8 @@ class new_base(mixins.pop(), base): class graph_cls(base): def __init__(self, obs_space, action_space, config): - config = dict(get_default_config(), **config) + if get_default_config: + config = dict(get_default_config(), **config) self.config = config if before_init: From 46523a0b9ce1d4ff9be0b2232422d2545e1a18a2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:34:44 -0700 Subject: [PATCH 34/39] add config to optimizer --- .../ray/rllib/agents/a3c/a3c_torch_policy_graph.py | 4 ++-- python/ray/rllib/agents/pg/pg_policy_graph.py | 4 ++-- .../ray/rllib/agents/pg/torch_pg_policy_graph.py | 4 ++-- python/ray/rllib/agents/ppo/appo_policy_graph.py | 7 +++---- python/ray/rllib/evaluation/tf_policy_graph.py | 2 +- python/ray/rllib/evaluation/tf_policy_template.py | 14 +++++++------- .../ray/rllib/evaluation/torch_policy_template.py | 10 +++++----- 7 files changed, 22 insertions(+), 23 deletions(-) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index 15301f85b53e..fa6f857f9eca 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -66,8 +66,8 @@ def apply_grad_clipping(policy): return info -def torch_optimizer(policy): - return torch.optim.Adam(policy.model.parameters(), lr=policy.config["lr"]) +def torch_optimizer(policy, config): + return torch.optim.Adam(policy.model.parameters(), lr=config["lr"]) class ValueNetworkMixin(object): diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index d4f97605e2e5..666c62a13f2f 100644 --- 
a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -28,8 +28,8 @@ def postprocess_advantages(policy, sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def make_optimizer(policy): - return tf.train.AdamOptimizer(learning_rate=policy.config["lr"]) +def make_optimizer(policy, config): + return tf.train.AdamOptimizer(learning_rate=config["lr"]) PGTFPolicy = build_tf_policy( diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index ea280f3f6c08..6a929536a858 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -36,8 +36,8 @@ def pg_loss_stats(policy, batch_tensors): return {"policy_loss": policy.pi_err.item()} -def make_optimizer(policy): - return torch.optim.Adam(policy._model.parameters(), lr=policy.config["lr"]) +def make_optimizer(policy, config): + return torch.optim.Adam(policy._model.parameters(), lr=config["lr"]) PGTorchPolicy = build_torch_policy( diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index e78eac98662b..438618cbf34e 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -342,13 +342,12 @@ def validate_config(policy, obs_space, action_space, config): "Must use `truncate_episodes` batch mode with V-trace." -def choose_optimizer(policy): +def choose_optimizer(policy, config): if policy.config["opt_type"] == "adam": return tf.train.AdamOptimizer(policy.cur_lr) else: - return tf.train.RMSPropOptimizer(policy.cur_lr, policy.config["decay"], - policy.config["momentum"], - policy.config["epsilon"]) + return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"], + config["momentum"], config["epsilon"]) def clip_gradients(policy, optimizer, loss): diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 853a6110eaf1..e20c03fc0d60 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -284,7 +284,7 @@ def extra_compute_grad_fetches(self): @DeveloperAPI def optimizer(self): """TF optimizer to use for policy optimization.""" - return tf.train.AdamOptimizer() + return tf.train.AdamOptimizer(self.config["lr"]) @DeveloperAPI def gradients(self, optimizer, loss): diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index 138533d589d0..b654e6e64f28 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -24,24 +24,24 @@ def build_tf_policy(name, make_action_sampler=None, mixins=None, get_batch_divisibility_req=None): - """Helper function for creating a dynamic tf policy graph at runtime. + """Helper function for creating a dynamic tf policy at runtime. 
Arguments: name (str): name of the graph (e.g., "PPOPolicy") - loss_fn (func): function that returns a loss tensor the policy graph, + loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default config to merge with any overrides stats_fn (func): optional function that returns a dict of - TF fetches given the policy graph and batch input tensors + TF fetches given the policy and batch input tensors grad_stats_fn (func): optional function that returns a dict of - TF fetches given the policy graph and loss gradient tensors + TF fetches given the policy and loss gradient tensors extra_action_fetches_fn (func): optional function that returns - a dict of TF fetches given the policy graph object + a dict of TF fetches given the policy object postprocess_fn (func): optional experience postprocessing function that takes the same args as PolicyGraph.postprocess_trajectory() optimizer_fn (func): optional function that returns a tf.Optimizer - given the policy graph object + given the policy and config gradients_fn (func): optional function that returns a list of gradients given a tf optimizer and loss tensor. If not specified, this defaults to optimizer.compute_gradients(loss) @@ -124,7 +124,7 @@ def postprocess_trajectory(self, @override(TFPolicyGraph) def optimizer(self): if optimizer_fn: - return optimizer_fn(self) + return optimizer_fn(self, self.config) else: return TFPolicyGraph.optimizer(self) diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index a666ed83d752..fdc0cf5bca0e 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -21,16 +21,16 @@ def build_torch_policy(name, after_init=None, make_model_and_action_dist=None, mixins=None): - """Helper function for creating a torch policy graph at runtime. + """Helper function for creating a torch policy at runtime. 
Arguments: name (str): name of the graph (e.g., "PPOPolicy") - loss_fn (func): function that returns a loss tensor the policy graph, + loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default config to merge with any overrides stats_fn (func): optional function that returns a dict of - values given the policy graph and batch input tensors + values given the policy and batch input tensors postprocess_fn (func): optional experience postprocessing function that takes the same args as PolicyGraph.postprocess_trajectory() extra_action_out_fn (func): optional function that returns @@ -38,7 +38,7 @@ def build_torch_policy(name, extra_grad_process_fn (func): optional function that is called after gradients are computed and returns processing info optimizer_fn (func): optional function that returns a torch optimizer - given the policy graph object + given the policy and config before_init (func): optional function to run at the beginning of __init__ that takes the same arguments as __init__ after_init (func): optional function to run at the end of __init__ @@ -117,7 +117,7 @@ def extra_action_out(self, model_out): @override(TorchPolicyGraph) def optimizer(self): if optimizer_fn: - return optimizer_fn(self) + return optimizer_fn(self, self.config) else: return TorchPolicyGraph.optimizer(self) From 8a48029fb18313b1ac9ec5927dfd9eda5a337d80 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:35:41 -0700 Subject: [PATCH 35/39] use lr by default in optimizer --- python/ray/rllib/agents/pg/pg_policy_graph.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 666c62a13f2f..54fcd041cc72 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -28,13 +28,8 @@ def postprocess_advantages(policy, sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def make_optimizer(policy, config): - return tf.train.AdamOptimizer(learning_rate=config["lr"]) - - PGTFPolicy = build_tf_policy( name="PGTFPolicy", get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, postprocess_fn=postprocess_advantages, - loss_fn=policy_gradient_loss, - optimizer_fn=make_optimizer) + loss_fn=policy_gradient_loss) From 65db45e0037fb6db64b02e30b330f55dcc3e7aa0 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 17:36:59 -0700 Subject: [PATCH 36/39] update --- python/ray/rllib/evaluation/tf_policy_graph.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index e20c03fc0d60..b921e6cfb0d1 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -284,7 +284,10 @@ def extra_compute_grad_fetches(self): @DeveloperAPI def optimizer(self): """TF optimizer to use for policy optimization.""" - return tf.train.AdamOptimizer(self.config["lr"]) + if hasattr(self, "config"): + return tf.train.AdamOptimizer(self.config["lr"]) + else: + return tf.train.AdamOptimizer() @DeveloperAPI def gradients(self, optimizer, loss): From 4830ab6f21527d08c2e4211a81853d61c2ed7077 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 18:02:29 -0700 Subject: [PATCH 37/39] comments --- python/ray/rllib/agents/ppo/appo_policy_graph.py | 4 +++- 
python/ray/rllib/evaluation/tf_policy_template.py | 8 ++++---- python/ray/rllib/evaluation/torch_policy_template.py | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index 438618cbf34e..5aa76913194f 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -164,11 +164,13 @@ def __init__(self, def _make_time_major(policy, tensor, drop_last=False): """Swaps batch and trajectory axis. - Args: + + Arguments: policy: Policy reference tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. + Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. diff --git a/python/ray/rllib/evaluation/tf_policy_template.py b/python/ray/rllib/evaluation/tf_policy_template.py index b654e6e64f28..b2549e973a65 100644 --- a/python/ray/rllib/evaluation/tf_policy_template.py +++ b/python/ray/rllib/evaluation/tf_policy_template.py @@ -46,11 +46,11 @@ def build_tf_policy(name, given a tf optimizer and loss tensor. If not specified, this defaults to optimizer.compute_gradients(loss) before_init (func): optional function to run at the beginning of - __init__ that takes the same arguments as __init__ + policy init that takes the same arguments as the policy constructor before_loss_init (func): optional function to run prior to loss - init that takes the same arguments as __init__ - after_init (func): optional function to run at the end of __init__ - that takes the same arguments as __init__ + init that takes the same arguments as the policy constructor + after_init (func): optional function to run at the end of policy init + that takes the same arguments as the policy constructor make_action_sampler (func): optional function that returns a tuple of action and action prob tensors. The function takes (policy, input_dict, obs_space, action_space, config) as its diff --git a/python/ray/rllib/evaluation/torch_policy_template.py b/python/ray/rllib/evaluation/torch_policy_template.py index fdc0cf5bca0e..7f65c2b963b8 100644 --- a/python/ray/rllib/evaluation/torch_policy_template.py +++ b/python/ray/rllib/evaluation/torch_policy_template.py @@ -40,11 +40,11 @@ def build_torch_policy(name, optimizer_fn (func): optional function that returns a torch optimizer given the policy and config before_init (func): optional function to run at the beginning of - __init__ that takes the same arguments as __init__ - after_init (func): optional function to run at the end of __init__ - that takes the same arguments as __init__ + policy init that takes the same arguments as the policy constructor + after_init (func): optional function to run at the end of policy init + that takes the same arguments as the policy constructor make_model_and_action_dist (func): optional func that takes the same - arguments as __init__ and returns a tuple of model instance and + arguments as policy init and returns a tuple of model instance and torch action distribution class. If not specified, the default model and action dist from the catalog will be used mixins (list): list of any class mixins for the returned policy class. 
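Patches 30-37 settle the callback signatures used by the policy templates: loss_fn and stats_fn take (policy, batch_tensors), optimizer_fn takes (policy, config), and postprocess_fn matches PolicyGraph.postprocess_trajectory(). As a quick orientation aid, and not part of any commit in this series, the sketch below shows what a complete policy assembled from those callbacks looks like at this point in the branch. It mirrors the PG policy used throughout the series; the class name MyPGTFPolicy is illustrative (the template requires the name to end in "TFPolicy").

# Minimal sketch of a policy built with build_tf_policy as of patch 37.
# Mirrors the PG policy in this series; "MyPGTFPolicy" is an illustrative name.
import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
    Postprocessing
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.evaluation.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def policy_gradient_loss(policy, batch_tensors):
    # The basic policy gradient loss: -E[logp(a|s) * advantage]. The action
    # distribution is built by the dynamic policy graph before loss init.
    actions = batch_tensors[SampleBatch.ACTIONS]
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)


def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    # Adds the Postprocessing.ADVANTAGES column consumed by the loss above.
    return compute_advantages(
        sample_batch, 0.0, policy.config["gamma"], use_gae=False)


MyPGTFPolicy = build_tf_policy(
    name="MyPGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    postprocess_fn=postprocess_advantages,
    loss_fn=policy_gradient_loss)

With no optimizer_fn given, the policy falls back to the Adam optimizer that tf_policy_graph.py builds from config["lr"] (see patches 34-36 below).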
From 816d590e4dad5f9b7c683cfe6de27ce8540ce353 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 18:36:21 -0700 Subject: [PATCH 38/39] remove optimizer --- python/ray/rllib/agents/pg/torch_pg_policy_graph.py | 9 +-------- python/ray/rllib/evaluation/torch_policy_graph.py | 6 +++++- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py index 6a929536a858..cda1b6eb5057 100644 --- a/python/ray/rllib/agents/pg/torch_pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/torch_pg_policy_graph.py @@ -2,8 +2,6 @@ from __future__ import division from __future__ import print_function -import torch - import ray from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing @@ -36,14 +34,9 @@ def pg_loss_stats(policy, batch_tensors): return {"policy_loss": policy.pi_err.item()} -def make_optimizer(policy, config): - return torch.optim.Adam(policy._model.parameters(), lr=config["lr"]) - - PGTorchPolicy = build_torch_policy( name="PGTorchPolicy", get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, loss_fn=pg_torch_loss, stats_fn=pg_loss_stats, - postprocess_fn=postprocess_advantages, - optimizer_fn=make_optimizer) + postprocess_fn=postprocess_advantages) diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index 3a1464606612..ccf1b9eeb81d 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -161,7 +161,11 @@ def extra_grad_info(self, batch_tensors): def optimizer(self): """Custom PyTorch optimizer to use.""" - return torch.optim.Adam(self._model.parameters()) + if hasattr(self, "config"): + return torch.optim.Adam( + self._model.parameters(), lr=self.config["lr"]) + else: + return torch.optim.Adam(self._model.parameters()) def _lazy_tensor_dict(self, postprocessed_batch): batch_tensors = UsageTrackingDict(postprocessed_batch) From 65173a57b6af328616a832e47f06a24b8586d5ef Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 17 May 2019 18:48:05 -0700 Subject: [PATCH 39/39] fix tuple actions support in dynamic tf graph --- python/ray/rllib/evaluation/dynamic_tf_policy_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py index 64f8f747d984..73e08fcf9093 100644 --- a/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py +++ b/python/ray/rllib/evaluation/dynamic_tf_policy_graph.py @@ -220,7 +220,7 @@ def fake_array(tensor): SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), SampleBatch.CUR_OBS: fake_array(self._obs_input), SampleBatch.NEXT_OBS: fake_array(self._obs_input), - SampleBatch.ACTIONS: fake_array(self._sampler), + SampleBatch.ACTIONS: fake_array(self._prev_action_input), SampleBatch.REWARDS: np.array([0], dtype=np.float32), SampleBatch.DONES: np.array([False], dtype=np.bool), }
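To round off the series: after patches 32 and 33, build_trainer can be called with only a name and a default policy; when default_config and make_policy_optimizer are omitted, the generated trainer falls back to Trainer.COMMON_CONFIG and a SyncSamplesOptimizer built from config["optimizer"] and config["train_batch_size"]. The sketch below is illustrative and not part of any patch; the trainer name "MyPG" and the CartPole usage are assumptions for the example.

# Illustrative minimal trainer built on trainer_template.py as of patch 33.
from ray.rllib.agents.pg.pg_policy_graph import PGTFPolicy
from ray.rllib.agents.trainer_template import build_trainer

# default_config and make_policy_optimizer are omitted on purpose: the class
# then uses Trainer.COMMON_CONFIG and a SyncSamplesOptimizer by default.
MyPGTrainer = build_trainer(
    name="MyPG",
    default_policy=PGTFPolicy)

# Example usage (assumes ray.init() has been called):
#   trainer = MyPGTrainer(env="CartPole-v0")
#   print(trainer.train())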