From 21c4b8286a60ba87b4de2a8785f1f1fec41ddb6a Mon Sep 17 00:00:00 2001 From: Piotr Kozakowski Date: Mon, 3 Dec 2018 19:09:07 +0100 Subject: [PATCH 1/3] Use T2TModel for policies --- tensor2tensor/models/research/rl.py | 158 +++++++++++++++++----------- tensor2tensor/rl/ppo.py | 7 +- tensor2tensor/rl/ppo_learner.py | 23 ++-- 3 files changed, 119 insertions(+), 69 deletions(-) diff --git a/tensor2tensor/models/research/rl.py b/tensor2tensor/models/research/rl.py index 62479b5fb..27dd020a6 100644 --- a/tensor2tensor/models/research/rl.py +++ b/tensor2tensor/models/research/rl.py @@ -27,6 +27,7 @@ from tensor2tensor.rl.envs.simulated_batch_env import SimulatedBatchEnv from tensor2tensor.rl.envs.simulated_batch_gym_env import SimulatedBatchGymEnv from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model import tensorflow as tf import tensorflow_probability as tfp @@ -77,14 +78,14 @@ def basic_policy_parameters(): @registry.register_hparams def ppo_discrete_action_base(): hparams = ppo_base_v1() - hparams.add_hparam("policy_network", feed_forward_categorical_fun) + hparams.add_hparam("policy_network", "feed_forward_categorical_policy") return hparams @registry.register_hparams def discrete_random_action_base(): hparams = common_hparams.basic_params1() - hparams.add_hparam("policy_network", random_policy_fun) + hparams.add_hparam("policy_network", "random_policy_policy") return hparams @@ -100,7 +101,7 @@ def ppo_atari_base(): hparams.value_loss_coef = 1 hparams.optimization_epochs = 3 hparams.epochs_num = 1000 - hparams.policy_network = feed_forward_cnn_small_categorical_fun + hparams.policy_network = "feed_forward_cnn_small_categorical_policy" hparams.clipping_coef = 0.2 hparams.optimization_batch_size = 20 hparams.max_gradients_norm = 0.5 @@ -157,15 +158,28 @@ def get_policy(observations, hparams, action_space): """Get a policy network. Args: - observations: Tensor with observations + observations hparams: parameters action_space: action space Returns: - Tensor with policy and value function output + Tuple (action logits, value). """ - policy_network_lambda = hparams.policy_network - return policy_network_lambda(action_space, hparams, observations) + if not isinstance(action_space, gym.spaces.Discrete): + raise ValueError("Expecting discrete action space.") + + model = registry.model(hparams.policy_network)( + hparams, tf.estimator.ModeKeys.TRAIN + ) + obs_shape = common_layers.shape_list(observations) + features = { + "inputs": observations, + "target_action": tf.zeros(obs_shape[:2] + [action_space.n]), + "target_value": tf.zeros(obs_shape[:2]) + } + with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): + (targets, _) = model(features) + return (targets["target_action"], targets["target_value"]) @registry.register_hparams @@ -173,7 +187,7 @@ def ppo_pong_ae_base(): """Pong autoencoder base parameters.""" hparams = ppo_original_params() hparams.learning_rate = 1e-4 - hparams.network = dense_bitwise_categorical_fun + hparams.network = "dense_bitwise_categorical_policy" return hparams @@ -250,10 +264,18 @@ def mfrl_tiny(): return hparams +class DiscretePolicyBase(t2t_model.T2TModel): + + @staticmethod + def _get_num_actions(features): + return common_layers.shape_list(features["target_action"])[2] + + NetworkOutput = collections.namedtuple( "NetworkOutput", "policy, value, action_postprocessing") +# TODO(koz4k): Translate it to T2TModel or remove. 
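+# A minimal usage sketch of the new get_policy contract. Everything concrete
+# below (the "ppo_atari_base" hparams set, the 6-action space, the observation
+# shape) is illustrative only; ppo.py and ppo_learner.py consume get_policy in
+# the same way.
+def _example_get_policy_usage():  # illustration only, never called
+  hparams = registry.hparams("ppo_atari_base")  # policy_network is a registry name now
+  action_space = gym.spaces.Discrete(6)
+  # [batch, time] + frame shape; get_policy feeds the model dummy zero targets
+  # of shape [batch, time, num_actions] and [batch, time] so that body() can
+  # infer the number of actions from the features.
+  observations = tf.zeros([1, 16, 84, 84, 4])
+  (logits, value) = get_policy(observations, hparams, action_space)
+  policy = tfp.distributions.Categorical(logits=logits)
+  return policy.sample(), value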
def feed_forward_gaussian_fun(action_space, config, observations): """Feed-forward Gaussian.""" if not isinstance(action_space, gym.spaces.box.Box): @@ -303,36 +325,40 @@ def clip_logits(logits, config): return logits -def feed_forward_categorical_fun(action_space, config, observations): +@registry.register_model +class FeedForwardCategoricalPolicy(DiscretePolicyBase): """Feed-forward categorical.""" - if not isinstance(action_space, gym.spaces.Discrete): - raise ValueError("Expecting discrete action space.") - flat_observations = tf.reshape(observations, [ - tf.shape(observations)[0], tf.shape(observations)[1], - functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)]) - with tf.variable_scope("network_parameters"): + + def body(self, features): + observations = features["inputs"] + flat_observations = tf.reshape(observations, [ + tf.shape(observations)[0], tf.shape(observations)[1], + functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)]) with tf.variable_scope("policy"): x = flat_observations - for size in config.policy_layers: + for size in self.hparams.policy_layers: x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) - logits = tf.contrib.layers.fully_connected(x, action_space.n, - activation_fn=None) + logits = tf.contrib.layers.fully_connected( + x, self._get_num_actions(features), activation_fn=None + ) with tf.variable_scope("value"): x = flat_observations - for size in config.value_layers: + for size in self.hparams.value_layers: x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0] - logits = clip_logits(logits, config) - policy = tfp.distributions.Categorical(logits=logits) - return NetworkOutput(policy, value, lambda a: a) + logits = clip_logits(logits, self.hparams) + return {"target_action": logits, "target_value": value} -def feed_forward_cnn_small_categorical_fun(action_space, config, observations): +@registry.register_model +class FeedForwardCnnSmallCategoricalPolicy(DiscretePolicyBase): """Small cnn network with categorical output.""" - obs_shape = common_layers.shape_list(observations) - x = tf.reshape(observations, [-1] + obs_shape[2:]) - with tf.variable_scope("network_parameters"): - dropout = getattr(config, "dropout_ppo", 0.0) + + def body(self, features): + observations = features["inputs"] + obs_shape = common_layers.shape_list(observations) + x = tf.reshape(observations, [-1] + obs_shape[2:]) + dropout = getattr(self.hparams, "dropout_ppo", 0.0) with tf.variable_scope("feed_forward_cnn_small"): x = tf.to_float(x) / 255.0 x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2], @@ -346,23 +372,25 @@ def feed_forward_cnn_small_categorical_fun(action_space, config, observations): flat_x = tf.nn.dropout(flat_x, keep_prob=1.0 - dropout) x = tf.contrib.layers.fully_connected(flat_x, 128, tf.nn.relu) - logits = tf.contrib.layers.fully_connected(x, action_space.n, - activation_fn=None) - logits = clip_logits(logits, config) + logits = tf.contrib.layers.fully_connected( + x, self._get_num_actions(features), activation_fn=None + ) + logits = clip_logits(logits, self.hparams) value = tf.contrib.layers.fully_connected( x, 1, activation_fn=None)[..., 0] - policy = tfp.distributions.Categorical(logits=logits) - return NetworkOutput(policy, value, lambda a: a) + return {"target_action": logits, "target_value": value} -def feed_forward_cnn_small_categorical_fun_new( - action_space, config, observations): +@registry.register_model +class 
FeedForwardCnnSmallCategoricalPolicyNew(DiscretePolicyBase): """Small cnn network with categorical output.""" - obs_shape = common_layers.shape_list(observations) - x = tf.reshape(observations, [-1] + obs_shape[2:]) - with tf.variable_scope("network_parameters"): - dropout = getattr(config, "dropout_ppo", 0.0) + + def body(self, features): + observations = features["inputs"] + obs_shape = common_layers.shape_list(observations) + x = tf.reshape(observations, [-1] + obs_shape[2:]) + dropout = getattr(self.hparams, "dropout_ppo", 0.0) with tf.variable_scope("feed_forward_cnn_small"): x = tf.to_float(x) / 255.0 x = tf.nn.dropout(x, keep_prob=1.0 - dropout) @@ -384,22 +412,23 @@ def feed_forward_cnn_small_categorical_fun_new( flat_x = tf.nn.dropout(flat_x, keep_prob=1.0 - dropout) x = tf.layers.dense(flat_x, 128, activation=tf.nn.relu, name="dense1") - logits = tf.layers.dense(x, action_space.n, name="dense2") - logits = clip_logits(logits, config) + logits = tf.layers.dense( + x, self._get_num_actions(features), name="dense2" + ) + logits = clip_logits(logits, self.hparams) value = tf.layers.dense(x, 1, name="value")[..., 0] - policy = tfp.distributions.Categorical(logits=logits) + return {"target_action": logits, "target_value": value} - return NetworkOutput(policy, value, lambda a: a) - -def dense_bitwise_categorical_fun(action_space, config, observations): +@registry.register_model +class DenseBitwiseCategoricalPolicy(DiscretePolicyBase): """Dense network with bitwise input and categorical output.""" - del config - obs_shape = common_layers.shape_list(observations) - x = tf.reshape(observations, [-1] + obs_shape[2:]) - with tf.variable_scope("network_parameters"): + def body(self, features): + observations = features["inputs"] + obs_shape = common_layers.shape_list(observations) + x = tf.reshape(observations, [-1] + obs_shape[2:]) with tf.variable_scope("dense_bitwise"): x = discretization.int_to_bit_embed(x, 8, 32) flat_x = tf.reshape( @@ -409,22 +438,33 @@ def dense_bitwise_categorical_fun(action_space, config, observations): x = tf.contrib.layers.fully_connected(flat_x, 256, tf.nn.relu) x = tf.contrib.layers.fully_connected(flat_x, 128, tf.nn.relu) - logits = tf.contrib.layers.fully_connected(x, action_space.n, - activation_fn=None) + logits = tf.contrib.layers.fully_connected( + x, self._get_num_actions(features), activation_fn=None + ) value = tf.contrib.layers.fully_connected( x, 1, activation_fn=None)[..., 0] - policy = tfp.distributions.Categorical(logits=logits) - return NetworkOutput(policy, value, lambda a: a) + return {"target_action": logits, "target_value": value} -def random_policy_fun(action_space, unused_config, observations): +@registry.register_model +class RandomPolicy(DiscretePolicyBase): """Random policy with categorical output.""" - obs_shape = observations.shape.as_list() - with tf.variable_scope("network_parameters"): + + def body(self, features): + observations = features["inputs"] + obs_shape = observations.shape.as_list() + #logits = tf.constant( + # 1. / float(self.action_space.n), + # shape=[1, obs_shape[0] * obs_shape[1], self.action_space.n] + #) + logits = tf.constant( + 1. / float(self.action_space.n), + shape=(obs_shape[:2] + [self._get_num_actions(features)]) + ) + #policy = tfp.distributions.Categorical( + # probs=[[[1. / float(self.action_space.n)] * self.action_space.n] * + # (obs_shape[0] * obs_shape[1])]) value = tf.zeros(obs_shape[:2]) - policy = tfp.distributions.Categorical( - probs=[[[1. 
/ float(action_space.n)] * action_space.n] * - (obs_shape[0] * obs_shape[1])]) - return NetworkOutput(policy, value, lambda a: a) + return {"target_action": logits, "target_value": value} diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index 00d4c37e9..8c6fe58fe 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -22,8 +22,10 @@ from __future__ import print_function from tensor2tensor.models.research.rl import get_policy +from tensor2tensor.utils import registry import tensorflow as tf +import tensorflow_probability as tfp def get_optimiser(config): @@ -35,7 +37,10 @@ def get_optimiser(config): def define_ppo_step(data_points, optimizer, hparams, action_space): """Define ppo step.""" observation, action, discounted_reward, norm_advantage, old_pdf = data_points - new_policy_dist, new_value, _ = get_policy(observation, hparams, action_space) + + (logits, new_value) = get_policy(observation, hparams, action_space) + new_policy_dist = tfp.distributions.Categorical(logits=logits) + new_pdf = new_policy_dist.prob(action) ratio = new_pdf / old_pdf diff --git a/tensor2tensor/rl/ppo_learner.py b/tensor2tensor/rl/ppo_learner.py index 2a6fa5603..ced0b4477 100644 --- a/tensor2tensor/rl/ppo_learner.py +++ b/tensor2tensor/rl/ppo_learner.py @@ -27,9 +27,11 @@ from tensor2tensor.rl.envs.tf_atari_wrappers import StackWrapper from tensor2tensor.rl.envs.tf_atari_wrappers import WrapperBase from tensor2tensor.rl.policy_learner import PolicyLearner +from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib import tensorflow as tf +import tensorflow_probability as tfp class PPOLearner(PolicyLearner): @@ -111,7 +113,8 @@ def evaluate(self, env_fn, hparams, stochastic): force_beginning_resets=False, policy_to_actions_lambda=policy_to_actions_lambda) model_saver = tf.train.Saver( - tf.global_variables(".*network_parameters.*")) + tf.global_variables(hparams.policy_network + "/.*") + ) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) @@ -164,7 +167,9 @@ def _run_train(ppo_hparams, summary_writer = tf.summary.FileWriter( event_dir, graph=tf.get_default_graph(), flush_secs=60) - model_saver = tf.train.Saver(tf.global_variables(".*network_parameters.*")) + model_saver = tf.train.Saver( + tf.global_variables(ppo_hparams.policy_network + "/.*") + ) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) @@ -349,16 +354,16 @@ def step(index, scores_sum, scores_num): def env_step(arg1, arg2, arg3): # pylint: disable=unused-argument """Step of the environment.""" - actor_critic = get_policy( - tf.expand_dims(obs_copy, 0), ppo_hparams, batch_env.action_space) - policy = actor_critic.policy + + (logits, value_function) = get_policy( + tf.expand_dims(obs_copy, 0), ppo_hparams, batch_env.action_space + ) + policy = tfp.distributions.Categorical(logits=logits) action = policy_to_actions_lambda(policy) - postprocessed_action = actor_critic.action_postprocessing(action) - reward, done = batch_env.simulate(postprocessed_action[0, ...]) + reward, done = batch_env.simulate(action[0, ...]) - pdf = policy.prob(action)[0] - value_function = actor_critic.value[0] + pdf = policy.prob(action) pdf = tf.reshape(pdf, shape=(num_agents,)) value_function = tf.reshape(value_function, shape=(num_agents,)) done = tf.reshape(done, shape=(num_agents,)) From 460868108d2661a836b9fbed33b1b3a53a3cf38c Mon Sep 17 00:00:00 2001 From: Piotr Kozakowski Date: Tue, 4 Dec 2018 16:11:03 +0100 Subject: [PATCH 2/3] Implement sampling with temperature from policy --- 
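Note: a rough sketch of the sampling primitive this patch standardizes on (the
logits values below are made up); temperature 0.0 degenerates to greedy argmax,
which replaces the old policy.mode() evaluation path, while 1.0 reproduces
plain sampling from the policy:

    import tensorflow as tf
    from tensor2tensor.layers import common_layers

    logits = tf.constant([[2.0, 0.5, -1.0]])
    greedy = common_layers.sample_with_temperature(logits, 0.0)   # argmax, deterministic
    sampled = common_layers.sample_with_temperature(logits, 1.0)  # sample from softmax(logits)
    spread = common_layers.sample_with_temperature(logits, 2.0)   # flatter distribution, more exploration
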
tensor2tensor/rl/dopamine_connector.py | 16 +++++-- tensor2tensor/rl/policy_learner.py | 14 ++++-- tensor2tensor/rl/ppo.py | 1 - tensor2tensor/rl/ppo_learner.py | 44 +++++++++++-------- tensor2tensor/rl/rl_utils.py | 24 +++++----- tensor2tensor/rl/trainer_model_based.py | 12 +++-- .../rl/trainer_model_based_params.py | 11 ++++- 7 files changed, 81 insertions(+), 41 deletions(-) diff --git a/tensor2tensor/rl/dopamine_connector.py b/tensor2tensor/rl/dopamine_connector.py index 4843d45ae..83d50a66b 100644 --- a/tensor2tensor/rl/dopamine_connector.py +++ b/tensor2tensor/rl/dopamine_connector.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function +from copy import copy from dopamine.agents.dqn import dqn_agent from dopamine.agents.dqn.dqn_agent import NATURE_DQN_OBSERVATION_SHAPE from dopamine.agents.dqn.dqn_agent import NATURE_DQN_STACK_SIZE @@ -285,6 +286,7 @@ def train(self, simulated, save_continuously, epoch, + sampling_temp=1.0, num_env_steps=None, env_step_multiplier=1, eval_env_fn=None, @@ -294,6 +296,11 @@ def train(self, if num_env_steps is None: num_env_steps = hparams.num_frames + hparams = copy(hparams) + hparams.set_hparams( + "agent_epsilon_eval", min(hparams.agent_epsilon_eval * sampling_temp, 1) + ) + target_iterations, training_steps_per_iteration = \ self._target_iteractions_and_steps( num_env_steps=num_env_steps * env_step_multiplier, @@ -307,11 +314,14 @@ def train(self, self.completed_iterations = target_iterations - def evaluate(self, env_fn, hparams, stochastic): + def evaluate(self, env_fn, hparams, sampling_temp): target_iterations = 0 training_steps_per_iteration = 0 - if not stochastic: - hparams.set_hparam("agent_epsilon_eval", 0.) + + hparams = copy(hparams) + hparams.set_hparams( + "agent_epsilon_eval", min(hparams.agent_epsilon_eval * sampling_temp, 1) + ) create_environment_fn = get_create_env_fun( env_fn, time_limit=hparams.time_limit) diff --git a/tensor2tensor/rl/policy_learner.py b/tensor2tensor/rl/policy_learner.py index 804b6b2f8..8368b714d 100644 --- a/tensor2tensor/rl/policy_learner.py +++ b/tensor2tensor/rl/policy_learner.py @@ -29,13 +29,21 @@ def __init__(self, frame_stack_size, base_event_dir, agent_model_dir): self.agent_model_dir = agent_model_dir def train( - self, env_fn, hparams, simulated, save_continuously, epoch, - num_env_steps=None, env_step_multiplier=1, eval_env_fn=None, + self, + env_fn, + hparams, + simulated, + save_continuously, + epoch, + sampling_temp=1.0, + num_env_steps=None, + env_step_multiplier=1, + eval_env_fn=None, report_fn=None ): # TODO(konradczechowski): pass name_scope instead of epoch? 
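+    # sampling_temp controls how actions are drawn while collecting
+    # experience: 1.0 samples from the policy unchanged, 0.0 is greedy.
+    # The PPO learner passes it to common_layers.sample_with_temperature;
+    # the Dopamine (DQN) connector approximates it by scaling
+    # agent_epsilon_eval.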
# TODO(konradczechowski): move 'simulated' to batch_env raise NotImplementedError() - def evaluate(self, env_fn, hparams, stochastic): + def evaluate(self, env_fn, hparams, sampling_temp): raise NotImplementedError() diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index 8c6fe58fe..2a7d22af8 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -22,7 +22,6 @@ from __future__ import print_function from tensor2tensor.models.research.rl import get_policy -from tensor2tensor.utils import registry import tensorflow as tf import tensorflow_probability as tfp diff --git a/tensor2tensor/rl/ppo_learner.py b/tensor2tensor/rl/ppo_learner.py index ced0b4477..35310b0a8 100644 --- a/tensor2tensor/rl/ppo_learner.py +++ b/tensor2tensor/rl/ppo_learner.py @@ -22,12 +22,12 @@ import math import os +from tensor2tensor.layers import common_layers from tensor2tensor.models.research.rl import get_policy from tensor2tensor.rl import ppo from tensor2tensor.rl.envs.tf_atari_wrappers import StackWrapper from tensor2tensor.rl.envs.tf_atari_wrappers import WrapperBase from tensor2tensor.rl.policy_learner import PolicyLearner -from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib import tensorflow as tf @@ -47,10 +47,14 @@ def train(self, simulated, save_continuously, epoch, + sampling_temp=1.0, num_env_steps=None, env_step_multiplier=1, eval_env_fn=None, report_fn=None): + assert sampling_temp == 1.0 or hparams.learning_rate == 0.0, \ + "Sampling with non-1 temperature does not make sense during training." + if not save_continuously: # We do not save model, as that resets frames that we need at restarts. # But we need to save at the last step, so we set it very high. @@ -73,6 +77,7 @@ def train(self, env, hparams, eval_env_fn, + sampling_temp, frame_stack_size=self.frame_stack_size, force_beginning_resets=simulated)) @@ -95,12 +100,7 @@ def train(self, initializers, report_fn=report_fn) - def evaluate(self, env_fn, hparams, stochastic): - if stochastic: - policy_to_actions_lambda = lambda policy: policy.sample() - else: - policy_to_actions_lambda = lambda policy: policy.mode() - + def evaluate(self, env_fn, hparams, sampling_temp): with tf.Graph().as_default(): with tf.name_scope("rl_eval"): eval_env = env_fn(in_graph=True) @@ -111,7 +111,8 @@ def evaluate(self, env_fn, hparams, stochastic): eval_phase=True, frame_stack_size=self.frame_stack_size, force_beginning_resets=False, - policy_to_actions_lambda=policy_to_actions_lambda) + sampling_temp=sampling_temp, + ) model_saver = tf.train.Saver( tf.global_variables(hparams.policy_network + "/.*") ) @@ -124,7 +125,13 @@ def evaluate(self, env_fn, hparams, stochastic): sess.run(collect_memory) -def _define_train(train_env, ppo_hparams, eval_env_fn=None, **collect_kwargs): +def _define_train( + train_env, + ppo_hparams, + eval_env_fn=None, + sampling_temp=1.0, + **collect_kwargs +): """Define the training setup.""" memory, collect_summary, train_initialization = ( _define_collect( @@ -132,13 +139,14 @@ def _define_train(train_env, ppo_hparams, eval_env_fn=None, **collect_kwargs): ppo_hparams, "ppo_train", eval_phase=False, - policy_to_actions_lambda=(lambda policy: policy.sample()), + sampling_temp=sampling_temp, **collect_kwargs)) ppo_summary = ppo.define_ppo_epoch( memory, ppo_hparams, train_env.action_space, train_env.batch_size) train_summary = tf.summary.merge([collect_summary, ppo_summary]) if ppo_hparams.eval_every_epochs: + # TODO(koz4k): Do we need this at all? 
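+    # In-graph eval during training collects greedily (sampling_temp=0.0),
+    # matching the previous policy.mode() behaviour.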
assert eval_env_fn is not None eval_env = eval_env_fn(in_graph=True) (_, eval_collect_summary, eval_initialization) = ( @@ -147,7 +155,7 @@ def _define_train(train_env, ppo_hparams, eval_env_fn=None, **collect_kwargs): ppo_hparams, "ppo_eval", eval_phase=True, - policy_to_actions_lambda=(lambda policy: policy.mode()), + sampling_temp=0.0, **collect_kwargs)) return (train_summary, eval_collect_summary, (train_initialization, eval_initialization)) @@ -270,17 +278,17 @@ def simulate(self, action): def _define_collect(batch_env, ppo_hparams, scope, frame_stack_size, eval_phase, - policy_to_actions_lambda, force_beginning_resets): + sampling_temp, force_beginning_resets): """Collect trajectories. Args: batch_env: Batch environment. ppo_hparams: PPO hparams, defined in tensor2tensor.models.research.rl. scope: var scope. - frame_stack_size: TODO(koz4k): Write docstring. + frame_stack_size: Number of last observations to feed into the policy. eval_phase: TODO(koz4k): Write docstring. - policy_to_actions_lambda: TODO(koz4k): Write docstring. - force_beginning_resets: TODO(koz4k): Write docstring. + sampling_temp: Sampling temperature for the policy. + force_beginning_resets: Whether to reset at the beginning of each episode. Returns: Returns memory (observations, rewards, dones, actions, @@ -358,12 +366,12 @@ def env_step(arg1, arg2, arg3): # pylint: disable=unused-argument (logits, value_function) = get_policy( tf.expand_dims(obs_copy, 0), ppo_hparams, batch_env.action_space ) - policy = tfp.distributions.Categorical(logits=logits) - action = policy_to_actions_lambda(policy) + action = common_layers.sample_with_temperature(logits, sampling_temp) + action = tf.cast(action, tf.int32) reward, done = batch_env.simulate(action[0, ...]) - pdf = policy.prob(action) + pdf = tfp.distributions.Categorical(logits=logits).prob(action) pdf = tf.reshape(pdf, shape=(num_agents,)) value_function = tf.reshape(value_function, shape=(num_agents,)) done = tf.reshape(done, shape=(num_agents,)) diff --git a/tensor2tensor/rl/rl_utils.py b/tensor2tensor/rl/rl_utils.py index 98a1f17e4..f4acecd73 100644 --- a/tensor2tensor/rl/rl_utils.py +++ b/tensor2tensor/rl/rl_utils.py @@ -50,13 +50,15 @@ def compute_mean_reward(rollouts, clipped): return mean_rewards -def get_metric_name(stochastic, max_num_noops, clipped): - return "mean_reward/eval/stochastic_{}_max_noops_{}_{}".format( - stochastic, max_num_noops, "clipped" if clipped else "unclipped") +def get_metric_name(sampling_temp, max_num_noops, clipped): + return "mean_reward/eval/sampling_temp_{}_max_noops_{}_{}".format( + sampling_temp, max_num_noops, "clipped" if clipped else "unclipped" + ) -def evaluate_single_config(hparams, stochastic, max_num_noops, - agent_model_dir): +def evaluate_single_config( + hparams, sampling_temp, max_num_noops, agent_model_dir +): """Evaluate the PPO agent in the real environment.""" eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params) env = setup_env( @@ -68,7 +70,7 @@ def evaluate_single_config(hparams, stochastic, max_num_noops, hparams.frame_stack_size, base_event_dir=None, agent_model_dir=agent_model_dir ) - learner.evaluate(env_fn, eval_hparams, stochastic) + learner.evaluate(env_fn, eval_hparams, sampling_temp) rollouts = env.current_epoch_rollouts() env.close() @@ -80,15 +82,15 @@ def evaluate_single_config(hparams, stochastic, max_num_noops, def evaluate_all_configs(hparams, agent_model_dir): """Evaluate the agent with multiple eval configurations.""" metrics = {} - # Iterate over all combinations of picking actions 
by sampling/mode and - # whether to do initial no-ops. - for stochastic in (True, False): + # Iterate over all combinations of sampling temperatures and whether to do + # initial no-ops. + for sampling_temp in hparams.eval_sampling_temps: for max_num_noops in (hparams.eval_max_num_noops, 0): scores = evaluate_single_config( - hparams, stochastic, max_num_noops, agent_model_dir + hparams, sampling_temp, max_num_noops, agent_model_dir ) for (score, clipped) in zip(scores, (True, False)): - metric_name = get_metric_name(stochastic, max_num_noops, clipped) + metric_name = get_metric_name(sampling_temp, max_num_noops, clipped) metrics[metric_name] = score return metrics diff --git a/tensor2tensor/rl/trainer_model_based.py b/tensor2tensor/rl/trainer_model_based.py index f1f6bcfb3..3b8aad5d0 100644 --- a/tensor2tensor/rl/trainer_model_based.py +++ b/tensor2tensor/rl/trainer_model_based.py @@ -215,8 +215,13 @@ def train_agent_real_env(env, learner, hparams, epoch): env_fn = rl.make_real_env_fn(env) num_env_steps = real_env_step_increment(hparams) learner.train( - env_fn, train_hparams, simulated=False, save_continuously=False, - epoch=epoch, num_env_steps=num_env_steps + env_fn, + train_hparams, + simulated=False, + save_continuously=False, + epoch=epoch, + sampling_temp=hparams.real_sampling_temp, + num_env_steps=num_env_steps, ) # Save unfinished rollouts to history. env.reset() @@ -492,7 +497,8 @@ def training_loop(hparams, output_dir, report_fn=None, report_metric=None): if report_fn: if report_metric == "mean_reward": metric_name = rl_utils.get_metric_name( - stochastic=True, max_num_noops=hparams.eval_max_num_noops, + sampling_temp=hparams.eval_sampling_temps[0], + max_num_noops=hparams.eval_max_num_noops, clipped=False ) report_fn(eval_metrics[metric_name], epoch) diff --git a/tensor2tensor/rl/trainer_model_based_params.py b/tensor2tensor/rl/trainer_model_based_params.py index bb29767d9..1459755fd 100644 --- a/tensor2tensor/rl/trainer_model_based_params.py +++ b/tensor2tensor/rl/trainer_model_based_params.py @@ -77,8 +77,12 @@ def _rlmb_base(): # In your experiments, you want to optimize this rate to your schedule. learning_rate_bump=3.0, - # Batch size during evaluation. Metrics are averaged over this number of - # rollouts. + # Policy sampling temperature to use when gathering data from the real + # environment. + real_sampling_temp=1.0, + + # Sampling temperatures to try during eval. + eval_sampling_temps=[0.0, 0.2, 0.5, 0.8, 1.0, 2.0], eval_max_num_noops=8, game="pong", @@ -106,6 +110,8 @@ def _rlmb_base(): real_batch_size=-1, # Number of simulated environments to train on simultaneously. simulated_batch_size=-1, + # Batch size during evaluation. Metrics are averaged over this number of + # rollouts. 
eval_batch_size=-1, ) @@ -467,6 +473,7 @@ def _rlmb_tiny_overrides(): wm_eval_rollout_ratios=[1], env_timesteps_limit=7, simulated_rollout_length=2, + eval_sampling_temps=[0.0, 1.0], ) From e737f897b1ac6f7ab33ce374292bec15016b92d3 Mon Sep 17 00:00:00 2001 From: Piotr Kozakowski Date: Tue, 4 Dec 2018 17:33:17 +0100 Subject: [PATCH 3/3] Fixes --- tensor2tensor/models/research/rl.py | 39 ++++++++++----------- tensor2tensor/rl/dopamine_connector.py | 4 +-- tensor2tensor/rl/ppo.py | 2 +- tensor2tensor/rl/trainer_model_free.py | 3 +- tensor2tensor/rl/trainer_model_free_test.py | 5 +-- 5 files changed, 26 insertions(+), 27 deletions(-) diff --git a/tensor2tensor/models/research/rl.py b/tensor2tensor/models/research/rl.py index 27dd020a6..c596d4df5 100644 --- a/tensor2tensor/models/research/rl.py +++ b/tensor2tensor/models/research/rl.py @@ -61,12 +61,12 @@ def ppo_base_v1(): return hparams -@registry.register_hparams -def ppo_continuous_action_base(): - hparams = ppo_base_v1() - hparams.add_hparam("policy_network", feed_forward_gaussian_fun) - hparams.add_hparam("policy_network_params", "basic_policy_parameters") - return hparams +#@registry.register_hparams +#def ppo_continuous_action_base(): +# hparams = ppo_base_v1() +# hparams.add_hparam("policy_network", feed_forward_gaussian_fun) +# hparams.add_hparam("policy_network_params", "basic_policy_parameters") +# return hparams @registry.register_hparams @@ -85,7 +85,7 @@ def ppo_discrete_action_base(): @registry.register_hparams def discrete_random_action_base(): hparams = common_hparams.basic_params1() - hparams.add_hparam("policy_network", "random_policy_policy") + hparams.add_hparam("policy_network", "random_policy") return hparams @@ -239,6 +239,12 @@ def mfrl_original(): batch_size=16, eval_batch_size=2, frame_stack_size=4, + eval_sampling_temps=[0.0, 0.2, 0.5, 0.8, 1.0, 2.0], + eval_max_num_noops=8, + resize_height_factor=2, + resize_width_factor=2, + grayscale=0, + env_timesteps_limit=-1, ) @@ -248,11 +254,6 @@ def mfrl_base(): hparams = mfrl_original() hparams.add_hparam("ppo_epochs_num", 3000) hparams.add_hparam("ppo_eval_every_epochs", 100) - hparams.add_hparam("eval_max_num_noops", 8) - hparams.add_hparam("resize_height_factor", 2) - hparams.add_hparam("resize_width_factor", 2) - hparams.add_hparam("grayscale", 0) - hparams.add_hparam("env_timesteps_limit", -1) return hparams @@ -455,16 +456,12 @@ class RandomPolicy(DiscretePolicyBase): def body(self, features): observations = features["inputs"] obs_shape = observations.shape.as_list() - #logits = tf.constant( - # 1. / float(self.action_space.n), - # shape=[1, obs_shape[0] * obs_shape[1], self.action_space.n] - #) + # Just so Saver doesn't complain because of no variables. + tf.get_variable("dummy_var", initializer=0.0) + num_actions = self._get_num_actions(features) logits = tf.constant( - 1. / float(self.action_space.n), - shape=(obs_shape[:2] + [self._get_num_actions(features)]) + 1. / float(num_actions), + shape=(obs_shape[:2] + [num_actions]) ) - #policy = tfp.distributions.Categorical( - # probs=[[[1. 
/ float(self.action_space.n)] * self.action_space.n] * - # (obs_shape[0] * obs_shape[1])]) value = tf.zeros(obs_shape[:2]) return {"target_action": logits, "target_value": value} diff --git a/tensor2tensor/rl/dopamine_connector.py b/tensor2tensor/rl/dopamine_connector.py index 83d50a66b..0f227cd9f 100644 --- a/tensor2tensor/rl/dopamine_connector.py +++ b/tensor2tensor/rl/dopamine_connector.py @@ -297,7 +297,7 @@ def train(self, num_env_steps = hparams.num_frames hparams = copy(hparams) - hparams.set_hparams( + hparams.set_hparam( "agent_epsilon_eval", min(hparams.agent_epsilon_eval * sampling_temp, 1) ) @@ -319,7 +319,7 @@ def evaluate(self, env_fn, hparams, sampling_temp): training_steps_per_iteration = 0 hparams = copy(hparams) - hparams.set_hparams( + hparams.set_hparam( "agent_epsilon_eval", min(hparams.agent_epsilon_eval * sampling_temp, 1) ) diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index 2a7d22af8..e9db06f51 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -116,7 +116,7 @@ def define_ppo_epoch(memory, hparams, action_space, batch_size): dataset = dataset.shuffle(buffer_size=hparams.epoch_length-1, reshuffle_each_iteration=True) dataset = dataset.repeat(-1) - dataset = dataset.batch(hparams.optimization_batch_size) + dataset = dataset.batch(hparams.optimization_batch_size, drop_remainder=True) iterator = dataset.make_initializable_iterator() optimizer = get_optimiser(hparams) diff --git a/tensor2tensor/rl/trainer_model_free.py b/tensor2tensor/rl/trainer_model_free.py index dea16b1fa..6948b0abf 100644 --- a/tensor2tensor/rl/trainer_model_free.py +++ b/tensor2tensor/rl/trainer_model_free.py @@ -78,7 +78,8 @@ def train(hparams, output_dir, report_fn=None): if not steps or steps[-1] < eval_every_epochs: steps.append(eval_every_epochs) metric_name = rl_utils.get_metric_name( - stochastic=True, max_num_noops=hparams.eval_max_num_noops, + sampling_temp=hparams.eval_sampling_temps[0], + max_num_noops=hparams.eval_max_num_noops, clipped=False ) for step in steps: diff --git a/tensor2tensor/rl/trainer_model_free_test.py b/tensor2tensor/rl/trainer_model_free_test.py index be3300312..678641847 100644 --- a/tensor2tensor/rl/trainer_model_free_test.py +++ b/tensor2tensor/rl/trainer_model_free_test.py @@ -31,8 +31,9 @@ class TrainTest(tf.test.TestCase): def test_train_pong(self): hparams = registry.hparams("mfrl_original") hparams.batch_size = 2 - hparams.ppo_epochs_num = 2 - hparams.ppo_epoch_length = 3 + hparams.eval_sampling_temps = [0.0, 1.0] + hparams.add_hparam("ppo_epochs_num", 2) + hparams.add_hparam("ppo_epoch_length", 3) FLAGS.output_dir = tf.test.get_temp_dir() trainer_model_free.train(hparams, FLAGS.output_dir)
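For reference, the updated rl_utils.get_metric_name yields evaluation metric
names like the following (argument values here are illustrative):

    from tensor2tensor.rl import rl_utils

    rl_utils.get_metric_name(sampling_temp=0.5, max_num_noops=8, clipped=False)
    # -> "mean_reward/eval/sampling_temp_0.5_max_noops_8_unclipped"
    rl_utils.get_metric_name(sampling_temp=0.0, max_num_noops=0, clipped=True)
    # -> "mean_reward/eval/sampling_temp_0.0_max_noops_0_clipped"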