177 changes: 107 additions & 70 deletions tensor2tensor/models/research/rl.py
@@ -27,6 +27,7 @@
from tensor2tensor.rl.envs.simulated_batch_env import SimulatedBatchEnv
from tensor2tensor.rl.envs.simulated_batch_gym_env import SimulatedBatchGymEnv
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

import tensorflow as tf
import tensorflow_probability as tfp
@@ -60,12 +61,12 @@ def ppo_base_v1():
return hparams


@registry.register_hparams
def ppo_continuous_action_base():
hparams = ppo_base_v1()
hparams.add_hparam("policy_network", feed_forward_gaussian_fun)
hparams.add_hparam("policy_network_params", "basic_policy_parameters")
return hparams
#@registry.register_hparams
#def ppo_continuous_action_base():
# hparams = ppo_base_v1()
# hparams.add_hparam("policy_network", feed_forward_gaussian_fun)
# hparams.add_hparam("policy_network_params", "basic_policy_parameters")
# return hparams


@registry.register_hparams
@@ -77,14 +78,14 @@ def basic_policy_parameters():
@registry.register_hparams
def ppo_discrete_action_base():
hparams = ppo_base_v1()
hparams.add_hparam("policy_network", feed_forward_categorical_fun)
hparams.add_hparam("policy_network", "feed_forward_categorical_policy")
return hparams


@registry.register_hparams
def discrete_random_action_base():
hparams = common_hparams.basic_params1()
hparams.add_hparam("policy_network", random_policy_fun)
hparams.add_hparam("policy_network", "random_policy")
return hparams


Expand All @@ -100,7 +101,7 @@ def ppo_atari_base():
hparams.value_loss_coef = 1
hparams.optimization_epochs = 3
hparams.epochs_num = 1000
hparams.policy_network = feed_forward_cnn_small_categorical_fun
hparams.policy_network = "feed_forward_cnn_small_categorical_policy"
hparams.clipping_coef = 0.2
hparams.optimization_batch_size = 20
hparams.max_gradients_norm = 0.5
@@ -157,23 +158,36 @@ def get_policy(observations, hparams, action_space):
"""Get a policy network.

Args:
observations: Tensor with observations
observations
hparams: parameters
action_space: action space

Returns:
Tensor with policy and value function output
Tuple (action logits, value).
"""
policy_network_lambda = hparams.policy_network
return policy_network_lambda(action_space, hparams, observations)
if not isinstance(action_space, gym.spaces.Discrete):
raise ValueError("Expecting discrete action space.")

model = registry.model(hparams.policy_network)(
hparams, tf.estimator.ModeKeys.TRAIN
)
obs_shape = common_layers.shape_list(observations)
features = {
"inputs": observations,
"target_action": tf.zeros(obs_shape[:2] + [action_space.n]),
"target_value": tf.zeros(obs_shape[:2])
}
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
(targets, _) = model(features)
return (targets["target_action"], targets["target_value"])
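
A quick usage sketch of the rewritten lookup. Hedged: the hparams set comes from this file, but the observation shape and action count are illustrative, not taken from this diff:

    # Sketch only, not part of the diff. Assumes observations shaped
    # (batch, time, height, width, channels) and a discrete action space.
    import gym
    import tensorflow as tf
    from tensor2tensor.models.research import rl

    hparams = rl.ppo_discrete_action_base()
    action_space = gym.spaces.Discrete(6)
    observations = tf.zeros([8, 4, 84, 84, 3])
    action_logits, value = rl.get_policy(observations, hparams, action_space)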


@registry.register_hparams
def ppo_pong_ae_base():
"""Pong autoencoder base parameters."""
hparams = ppo_original_params()
hparams.learning_rate = 1e-4
hparams.network = dense_bitwise_categorical_fun
hparams.network = "dense_bitwise_categorical_policy"
return hparams


@@ -225,6 +239,12 @@ def mfrl_original():
batch_size=16,
eval_batch_size=2,
frame_stack_size=4,
eval_sampling_temps=[0.0, 0.2, 0.5, 0.8, 1.0, 2.0],
eval_max_num_noops=8,
resize_height_factor=2,
resize_width_factor=2,
grayscale=0,
env_timesteps_limit=-1,
)


@@ -234,11 +254,6 @@ def mfrl_base():
hparams = mfrl_original()
hparams.add_hparam("ppo_epochs_num", 3000)
hparams.add_hparam("ppo_eval_every_epochs", 100)
hparams.add_hparam("eval_max_num_noops", 8)
hparams.add_hparam("resize_height_factor", 2)
hparams.add_hparam("resize_width_factor", 2)
hparams.add_hparam("grayscale", 0)
hparams.add_hparam("env_timesteps_limit", -1)
return hparams


@@ -250,10 +265,18 @@ def mfrl_tiny():
return hparams


class DiscretePolicyBase(t2t_model.T2TModel):

@staticmethod
def _get_num_actions(features):
return common_layers.shape_list(features["target_action"])[2]
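
For orientation, a minimal hypothetical subclass showing the contract these policies follow: read observations from features["inputs"] and return "target_action" logits of shape [batch, time, num_actions] plus "target_value" of shape [batch, time]:

    # Hypothetical example mirroring RandomPolicy below; not part of the diff.
    @registry.register_model
    class ZeroLogitsPolicy(DiscretePolicyBase):
      """Uniform policy: softmax of all-zero logits is uniform."""

      def body(self, features):
        obs_shape = common_layers.shape_list(features["inputs"])
        num_actions = self._get_num_actions(features)
        logits = tf.zeros(obs_shape[:2] + [num_actions])
        value = tf.zeros(obs_shape[:2])
        return {"target_action": logits, "target_value": value}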


NetworkOutput = collections.namedtuple(
"NetworkOutput", "policy, value, action_postprocessing")


# TODO(koz4k): Translate it to T2TModel or remove.
def feed_forward_gaussian_fun(action_space, config, observations):
"""Feed-forward Gaussian."""
if not isinstance(action_space, gym.spaces.box.Box):
@@ -303,36 +326,40 @@ def clip_logits(logits, config):
return logits


def feed_forward_categorical_fun(action_space, config, observations):
@registry.register_model
class FeedForwardCategoricalPolicy(DiscretePolicyBase):
"""Feed-forward categorical."""
if not isinstance(action_space, gym.spaces.Discrete):
raise ValueError("Expecting discrete action space.")
flat_observations = tf.reshape(observations, [
tf.shape(observations)[0], tf.shape(observations)[1],
functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)])
with tf.variable_scope("network_parameters"):

def body(self, features):
observations = features["inputs"]
flat_observations = tf.reshape(observations, [
tf.shape(observations)[0], tf.shape(observations)[1],
functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)])
with tf.variable_scope("policy"):
x = flat_observations
for size in config.policy_layers:
for size in self.hparams.policy_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
logits = tf.contrib.layers.fully_connected(x, action_space.n,
activation_fn=None)
logits = tf.contrib.layers.fully_connected(
x, self._get_num_actions(features), activation_fn=None
)
with tf.variable_scope("value"):
x = flat_observations
for size in config.value_layers:
for size in self.hparams.value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0]
logits = clip_logits(logits, config)
policy = tfp.distributions.Categorical(logits=logits)
return NetworkOutput(policy, value, lambda a: a)
logits = clip_logits(logits, self.hparams)
return {"target_action": logits, "target_value": value}


def feed_forward_cnn_small_categorical_fun(action_space, config, observations):
@registry.register_model
class FeedForwardCnnSmallCategoricalPolicy(DiscretePolicyBase):
"""Small cnn network with categorical output."""
obs_shape = common_layers.shape_list(observations)
x = tf.reshape(observations, [-1] + obs_shape[2:])
with tf.variable_scope("network_parameters"):
dropout = getattr(config, "dropout_ppo", 0.0)

def body(self, features):
observations = features["inputs"]
obs_shape = common_layers.shape_list(observations)
x = tf.reshape(observations, [-1] + obs_shape[2:])
dropout = getattr(self.hparams, "dropout_ppo", 0.0)
with tf.variable_scope("feed_forward_cnn_small"):
x = tf.to_float(x) / 255.0
x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2],
@@ -346,23 +373,25 @@ def feed_forward_cnn_small_categorical_fun(action_space, config, observations):
flat_x = tf.nn.dropout(flat_x, keep_prob=1.0 - dropout)
x = tf.contrib.layers.fully_connected(flat_x, 128, tf.nn.relu)

logits = tf.contrib.layers.fully_connected(x, action_space.n,
activation_fn=None)
logits = clip_logits(logits, config)
logits = tf.contrib.layers.fully_connected(
x, self._get_num_actions(features), activation_fn=None
)
logits = clip_logits(logits, self.hparams)

value = tf.contrib.layers.fully_connected(
x, 1, activation_fn=None)[..., 0]
policy = tfp.distributions.Categorical(logits=logits)
return NetworkOutput(policy, value, lambda a: a)
return {"target_action": logits, "target_value": value}


def feed_forward_cnn_small_categorical_fun_new(
action_space, config, observations):
@registry.register_model
class FeedForwardCnnSmallCategoricalPolicyNew(DiscretePolicyBase):
"""Small cnn network with categorical output."""
obs_shape = common_layers.shape_list(observations)
x = tf.reshape(observations, [-1] + obs_shape[2:])
with tf.variable_scope("network_parameters"):
dropout = getattr(config, "dropout_ppo", 0.0)

def body(self, features):
observations = features["inputs"]
obs_shape = common_layers.shape_list(observations)
x = tf.reshape(observations, [-1] + obs_shape[2:])
dropout = getattr(self.hparams, "dropout_ppo", 0.0)
with tf.variable_scope("feed_forward_cnn_small"):
x = tf.to_float(x) / 255.0
x = tf.nn.dropout(x, keep_prob=1.0 - dropout)
@@ -384,22 +413,23 @@ def feed_forward_cnn_small_categorical_fun_new(
flat_x = tf.nn.dropout(flat_x, keep_prob=1.0 - dropout)
x = tf.layers.dense(flat_x, 128, activation=tf.nn.relu, name="dense1")

logits = tf.layers.dense(x, action_space.n, name="dense2")
logits = clip_logits(logits, config)
logits = tf.layers.dense(
x, self._get_num_actions(features), name="dense2"
)
logits = clip_logits(logits, self.hparams)

value = tf.layers.dense(x, 1, name="value")[..., 0]
policy = tfp.distributions.Categorical(logits=logits)
return {"target_action": logits, "target_value": value}

return NetworkOutput(policy, value, lambda a: a)


def dense_bitwise_categorical_fun(action_space, config, observations):
@registry.register_model
class DenseBitwiseCategoricalPolicy(DiscretePolicyBase):
"""Dense network with bitwise input and categorical output."""
del config
obs_shape = common_layers.shape_list(observations)
x = tf.reshape(observations, [-1] + obs_shape[2:])

with tf.variable_scope("network_parameters"):
def body(self, features):
observations = features["inputs"]
obs_shape = common_layers.shape_list(observations)
x = tf.reshape(observations, [-1] + obs_shape[2:])
with tf.variable_scope("dense_bitwise"):
x = discretization.int_to_bit_embed(x, 8, 32)
flat_x = tf.reshape(
@@ -409,22 +439,29 @@ def dense_bitwise_categorical_fun(action_space, config, observations):
x = tf.contrib.layers.fully_connected(flat_x, 256, tf.nn.relu)
x = tf.contrib.layers.fully_connected(flat_x, 128, tf.nn.relu)

logits = tf.contrib.layers.fully_connected(x, action_space.n,
activation_fn=None)
logits = tf.contrib.layers.fully_connected(
x, self._get_num_actions(features), activation_fn=None
)

value = tf.contrib.layers.fully_connected(
x, 1, activation_fn=None)[..., 0]
policy = tfp.distributions.Categorical(logits=logits)

return NetworkOutput(policy, value, lambda a: a)
return {"target_action": logits, "target_value": value}


def random_policy_fun(action_space, unused_config, observations):
@registry.register_model
class RandomPolicy(DiscretePolicyBase):
"""Random policy with categorical output."""
obs_shape = observations.shape.as_list()
with tf.variable_scope("network_parameters"):

def body(self, features):
observations = features["inputs"]
obs_shape = observations.shape.as_list()
# Just so Saver doesn't complain because of no variables.
tf.get_variable("dummy_var", initializer=0.0)
num_actions = self._get_num_actions(features)
logits = tf.constant(
1. / float(num_actions),
shape=(obs_shape[:2] + [num_actions])
)
value = tf.zeros(obs_shape[:2])
policy = tfp.distributions.Categorical(
probs=[[[1. / float(action_space.n)] * action_space.n] *
(obs_shape[0] * obs_shape[1])])
return NetworkOutput(policy, value, lambda a: a)
return {"target_action": logits, "target_value": value}
16 changes: 13 additions & 3 deletions tensor2tensor/rl/dopamine_connector.py
@@ -19,6 +19,7 @@
from __future__ import division
from __future__ import print_function

from copy import copy
from dopamine.agents.dqn import dqn_agent
from dopamine.agents.dqn.dqn_agent import NATURE_DQN_OBSERVATION_SHAPE
from dopamine.agents.dqn.dqn_agent import NATURE_DQN_STACK_SIZE
@@ -285,6 +286,7 @@ def train(self,
simulated,
save_continuously,
epoch,
sampling_temp=1.0,
num_env_steps=None,
env_step_multiplier=1,
eval_env_fn=None,
@@ -294,6 +296,11 @@
if num_env_steps is None:
num_env_steps = hparams.num_frames

hparams = copy(hparams)
hparams.set_hparam(
"agent_epsilon_eval", min(hparams.agent_epsilon_eval * sampling_temp, 1)
)

target_iterations, training_steps_per_iteration = \
self._target_iteractions_and_steps(
num_env_steps=num_env_steps * env_step_multiplier,
@@ -307,11 +314,14 @@

self.completed_iterations = target_iterations

def evaluate(self, env_fn, hparams, stochastic):
def evaluate(self, env_fn, hparams, sampling_temp):
target_iterations = 0
training_steps_per_iteration = 0
if not stochastic:
hparams.set_hparam("agent_epsilon_eval", 0.)

hparams = copy(hparams)
hparams.set_hparam(
"agent_epsilon_eval", min(hparams.agent_epsilon_eval * sampling_temp, 1)
)

create_environment_fn = get_create_env_fun(
env_fn, time_limit=hparams.time_limit)
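
The epsilon handling above replaces the old boolean stochastic flag: the evaluation epsilon scales linearly with sampling_temp and is capped at 1. A sketch of the arithmetic (the 0.001 baseline is an assumed value, not taken from this diff):

    # Sketch of the scaling used in train() and evaluate() above.
    def scaled_epsilon(agent_epsilon_eval, sampling_temp):
      return min(agent_epsilon_eval * sampling_temp, 1)

    # Assuming agent_epsilon_eval = 0.001:
    #   scaled_epsilon(0.001, 0.0) -> 0.0    (greedy, like the old stochastic=False)
    #   scaled_epsilon(0.001, 1.0) -> 0.001  (unchanged)
    #   scaled_epsilon(0.001, 2.0) -> 0.002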
14 changes: 11 additions & 3 deletions tensor2tensor/rl/policy_learner.py
@@ -29,13 +29,21 @@ def __init__(self, frame_stack_size, base_event_dir, agent_model_dir):
self.agent_model_dir = agent_model_dir

def train(
self, env_fn, hparams, simulated, save_continuously, epoch,
num_env_steps=None, env_step_multiplier=1, eval_env_fn=None,
self,
env_fn,
hparams,
simulated,
save_continuously,
epoch,
sampling_temp=1.0,
num_env_steps=None,
env_step_multiplier=1,
eval_env_fn=None,
report_fn=None
):
# TODO(konradczechowski): pass name_scope instead of epoch?
# TODO(konradczechowski): move 'simulated' to batch_env
raise NotImplementedError()

def evaluate(self, env_fn, hparams, stochastic):
def evaluate(self, env_fn, hparams, sampling_temp):
raise NotImplementedError()
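
A hedged sketch of a concrete learner adapting to the new interface (class name hypothetical); sampling_temp=0.0 reproduces the old stochastic=False path, since it drives the evaluation epsilon to zero:

    # Hypothetical subclass; mirrors the dopamine_connector change above.
    from copy import copy

    class MyPolicyLearner(PolicyLearner):

      def evaluate(self, env_fn, hparams, sampling_temp):
        eval_hparams = copy(hparams)
        eval_hparams.set_hparam(
            "agent_epsilon_eval",
            min(hparams.agent_epsilon_eval * sampling_temp, 1))
        # ... run the evaluation loop with eval_hparams ...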