diff --git a/tensor2tensor/models/research/rl.py b/tensor2tensor/models/research/rl.py
index 1d6b5f52f..5c52f3db0 100644
--- a/tensor2tensor/models/research/rl.py
+++ b/tensor2tensor/models/research/rl.py
@@ -233,6 +233,19 @@ def mfrl_base():
   hparams = mfrl_original()
   hparams.add_hparam("ppo_epochs_num", 3000)
   hparams.add_hparam("ppo_eval_every_epochs", 100)
+  hparams.add_hparam("eval_max_num_noops", 8)
+  hparams.add_hparam("resize_height_factor", 2)
+  hparams.add_hparam("resize_width_factor", 2)
+  hparams.add_hparam("grayscale", 1)
+  hparams.add_hparam("env_timesteps_limit", -1)
+  return hparams
+
+
+@registry.register_hparams
+def mfrl_tiny():
+  hparams = mfrl_base()
+  hparams.ppo_epochs_num = 100
+  hparams.ppo_eval_every_epochs = 10
   return hparams
 
 
diff --git a/tensor2tensor/rl/rl_utils.py b/tensor2tensor/rl/rl_utils.py
new file mode 100644
index 000000000..cff0ae440
--- /dev/null
+++ b/tensor2tensor/rl/rl_utils.py
@@ -0,0 +1,125 @@
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Utilities for RL training.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensor2tensor.data_generators.gym_env import T2TGymEnv
+from tensor2tensor.models.research import rl
+from tensor2tensor.rl.dopamine_connector import DQNLearner
+from tensor2tensor.rl.ppo_learner import PPOLearner
+from tensor2tensor.utils import trainer_lib
+
+import tensorflow as tf
+
+
+flags = tf.flags
+FLAGS = flags.FLAGS
+
+
+def compute_mean_reward(rollouts, clipped):
+  """Calculate mean rewards from given epoch."""
+  reward_name = "reward" if clipped else "unclipped_reward"
+  rewards = []
+  for rollout in rollouts:
+    if rollout[-1].done:
+      rollout_reward = sum(getattr(frame, reward_name) for frame in rollout)
+      rewards.append(rollout_reward)
+  if rewards:
+    mean_rewards = np.mean(rewards)
+  else:
+    mean_rewards = 0
+  return mean_rewards
+
+
+def get_metric_name(stochastic, max_num_noops, clipped):
+  return "mean_reward/eval/stochastic_{}_max_noops_{}_{}".format(
+      stochastic, max_num_noops, "clipped" if clipped else "unclipped")
+
+
+def evaluate_single_config(hparams, stochastic, max_num_noops,
+                           agent_model_dir):
+  """Evaluate the agent in the real environment."""
+  eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
+  env = setup_env(
+      hparams, batch_size=hparams.eval_batch_size, max_num_noops=max_num_noops
+  )
+  env.start_new_epoch(0)
+  env_fn = rl.make_real_env_fn(env)
+  learner = LEARNERS[hparams.base_algo](
+      hparams.frame_stack_size, base_event_dir=None,
+      agent_model_dir=agent_model_dir
+  )
+  learner.evaluate(env_fn, eval_hparams, stochastic)
+  rollouts = env.current_epoch_rollouts()
+  env.close()
+
+  return tuple(
+      compute_mean_reward(rollouts, clipped) for clipped in (True, False)
+  )
+
+
+def evaluate_all_configs(hparams, agent_model_dir):
+  """Evaluate the agent with multiple eval configurations."""
+  metrics = {}
+  # Iterate over all combinations of picking actions by sampling/mode and
+  # whether to do initial no-ops.
+  for stochastic in (True, False):
+    for max_num_noops in (hparams.eval_max_num_noops, 0):
+      scores = evaluate_single_config(
+          hparams, stochastic, max_num_noops, agent_model_dir
+      )
+      for (score, clipped) in zip(scores, (True, False)):
+        metric_name = get_metric_name(stochastic, max_num_noops, clipped)
+        metrics[metric_name] = score
+
+  return metrics
+
+
+LEARNERS = {
+    "ppo": PPOLearner,
+    "dqn": DQNLearner,
+}
+
+
+def setup_env(hparams, batch_size, max_num_noops):
+  """Set up a T2TGymEnv for the game specified in hparams."""
+  game_mode = "Deterministic-v4"
+  camel_game_name = "".join(
+      [w[0].upper() + w[1:] for w in hparams.game.split("_")])
+  camel_game_name += game_mode
+  env_name = camel_game_name
+
+  env = T2TGymEnv(base_env_name=env_name,
+                  batch_size=batch_size,
+                  grayscale=hparams.grayscale,
+                  resize_width_factor=hparams.resize_width_factor,
+                  resize_height_factor=hparams.resize_height_factor,
+                  base_env_timesteps_limit=hparams.env_timesteps_limit,
+                  max_num_noops=max_num_noops)
+  return env
+
+def update_hparams_from_hparams(target_hparams, source_hparams, prefix):
+  """Copy a subset of hparams to target_hparams."""
+  for (param_name, param_value) in six.iteritems(source_hparams.values()):
+    if param_name.startswith(prefix):
+      target_hparams.set_hparam(param_name[len(prefix):], param_value)
diff --git a/tensor2tensor/rl/trainer_model_based.py b/tensor2tensor/rl/trainer_model_based.py
index 70929b960..f1f6bcfb3 100644
--- a/tensor2tensor/rl/trainer_model_based.py
+++ b/tensor2tensor/rl/trainer_model_based.py
@@ -37,12 +37,10 @@
 import six
 
 from tensor2tensor.bin import t2t_trainer  # pylint: disable=unused-import
-from tensor2tensor.data_generators.gym_env import T2TGymEnv
 from tensor2tensor.layers import common_video
 from tensor2tensor.models.research import rl
+from tensor2tensor.rl import rl_utils
 from tensor2tensor.rl import trainer_model_based_params
-from tensor2tensor.rl.dopamine_connector import DQNLearner
-from tensor2tensor.rl.ppo_learner import PPOLearner
 from tensor2tensor.utils import trainer_lib
 
 import tensorflow as tf
@@ -52,19 +50,6 @@
 FLAGS = flags.FLAGS
 
 
-LEARNERS = {
-    "ppo": PPOLearner,
-    "dqn": DQNLearner,
-}
-
-
-def update_hparams_from_hparams(target_hparams, source_hparams, prefix):
-  """Copy a subset of hparams to target_hparams."""
-  for (param_name, param_value) in six.iteritems(source_hparams.values()):
-    if param_name.startswith(prefix):
-      target_hparams.set_hparam(param_name[len(prefix):], param_value)
-
-
 def real_env_step_increment(hparams):
   """Real env step increment."""
   return int(math.ceil(
@@ -207,7 +192,7 @@ def initial_frame_chooser(batch_size):
   base_algo_str = hparams.base_algo
   train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
 
-  update_hparams_from_hparams(
+  rl_utils.update_hparams_from_hparams(
       train_hparams, hparams, base_algo_str + "_"
   )
 
@@ -223,7 +208,7 @@ def train_agent_real_env(env, learner, hparams, epoch):
   base_algo_str = hparams.base_algo
   train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
 
-  update_hparams_from_hparams(
+  rl_utils.update_hparams_from_hparams(
       train_hparams, hparams, "real_" + base_algo_str + "_"
   )
 
@@ -263,82 +248,6 @@ def train_world_model(
   return world_model_steps_num
 
 
-def setup_env(hparams, batch_size, max_num_noops):
-  """Setup."""
-  game_mode = "Deterministic-v4"
-  camel_game_name = "".join(
-      [w[0].upper() + w[1:] for w in hparams.game.split("_")])
-  camel_game_name += game_mode
-  env_name = camel_game_name
-
-  env = T2TGymEnv(base_env_name=env_name,
-                  batch_size=batch_size,
-                  grayscale=hparams.grayscale,
-                  resize_width_factor=hparams.resize_width_factor,
-                  resize_height_factor=hparams.resize_height_factor,
-                  base_env_timesteps_limit=hparams.env_timesteps_limit,
-                  max_num_noops=max_num_noops)
-  return env
-
-
-def evaluate_single_config(hparams, stochastic, max_num_noops, agent_model_dir):
-  """Evaluate the PPO agent in the real environment."""
-  eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
-  env = setup_env(
-      hparams, batch_size=hparams.eval_batch_size, max_num_noops=max_num_noops
-  )
-  env.start_new_epoch(0)
-  env_fn = rl.make_real_env_fn(env)
-  learner = LEARNERS[hparams.base_algo](
-      hparams.frame_stack_size, base_event_dir=None,
-      agent_model_dir=agent_model_dir
-  )
-  learner.evaluate(env_fn, eval_hparams, stochastic)
-  rollouts = env.current_epoch_rollouts()
-  env.close()
-
-  return tuple(
-      compute_mean_reward(rollouts, clipped) for clipped in (True, False)
-  )
-
-
-def get_metric_name(stochastic, max_num_noops, clipped):
-  return "mean_reward/eval/stochastic_{}_max_noops_{}_{}".format(
-      stochastic, max_num_noops, "clipped" if clipped else "unclipped")
-
-
-def evaluate_all_configs(hparams, agent_model_dir):
-  """Evaluate the agent with multiple eval configurations."""
-  metrics = {}
-  # Iterate over all combinations of picking actions by sampling/mode and
-  # whether to do initial no-ops.
-  for stochastic in (True, False):
-    for max_num_noops in (hparams.eval_max_num_noops, 0):
-      scores = evaluate_single_config(
-          hparams, stochastic, max_num_noops, agent_model_dir
-      )
-      for (score, clipped) in zip(scores, (True, False)):
-        metric_name = get_metric_name(stochastic, max_num_noops, clipped)
-        metrics[metric_name] = score
-
-  return metrics
-
-
-def compute_mean_reward(rollouts, clipped):
-  """Calculate mean rewards from given epoch."""
-  reward_name = "reward" if clipped else "unclipped_reward"
-  rewards = []
-  for rollout in rollouts:
-    if rollout[-1].done:
-      rollout_reward = sum(getattr(frame, reward_name) for frame in rollout)
-      rewards.append(rollout_reward)
-  if rewards:
-    mean_rewards = np.mean(rewards)
-  else:
-    mean_rewards = 0
-  return mean_rewards
-
-
 def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
   """Evaluate the world model (reward accuracy)."""
   frame_stack_size = hparams.frame_stack_size
@@ -485,13 +394,13 @@ def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
   epoch = -1
   data_dir = directories["data"]
 
-  env = setup_env(
+  env = rl_utils.setup_env(
       hparams, batch_size=hparams.real_batch_size,
       max_num_noops=hparams.max_num_noops
   )
   env.start_new_epoch(epoch, data_dir)
 
-  learner = LEARNERS[hparams.base_algo](
+  learner = rl_utils.LEARNERS[hparams.base_algo](
       hparams.frame_stack_size, directories["policy"], directories["policy"]
   )
 
@@ -507,7 +416,7 @@ def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
   policy_model_dir = directories["policy"]
   tf.logging.info("Initial training of the policy in real environment.")
   train_agent_real_env(env, learner, hparams, epoch)
-  metrics["mean_reward/train/clipped"] = compute_mean_reward(
+  metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
      env.current_epoch_rollouts(), clipped=True
   )
   tf.logging.info("Mean training reward (initial): {}".format(
@@ -555,14 +464,14 @@ def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
       # we'd overwrite them with wrong data.
log("Metrics found for this epoch, skipping evaluation.") else: - metrics["mean_reward/train/clipped"] = compute_mean_reward( + metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward( env.current_epoch_rollouts(), clipped=True ) log("Mean training reward: {}".format( metrics["mean_reward/train/clipped"] )) - eval_metrics = evaluate_all_configs(hparams, policy_model_dir) + eval_metrics = rl_utils.evaluate_all_configs(hparams, policy_model_dir) log("Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics))) metrics.update(eval_metrics) @@ -582,7 +491,7 @@ def training_loop(hparams, output_dir, report_fn=None, report_metric=None): # Report metrics if report_fn: if report_metric == "mean_reward": - metric_name = get_metric_name( + metric_name = rl_utils.get_metric_name( stochastic=True, max_num_noops=hparams.eval_max_num_noops, clipped=False ) diff --git a/tensor2tensor/rl/trainer_model_free.py b/tensor2tensor/rl/trainer_model_free.py index 96ffebff7..31da7699c 100644 --- a/tensor2tensor/rl/trainer_model_free.py +++ b/tensor2tensor/rl/trainer_model_free.py @@ -27,19 +27,20 @@ from __future__ import division from __future__ import print_function -import six +import pprint -from tensor2tensor.data_generators import gym_env from tensor2tensor.models.research import rl -from tensor2tensor.rl.ppo_learner import PPOLearner +from tensor2tensor.rl import rl_utils from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import from tensor2tensor.utils import trainer_lib import tensorflow as tf + flags = tf.flags FLAGS = flags.FLAGS + # To maintain compatibility with some internal libs, we guard against these flag # definitions possibly erring. Apologies for the ugliness. try: @@ -48,46 +49,48 @@ pass -LEARNERS = { - "ppo": PPOLearner -} - - -def update_hparams_from_hparams(target_hparams, source_hparams, prefix): - """Copy a subset of hparams to target_hparams.""" - for (param_name, param_value) in six.iteritems(source_hparams.values()): - if param_name.startswith(prefix): - target_hparams.set_hparam(param_name[len(prefix):], param_value) - - def initialize_env_specs(hparams): """Initializes env_specs using T2TGymEnvs.""" - if getattr(hparams, "game", None): - game_name = gym_env.camel_case_name(hparams.game) - env = gym_env.T2TGymEnv("{}Deterministic-v4".format(game_name), - batch_size=hparams.batch_size) - env.start_new_epoch(0) - hparams.add_hparam("env_fn", rl.make_real_env_fn(env)) - eval_env = gym_env.T2TGymEnv("{}Deterministic-v4".format(game_name), - batch_size=hparams.eval_batch_size) - eval_env.start_new_epoch(0) - hparams.add_hparam("eval_env_fn", rl.make_real_env_fn(eval_env)) + env = rl_utils.setup_env(hparams, hparams.batch_size, + hparams.eval_max_num_noops) + env.start_new_epoch(0) + hparams.add_hparam("env_fn", rl.make_real_env_fn(env)) return hparams def train(hparams, output_dir, report_fn=None): hparams = initialize_env_specs(hparams) - learner = LEARNERS[hparams.base_algo]( + learner = rl_utils.LEARNERS[hparams.base_algo]( hparams.frame_stack_size, FLAGS.output_dir, output_dir ) policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params) - update_hparams_from_hparams( + rl_utils.update_hparams_from_hparams( policy_hparams, hparams, hparams.base_algo + "_" ) - learner.train( - hparams.env_fn, policy_hparams, simulated=False, save_continuously=True, - epoch=0, eval_env_fn=hparams.eval_env_fn, report_fn=report_fn + total_steps = policy_hparams.epochs_num + eval_every_epochs = policy_hparams.eval_every_epochs + if eval_every_epochs == 0: + 
+    eval_every_epochs = total_steps
+  policy_hparams.eval_every_epochs = 0
+
+  steps = list(range(eval_every_epochs, total_steps+1, eval_every_epochs))
+  if not steps or steps[-1] < eval_every_epochs:
+    steps.append(eval_every_epochs)
+  metric_name = rl_utils.get_metric_name(
+      stochastic=True, max_num_noops=hparams.eval_max_num_noops,
+      clipped=False
   )
+  for step in steps:
+    policy_hparams.epochs_num = step
+    learner.train(
+        hparams.env_fn, policy_hparams, simulated=False, save_continuously=True,
+        epoch=0
+    )
+    eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
+    tf.logging.info("Agent eval metrics:\n{}".format(
+        pprint.pformat(eval_metrics)))
+    if report_fn:
+      report_fn(eval_metrics[metric_name], step)
 
 
 def main(_):
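For reference, here is a minimal standalone sketch of the evaluation schedule computed by the new train() loop in trainer_model_free.py. It only reproduces the schedule arithmetic from the diff above; the helper name eval_schedule and the printed examples are illustrative, not part of tensor2tensor.

# Standalone sketch (hypothetical helper, not part of tensor2tensor): mirrors
# the schedule arithmetic used by trainer_model_free.train() above.
def eval_schedule(total_steps, eval_every_epochs):
  """Return the cumulative epoch targets at which the agent is evaluated."""
  if eval_every_epochs == 0:
    eval_every_epochs = total_steps
  steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
  if not steps or steps[-1] < eval_every_epochs:
    steps.append(eval_every_epochs)
  return steps


if __name__ == "__main__":
  # mfrl_base: ppo_epochs_num=3000, ppo_eval_every_epochs=100 -> 30 eval points.
  print(eval_schedule(3000, 100))  # [100, 200, ..., 3000]
  # mfrl_tiny: 100 epochs, eval every 10 -> [10, 20, ..., 100].
  print(eval_schedule(100, 10))
  # eval_every_epochs == 0 degenerates to a single evaluation at the end.
  print(eval_schedule(100, 0))     # [100]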