From f7a54c60ba27f098c56c6f9335abe8da6fc0b17a Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 02:01:18 +0100 Subject: [PATCH 01/14] vdn start --- algos_tf14/dqnagent.py | 4 +- algos_tf14/vdnagent.py | 148 +++++++++++++++++++++++++++++++++++++++++ envs/smac_env.py | 3 + tf14_runner.py | 2 + 4 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 algos_tf14/vdnagent.py diff --git a/algos_tf14/dqnagent.py b/algos_tf14/dqnagent.py index bf7b10f6..007bd0ad 100644 --- a/algos_tf14/dqnagent.py +++ b/algos_tf14/dqnagent.py @@ -11,7 +11,7 @@ from common.categorical import CategoricalQ class DQNAgent: - def __init__(self, sess, base_name, observation_space, action_space, config): + def __init__(self, sess, base_name, observation_space, action_space, config, logger): observation_shape = observation_space.shape actions_num = action_space.n self.config = config @@ -47,7 +47,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config): self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) if self.env_name: - self.env = env_configurations.configurations[self.env_name]['env_creator']() + self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) self.sess = sess self.steps_num = self.config['steps_num'] self.states = deque([], maxlen=self.steps_num) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py new file mode 100644 index 00000000..41a6dcd0 --- /dev/null +++ b/algos_tf14/vdnagent.py @@ -0,0 +1,148 @@ +import tensorflow as tf +import algos_tf14.models +from common import tr_helpers, experience, env_configurations +import numpy as np +import collections +import time +from collections import deque +from tensorboardX import SummaryWriter +from datetime import datetime +from algos_tf14.tensorflow_utils import TensorFlowVariables +from common.categorical import CategoricalQ + +class VDNAgent: + def __init__(self, sess, base_name, observation_space, action_space, config, logger): + observation_shape = observation_space.shape + actions_num = action_space.n + self.config = config + self.is_adaptive_lr = config['lr_schedule'] == 'adaptive' + self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay' + self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay' + self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32) + self.learning_rate_ph = tf.placeholder('float32', (), name = 'lr_ph') + self.games_to_track = tr_helpers.get_or_default(config, 'games_to_track', 100) + self.max_epochs = tr_helpers.get_or_default(self.config, 'max_epochs', 1e6) + + self.game_rewards = deque([], maxlen=self.games_to_track) + self.game_lengths = deque([], maxlen=self.games_to_track) + + self.epoch_num = tf.Variable( tf.constant(0, shape=(), dtype=tf.float32), trainable=False) + self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1) + self.current_lr = self.learning_rate_ph + + if self.is_adaptive_lr: + self.lr_threshold = config['lr_threshold'] + if self.is_polynom_decay_lr: + self.lr_multiplier = tf.train.polynomial_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, end_learning_rate=0.001, power=tr_helpers.get_or_default(config, 'decay_power', 1.0)) + if self.is_exp_decay_lr: + self.lr_multiplier = tf.train.exponential_decay(1.0, global_step=self.epoch_num, 
decay_steps=self.max_epochs, decay_rate = config['decay_rate']) + + self.env_name = config['env_name'] + self.network = config['network'] + self.obs_shape = observation_shape + self.actions_num = actions_num + self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("%d, %H:%M:%S")) + self.epsilon = self.config['epsilon'] + self.rewards_shaper = self.config['reward_shaper'] + self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) + self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) + if self.env_name: + self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) + self.sess = sess + self.steps_num = self.config['steps_num'] + self.states = deque([], maxlen=self.steps_num) + self.is_prioritized = config['replay_buffer_type'] != 'normal' + self.atoms_num = self.config['atoms_num'] + assert self.atoms_num == 1 + + self.state_shape = (self.env.env_info['state_shape'],) + self.n_agents = self.env.env_info['n_agents'] + + if not self.is_prioritized: + self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size']) + else: + self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha']) + self.sample_weights_ph = tf.placeholder(tf.float32, shape= [None,] , name='sample_weights') + + self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'obs_ph') + self.state_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape , name = 'state_ph') + self.actions_ph = tf.placeholder(tf.int32, shape=[None,], name = 'actions_ph') + self.rewards_ph = tf.placeholder(tf.float32, shape=[None,], name = 'rewards_ph') + self.next_obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'next_obs_ph') + self.is_done_ph = tf.placeholder(tf.float32, shape=[None,], name = 'is_done_ph') + self.is_not_done = 1 - self.is_done_ph + self.name = base_name + + self.gamma = self.config['gamma'] + self.gamma_step = self.gamma**self.steps_num + self.grad_norm = config['grad_norm'] + self.input_obs = self.obs_ph + self.input_next_obs = self.next_obs_ph + if observation_space.dtype == np.uint8: + print('scaling obs') + self.input_obs = tf.to_float(self.input_obs) / 255.0 + self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 + self.setup_qvalues(actions_num) + self.sess.run(tf.global_variables_initializer()) + + def setup_qvalues(self, actions_num): + config = { + 'name' : 'agent', + 'inputs' : self.input_obs, + 'actions_num' : actions_num, + } + self.qvalues = self.network(config, reuse=False) + config = { + 'name' : 'target', + 'inputs' : self.input_next_obs, + 'actions_num' : actions_num, + } + self.target_qvalues = tf.stop_gradient(self.network(config, reuse=False)) + + def play_episode(self, epsilon=0.0): + mb_obs = [] + mb_rewards = [] + mb_actions = [] + mb_avail_actions = [] + mb_dones = [] + mb_states = [] + + obs = self.env.reset() + obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) + mb_obs.append(obs) + mb_states.append(self.env.get_state()) + mb_avail_actions.append(self.env.get_action_mask()) + + while True: + step_act = self.get_action(obs, epsilon) + print(step_act) + break + + + + + def get_action(self, obs, epsilon=0.0): + print(obs.shape) + if np.random.random() < epsilon: + action = self.env.action_space.sample() + 
else: + qvals = self.get_qvalues(obs) + print(qvals.shape) + action = np.argmax(qvals) + return action + + def get_qvalues(self, obs): + print(obs.shape) + return self.sess.run(self.qvalues, {self.obs_ph: obs}) + + def train(self): + self.play_episode() + + + + + + + + + diff --git a/envs/smac_env.py b/envs/smac_env.py index 178a9cba..edbc7090 100644 --- a/envs/smac_env.py +++ b/envs/smac_env.py @@ -44,6 +44,9 @@ def _preproc_actions(self, actions): actions[ind] = np.random.choice(avail_actions) #rewards[ind] = -0.05 return actions, rewards + + def get_state(self): + return self.env.get_state() def step(self, actions): fixed_rewards = None diff --git a/tf14_runner.py b/tf14_runner.py index 3fd847b7..d24d4f78 100644 --- a/tf14_runner.py +++ b/tf14_runner.py @@ -9,6 +9,7 @@ import algos_tf14.a2c_continuous as a2c_continuous import algos_tf14.a2c_discrete as a2c_discrete import algos_tf14.dqnagent as dqnagent +import algos_tf14.vdnagent as vdnagent import common.tr_helpers as tr_helpers import yaml @@ -49,6 +50,7 @@ def __init__(self, logger): self.algo_factory.register_builder('a2c_continuous', lambda **kwargs : a2c_continuous.A2CAgent(**kwargs)) self.algo_factory.register_builder('a2c_discrete', lambda **kwargs : a2c_discrete.A2CAgent(**kwargs)) self.algo_factory.register_builder('dqn', lambda **kwargs : dqnagent.DQNAgent(**kwargs)) + self.algo_factory.register_builder('vdn', lambda **kwargs : vdnagent.VDNAgent(**kwargs)) self.player_factory = common.object_factory.ObjectFactory() self.player_factory.register_builder('a2c_continuous', lambda **kwargs : players.PpoPlayerContinuous(**kwargs)) From 8b9c3db519a4941b9d6328e5a4e3fc02d3d9eb1e Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 02:36:51 +0100 Subject: [PATCH 02/14] up --- algos_tf14/vdnagent.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 41a6dcd0..1fad7e17 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -106,33 +106,44 @@ def play_episode(self, epsilon=0.0): mb_avail_actions = [] mb_dones = [] mb_states = [] + step_count = 0 obs = self.env.reset() obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) mb_obs.append(obs) mb_states.append(self.env.get_state()) - mb_avail_actions.append(self.env.get_action_mask()) - + avail_acts = self.env.get_action_mask() + mb_avail_actions.append(avail_acts) while True: - step_act = self.get_action(obs, epsilon) - print(step_act) - break + step_count += 1 + step_act = self.get_action(obs, avail_acts, epsilon) + next_obs, rewards, dones, _ = self.env.step(step_act) + mb_actions.append(step_act) + mb_obs.append(next_obs) + mb_rewards.append(rewards) + mb_dones.append(dones) + mb_states.append(self.env.get_state()) + obs = next_obs + obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) + avail_acts = self.env.get_action_mask() + mb_avail_actions.append(avail_acts) + if all(dones) or self.steps_num < step_count: + break - def get_action(self, obs, epsilon=0.0): + def get_action(self, obs, avail_acts, epsilon=0.0): print(obs.shape) if np.random.random() < epsilon: action = self.env.action_space.sample() else: qvals = self.get_qvalues(obs) - print(qvals.shape) - action = np.argmax(qvals) + qvals[avail_acts == False] = -9999999 + action = np.argmax(qvals, axis=1) return action def get_qvalues(self, obs): - print(obs.shape) return self.sess.run(self.qvalues, {self.obs_ph: obs}) def train(self): From d7e0d9150eda09ea33a9e2eb86bc12aa940c303d Mon Sep 17 
00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 20:03:11 +0100 Subject: [PATCH 03/14] updates --- algos_tf14/vdnagent.py | 129 ++++++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 34 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 1fad7e17..c1dbf5ea 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -50,7 +50,9 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) self.sess = sess self.steps_num = self.config['steps_num'] - self.states = deque([], maxlen=self.steps_num) + + self.obs_act_rew = deque([], maxlen=self.steps_num) + self.is_prioritized = config['replay_buffer_type'] != 'normal' self.atoms_num = self.config['atoms_num'] assert self.atoms_num == 1 @@ -83,7 +85,20 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.input_obs = tf.to_float(self.input_obs) / 255.0 self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 self.setup_qvalues(actions_num) - self.sess.run(tf.global_variables_initializer()) + + if self.env_name: + self.sess.run(tf.global_variables_initializer()) +# self.reg_loss = tf.losses.get_regularization_loss() +# self.td_loss_mean += self.reg_loss +# self.learning_rate = self.config['learning_rate'] +# self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + +# self.saver = tf.train.Saver() +# self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] +# self.variables = TensorFlowVariables(self.qvalues, self.sess) + if self.env_name: + sess.run(tf.global_variables_initializer()) + self._reset() def setup_qvalues(self, actions_num): config = { @@ -91,6 +106,7 @@ def setup_qvalues(self, actions_num): 'inputs' : self.input_obs, 'actions_num' : actions_num, } + #(n_agents, n_actions) self.qvalues = self.network(config, reuse=False) config = { 'name' : 'target', @@ -99,42 +115,85 @@ def setup_qvalues(self, actions_num): } self.target_qvalues = tf.stop_gradient(self.network(config, reuse=False)) - def play_episode(self, epsilon=0.0): - mb_obs = [] - mb_rewards = [] - mb_actions = [] - mb_avail_actions = [] - mb_dones = [] - mb_states = [] - step_count = 0 - - obs = self.env.reset() - obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) - mb_obs.append(obs) - mb_states.append(self.env.get_state()) - avail_acts = self.env.get_action_mask() - mb_avail_actions.append(avail_acts) + if self.config['is_double'] == True: + config = { + 'name' : 'agent', + 'inputs' : self.input_next_obs, + 'actions_num' : actions_num, + } + self.next_qvalues = tf.stop_gradient(self.network(config, reuse=True)) + + self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') + self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') + + #(n_agents, 1) + self.current_action_qvalues = tf.reduce_sum(tf.one_hot(self.actions_ph, actions_num) * self.qvalues, reduction_indices = 1) + + if self.config['is_double'] == True: + self.next_selected_actions = tf.argmax(self.next_qvalues, axis = 1) + self.next_selected_actions_onehot = tf.one_hot(self.next_selected_actions, actions_num) + self.next_state_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) + else: + 
self.next_state_values_target = tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) + + def play_steps(self, steps, epsilon=0.0): + done_reward = None + done_shaped_reward = None + done_steps = None + steps_rewards = 0 + cur_gamma = 1 + cur_obs_act_rew_len = len(self.obs_act_rew) + + # always break after one while True: - step_count += 1 - step_act = self.get_action(obs, avail_acts, epsilon) - next_obs, rewards, dones, _ = self.env.step(step_act) - mb_actions.append(step_act) - mb_obs.append(next_obs) - mb_rewards.append(rewards) - mb_dones.append(dones) - mb_states.append(self.env.get_state()) - - obs = next_obs + if cur_obs_act_rew_len > 0: + obs = self.obs_act_rew[-1][0] + else: + obs = self.current_obs obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) - avail_acts = self.env.get_action_mask() - mb_avail_actions.append(avail_acts) - - if all(dones) or self.steps_num < step_count: + + action = self.get_action(obs, self.env.get_action_mask(), epsilon) + print(action) + print(self.sess.run(self.qvalues, {self.obs_ph: obs})) + print(self.sess.run(self.next_state_values_target, {self.obs_ph: obs, self.actions_ph: action})) + new_obs, reward, is_done, _ = self.env.step(action) + #reward = reward * (1 - is_done) + + self.step_count += 1 + self.total_reward += reward + shaped_reward = self.rewards_shaper(reward) + self.total_shaped_reward += shaped_reward + self.obs_act_rew.append([new_obs, action, shaped_reward]) + + if len(self.obs_act_rew) < steps: break + + for i in range(steps): + sreward = self.obs_act_rew[i][2] + steps_rewards += sreward * cur_gamma + cur_gamma = cur_gamma * self.gamma + + next_obs, current_action, _ = self.obs_act_rew[0] + self.exp_buffer.add(self.current_obs, current_action, steps_rewards, new_obs, is_done) + self.current_obs = next_obs + break - + if all(is_done): + done_reward = self.total_reward + done_steps = self.step_count + done_shaped_reward = self.total_shaped_reward + self._reset() + return done_reward, done_shaped_reward, done_steps + + def _reset(self): + self.obs_act_rew.clear() + if self.env_name: + self.current_obs = self.env.reset() + self.total_reward = 0.0 + self.total_shaped_reward = 0.0 + self.step_count = 0 + def get_action(self, obs, avail_acts, epsilon=0.0): - print(obs.shape) if np.random.random() < epsilon: action = self.env.action_space.sample() else: @@ -147,7 +206,9 @@ def get_qvalues(self, obs): return self.sess.run(self.qvalues, {self.obs_ph: obs}) def train(self): - self.play_episode() + for _ in range(5): + self.play_steps(steps=3) + From 76fbc524b355f1162b792f8128f6f15ddf459a05 Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 20:50:11 +0100 Subject: [PATCH 04/14] state added to exp replay --- algos_tf14/vdnagent.py | 233 ++++++++++++++++++++++++++++++++--------- common/experience.py | 11 +- 2 files changed, 191 insertions(+), 53 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index c1dbf5ea..f356a14c 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -86,20 +86,27 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 self.setup_qvalues(actions_num) - if self.env_name: - self.sess.run(tf.global_variables_initializer()) -# self.reg_loss = tf.losses.get_regularization_loss() -# self.td_loss_mean += self.reg_loss -# self.learning_rate = self.config['learning_rate'] -# self.train_step = tf.train.AdamOptimizer(self.learning_rate * 
self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) - -# self.saver = tf.train.Saver() -# self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] -# self.variables = TensorFlowVariables(self.qvalues, self.sess) + self.reg_loss = tf.losses.get_regularization_loss() + self.td_loss_mean += self.reg_loss + self.learning_rate = self.config['learning_rate'] + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + + self.saver = tf.train.Saver() + self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] + self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() + def get_weights(self): + return self.variables.get_flat() + + def set_weights(self, weights): + return self.variables.set_flat(weights) + + def update_epoch(self): + return self.sess.run([self.update_epoch_op])[0] + def setup_qvalues(self, actions_num): config = { 'name' : 'agent', @@ -132,9 +139,56 @@ def setup_qvalues(self, actions_num): if self.config['is_double'] == True: self.next_selected_actions = tf.argmax(self.next_qvalues, axis = 1) self.next_selected_actions_onehot = tf.one_hot(self.next_selected_actions, actions_num) - self.next_state_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) + self.next_obs_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) + else: + self.next_obs_values_target = tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) + + ##MIXING: + self.current_action_qvalues_mix = tf.reduce_sum(self.current_action_qvalues, axis=0) + self.target_action_qvalues_mix = tf.reduce_sum(self.next_obs_values_target, axis=0) + + self.reference_qvalues = self.rewards_ph + self.gamma_step *self.is_not_done * self.target_action_qvalues_mix + + if self.is_prioritized: + # we need to return l1 loss to update priority buffer + self.abs_errors = tf.abs(self.current_action_qvalues_mix - self.reference_qvalues) + 1e-5 + # the same as multiply gradients later (other way is used in different examples over internet) + self.td_loss = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.NONE) * self.sample_weights_ph + self.td_loss_mean = tf.reduce_mean(self.td_loss) + else: + self.td_loss_mean = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.MEAN) + + self.reg_loss = tf.losses.get_regularization_loss() + self.td_loss_mean += self.reg_loss + self.learning_rate = self.config['learning_rate'] + if self.env_name: + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + + def save(self, fn): + self.saver.save(self.sess, fn) + + def restore(self, fn): + self.saver.restore(self.sess, fn) + + def _reset(self): + self.obs_act_rew.clear() + if self.env_name: + self.current_obs = self.env.reset() + self.total_reward = 0.0 + self.total_shaped_reward = 0.0 + self.step_count = 0 + + def get_action(self, obs, avail_acts, epsilon=0.0): + if np.random.random() < epsilon: + action = self.env.action_space.sample() else: - self.next_state_values_target = 
tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) + qvals = self.get_qvalues(obs) + qvals[avail_acts == False] = -9999999 + action = np.argmax(qvals, axis=1) + return action + + def get_qvalues(self, obs): + return self.sess.run(self.qvalues, {self.obs_ph: obs}) def play_steps(self, steps, epsilon=0.0): done_reward = None @@ -151,11 +205,9 @@ def play_steps(self, steps, epsilon=0.0): else: obs = self.current_obs obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) + state = self.env.get_state() action = self.get_action(obs, self.env.get_action_mask(), epsilon) - print(action) - print(self.sess.run(self.qvalues, {self.obs_ph: obs})) - print(self.sess.run(self.next_state_values_target, {self.obs_ph: obs, self.actions_ph: action})) new_obs, reward, is_done, _ = self.env.step(action) #reward = reward * (1 - is_done) @@ -163,7 +215,7 @@ def play_steps(self, steps, epsilon=0.0): self.total_reward += reward shaped_reward = self.rewards_shaper(reward) self.total_shaped_reward += shaped_reward - self.obs_act_rew.append([new_obs, action, shaped_reward]) + self.obs_act_rew.append([new_obs, action, shaped_reward, state]) if len(self.obs_act_rew) < steps: break @@ -173,8 +225,8 @@ def play_steps(self, steps, epsilon=0.0): steps_rewards += sreward * cur_gamma cur_gamma = cur_gamma * self.gamma - next_obs, current_action, _ = self.obs_act_rew[0] - self.exp_buffer.add(self.current_obs, current_action, steps_rewards, new_obs, is_done) + next_obs, current_action, _, current_st = self.obs_act_rew[0] + self.exp_buffer.add(self.current_obs, current_action, current_st, steps_rewards, new_obs, is_done) self.current_obs = next_obs break @@ -184,37 +236,122 @@ def play_steps(self, steps, epsilon=0.0): done_shaped_reward = self.total_shaped_reward self._reset() return done_reward, done_shaped_reward, done_steps - - def _reset(self): - self.obs_act_rew.clear() - if self.env_name: - self.current_obs = self.env.reset() - self.total_reward = 0.0 - self.total_shaped_reward = 0.0 - self.step_count = 0 - - def get_action(self, obs, avail_acts, epsilon=0.0): - if np.random.random() < epsilon: - action = self.env.action_space.sample() - else: - qvals = self.get_qvalues(obs) - qvals[avail_acts == False] = -9999999 - action = np.argmax(qvals, axis=1) - return action - def get_qvalues(self, obs): - return self.sess.run(self.qvalues, {self.obs_ph: obs}) - + def load_weights_into_target_network(self): + self.sess.run(self.assigns_op) + + def sample_batch(self, exp_replay, batch_size): + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size) + return { + self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, + self.rewards_ph:reward_batch, self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch + } + + def sample_prioritized_batch(self, exp_replay, batch_size, beta): + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch, sample_weights, sample_idxes = exp_replay.sample(batch_size, beta) + batch = { self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, self.rewards_ph:reward_batch, + self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch, self.sample_weights_ph: sample_weights } + return [batch , sample_idxes] + def train(self): - for _ in range(5): - self.play_steps(steps=3) - - - - - - - - + mem_free_steps = 0 + last_mean_rewards = -100500 + epoch_num = 0 + frame = 0 + update_time = 0 + play_time = 0 + + start_time = time.time() + total_time = 0 + 
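# Aside: a minimal NumPy sketch of the VDN target that setup_qvalues builds above,
# assuming the same shapes as the surrounding code ((n_agents, actions_num) Q-tables,
# a shared scalar reward and a shared done flag). The joint Q-value is the sum of the
# per-agent chosen Q-values, and the n-step target is
# r + gamma**steps * (1 - done) * sum over agents of max_a Q_target.
# The helper name and the sample numbers below are illustrative only, not part of the patch.
import numpy as np

def vdn_td_target(agent_q, target_q, actions, reward, done, gamma, steps):
    # agent_q, target_q: (n_agents, actions_num); actions: (n_agents,) chosen action indices
    n_agents = agent_q.shape[0]
    current_mix = agent_q[np.arange(n_agents), actions].sum()  # summed chosen Q-values (joint Q)
    target_mix = target_q.max(axis=1).sum()                    # summed greedy target Q-values
    reference_q = reward + (gamma ** steps) * (1.0 - done) * target_mix
    return reference_q, current_mix - reference_q              # TD target and TD error

# Example with 3 agents and 5 actions:
q = np.random.rand(3, 5)
tq = np.random.rand(3, 5)
ref_q, td_err = vdn_td_target(q, tq, actions=np.array([0, 2, 4]), reward=1.0, done=0.0, gamma=0.99, steps=3)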
self.load_weights_into_target_network() + for _ in range(0, self.config['num_steps_fill_buffer']): + self.play_steps(self.steps_num, self.epsilon) + steps_per_epoch = self.config['steps_per_epoch'] + num_epochs_to_copy = self.config['num_epochs_to_copy'] + batch_size = self.config['batch_size'] + lives_reward = self.config['lives_reward'] + episodes_to_log = self.config['episodes_to_log'] + frame = 0 + play_time = 0 + update_time = 0 + rewards = [] + shaped_rewards = [] + steps = [] + losses = deque([], maxlen=100) +# while True: +# epoch_num = self.update_epoch() +# t_play_start = time.time() +# self.epsilon = self.epsilon_processor(frame) +# self.beta = self.beta_processor(frame) + +# for _ in range(0, steps_per_epoch): +# reward, shaped_reward, step = self.play_steps(self.steps_num, self.epsilon) +# if reward != None: +# self.game_lengths.append(step) +# self.game_rewards.append(reward) +# #shaped_rewards.append(shaped_reward) +# t_play_end = time.time() +# play_time += t_play_end - t_play_start + +# # train +# frame = frame + steps_per_epoch +# t_start = time.time() +# if self.is_prioritized: +# batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) +# _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) +# self.exp_buffer.update_priorities(idxes, errors_update) +# else: +# batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) +# _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) + +# losses.append(loss_t) +# t_end = time.time() +# update_time += t_end - t_start +# total_time += update_time +# if frame % 1000 == 0: +# mem_free_steps += 1 +# if mem_free_steps == 10: +# mem_free_steps = 0 +# tr_helpers.free_mem() +# sum_time = update_time + play_time +# print('frames per seconds: ', 1000 / (sum_time)) +# self.writer.add_scalar('performance/fps', 1000 / sum_time, frame) +# self.writer.add_scalar('performance/upd_time', update_time, frame) +# self.writer.add_scalar('performance/play_time', play_time, frame) +# self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) +# self.writer.add_scalar('info/lr_mul', lr_mul, frame) +# self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) +# self.writer.add_scalar('info/epochs', epoch_num, frame) +# self.writer.add_scalar('info/epsilon', self.epsilon, frame) +# if self.is_prioritized: +# self.writer.add_scalar('beta', self.beta, frame) + +# update_time = 0 +# play_time = 0 +# num_games = len(self.game_rewards) +# if num_games > 10: +# d = num_games / lives_reward +# mean_rewards = np.sum(self.game_rewards) / d +# mean_lengths = np.sum(self.game_lengths) / d +# self.writer.add_scalar('rewards/mean', mean_rewards, frame) +# self.writer.add_scalar('rewards/time', mean_rewards, total_time) +# self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) +# self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) + +# if mean_rewards > last_mean_rewards: +# print('saving next best rewards: ', mean_rewards) +# last_mean_rewards = mean_rewards +# self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards)) +# if last_mean_rewards > self.config['score_to_win']: +# print('network won!') +# return last_mean_rewards, epoch_num + +# if frame % num_epochs_to_copy == 0: +# self.load_weigths_into_target_network() + +# if epoch_num >= self.max_epochs: +# print('Max epochs reached') +# self.save("./nn/" + 
'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) +# return last_mean_rewards, epoch_num diff --git a/common/experience.py b/common/experience.py index 28885e91..2765db79 100644 --- a/common/experience.py +++ b/common/experience.py @@ -20,8 +20,8 @@ def __init__(self, size): def __len__(self): return len(self._storage) - def add(self, obs_t, action, reward, obs_tp1, done): - data = (obs_t, action, reward, obs_tp1, done) + def add(self, obs_t, action, state_t, reward, obs_tp1, done): + data = (obs_t, action, state_t, reward, obs_tp1, done) if self._next_idx >= len(self._storage): self._storage.append(data) @@ -30,16 +30,17 @@ def add(self, obs_t, action, reward, obs_tp1, done): self._next_idx = (self._next_idx + 1) % self._maxsize def _encode_sample(self, idxes): - obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] + obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [] for i in idxes: data = self._storage[i] - obs_t, action, reward, obs_tp1, done = data + obs_t, action, state_t, reward, obs_tp1, done = data obses_t.append(np.array(obs_t, copy=False)) actions.append(np.array(action, copy=False)) + states_t.append(np.array(state_t, copy=False)) rewards.append(reward) obses_tp1.append(np.array(obs_tp1, copy=False)) dones.append(done) - return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) + return np.array(obses_t), np.array(actions), np.array(states_t), np.array(rewards), np.array(obses_tp1), np.array(dones) def sample(self, batch_size): """Sample a batch of experiences. From 8dc9c6127e15a811adf52f181254fa99e79b8122 Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 21:54:54 +0100 Subject: [PATCH 05/14] rudimentary vdn ready --- algos_tf14/vdnagent.py | 139 +++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 69 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index f356a14c..901793e9 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -9,6 +9,7 @@ from datetime import datetime from algos_tf14.tensorflow_utils import TensorFlowVariables from common.categorical import CategoricalQ +import tensorflow_probability as tfp class VDNAgent: def __init__(self, sess, base_name, observation_space, action_space, config, logger): @@ -180,7 +181,7 @@ def _reset(self): def get_action(self, obs, avail_acts, epsilon=0.0): if np.random.random() < epsilon: - action = self.env.action_space.sample() + action = tfp.distributions.Categorical(probs=avail_acts.astype(float)).sample().eval(session=self.sess) else: qvals = self.get_qvalues(obs) qvals[avail_acts == False] = -9999999 @@ -279,79 +280,79 @@ def train(self): steps = [] losses = deque([], maxlen=100) -# while True: -# epoch_num = self.update_epoch() -# t_play_start = time.time() -# self.epsilon = self.epsilon_processor(frame) -# self.beta = self.beta_processor(frame) + while True: + epoch_num = self.update_epoch() + t_play_start = time.time() + self.epsilon = self.epsilon_processor(frame) + self.beta = self.beta_processor(frame) -# for _ in range(0, steps_per_epoch): -# reward, shaped_reward, step = self.play_steps(self.steps_num, self.epsilon) -# if reward != None: -# self.game_lengths.append(step) -# self.game_rewards.append(reward) -# #shaped_rewards.append(shaped_reward) + for _ in range(0, steps_per_epoch): + reward, shaped_reward, step = self.play_steps(self.steps_num, self.epsilon) + if reward != None: + 
self.game_lengths.append(step) + self.game_rewards.append(reward) + #shaped_rewards.append(shaped_reward) -# t_play_end = time.time() -# play_time += t_play_end - t_play_start + t_play_end = time.time() + play_time += t_play_end - t_play_start -# # train -# frame = frame + steps_per_epoch -# t_start = time.time() -# if self.is_prioritized: -# batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) -# _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) -# self.exp_buffer.update_priorities(idxes, errors_update) -# else: -# batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) -# _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) + # train + frame = frame + steps_per_epoch + t_start = time.time() + if self.is_prioritized: + batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) + _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) + self.exp_buffer.update_priorities(idxes, errors_update) + else: + batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) + _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) -# losses.append(loss_t) -# t_end = time.time() -# update_time += t_end - t_start -# total_time += update_time -# if frame % 1000 == 0: -# mem_free_steps += 1 -# if mem_free_steps == 10: -# mem_free_steps = 0 -# tr_helpers.free_mem() -# sum_time = update_time + play_time -# print('frames per seconds: ', 1000 / (sum_time)) -# self.writer.add_scalar('performance/fps', 1000 / sum_time, frame) -# self.writer.add_scalar('performance/upd_time', update_time, frame) -# self.writer.add_scalar('performance/play_time', play_time, frame) -# self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) -# self.writer.add_scalar('info/lr_mul', lr_mul, frame) -# self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) -# self.writer.add_scalar('info/epochs', epoch_num, frame) -# self.writer.add_scalar('info/epsilon', self.epsilon, frame) -# if self.is_prioritized: -# self.writer.add_scalar('beta', self.beta, frame) + losses.append(loss_t) + t_end = time.time() + update_time += t_end - t_start + total_time += update_time + if frame % 1000 == 0: + mem_free_steps += 1 + if mem_free_steps == 10: + mem_free_steps = 0 + tr_helpers.free_mem() + sum_time = update_time + play_time + print('frames per seconds: ', 1000 / (sum_time)) + self.writer.add_scalar('performance/fps', 1000 / sum_time, frame) + self.writer.add_scalar('performance/upd_time', update_time, frame) + self.writer.add_scalar('performance/play_time', play_time, frame) + self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) + self.writer.add_scalar('info/lr_mul', lr_mul, frame) + self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) + self.writer.add_scalar('info/epochs', epoch_num, frame) + self.writer.add_scalar('info/epsilon', self.epsilon, frame) + if self.is_prioritized: + self.writer.add_scalar('beta', self.beta, frame) -# update_time = 0 -# play_time = 0 -# num_games = len(self.game_rewards) -# if num_games > 10: -# d = num_games / lives_reward -# mean_rewards = np.sum(self.game_rewards) / d -# mean_lengths = np.sum(self.game_lengths) / d -# self.writer.add_scalar('rewards/mean', mean_rewards, frame) -# self.writer.add_scalar('rewards/time', 
mean_rewards, total_time) -# self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) -# self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) + update_time = 0 + play_time = 0 + num_games = len(self.game_rewards) + if num_games > 10: + d = num_games / lives_reward + mean_rewards = np.sum(self.game_rewards) / d + mean_lengths = np.sum(self.game_lengths) / d + self.writer.add_scalar('rewards/mean', mean_rewards, frame) + self.writer.add_scalar('rewards/time', mean_rewards, total_time) + self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) + self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) -# if mean_rewards > last_mean_rewards: -# print('saving next best rewards: ', mean_rewards) -# last_mean_rewards = mean_rewards -# self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards)) -# if last_mean_rewards > self.config['score_to_win']: -# print('network won!') -# return last_mean_rewards, epoch_num + if mean_rewards > last_mean_rewards: + print('saving next best rewards: ', mean_rewards) + last_mean_rewards = mean_rewards + self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards)) + if last_mean_rewards > self.config['score_to_win']: + print('network won!') + return last_mean_rewards, epoch_num -# if frame % num_epochs_to_copy == 0: -# self.load_weigths_into_target_network() + if frame % num_epochs_to_copy == 0: + self.load_weigths_into_target_network() -# if epoch_num >= self.max_epochs: -# print('Max epochs reached') -# self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) -# return last_mean_rewards, epoch_num + if epoch_num >= self.max_epochs: + print('Max epochs reached') + self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) + return last_mean_rewards, epoch_num From b3a2870ee6d9dc48016a87a52a191f09f9275ff1 Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 00:24:24 +0100 Subject: [PATCH 06/14] vdn as a model --- algos_tf14/model_builder.py | 1 + algos_tf14/models.py | 37 ++++++++++++++++++++++++++++++++++ algos_tf14/vdnagent.py | 40 +++++++------------------------------ 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/algos_tf14/model_builder.py b/algos_tf14/model_builder.py index dd58a9dd..ade95ba2 100644 --- a/algos_tf14/model_builder.py +++ b/algos_tf14/model_builder.py @@ -13,6 +13,7 @@ def __init__(self): self.model_factory.register_builder('continuous_a2c_lstm', lambda network, **kwargs : models.LSTMModelA2CContinuous(network)) self.model_factory.register_builder('continuous_a2c_lstm_logstd', lambda network, **kwargs : models.LSTMModelA2CContinuousLogStd(network)) self.model_factory.register_builder('dqn', lambda network, **kwargs : models.AtariDQN(network)) + self.model_factory.register_builder('vdn', lambda network, **kwargs : models.VDN_DQN(network)) self.network_factory = object_factory.ObjectFactory() diff --git a/algos_tf14/models.py b/algos_tf14/models.py index 117b1047..70f84a6a 100644 --- a/algos_tf14/models.py +++ b/algos_tf14/models.py @@ -245,3 +245,40 @@ def __call__(self, dict, reuse=False): ''' is_train = name == 'agent' return self.network(name=name, inputs=inputs, actions_num=actions_num, is_train=is_train, reuse=reuse) + + +class VDN_DQN(BaseModel): + def __init__(self, network): + self.network = 
network + + def __call__(self, dict): + input_obs = dict['input_obs'] + input_next_obs = dict['input_next_obs'] + actions_num = dict['actions_num'] + is_double = dict['is_double'] + actions_ph = dict['actions_ph'] + + ''' + TODO: fix is_train + ''' + # is_train = name == 'agent' + + # (n_agents, n_actions) + qvalues = self.network(name='agent', inputs=input_obs, actions_num=actions_num, is_train=True, reuse=False) + target_qvalues = tf.stop_gradient(self.network(name='target', inputs=input_next_obs, actions_num=actions_num, is_train=False, reuse=False)) + current_action_qvalues = tf.reduce_sum(tf.one_hot(actions_ph, actions_num) * qvalues, + reduction_indices=1) + if is_double: + next_qvalues = tf.stop_gradient(self.network(name='agent', inputs=input_next_obs, actions_num=actions_num, is_train=True, reuse=True)) + next_selected_actions = tf.argmax(next_qvalues, axis=1) + next_selected_actions_onehot = tf.one_hot(next_selected_actions, actions_num) + next_obs_values_target = tf.stop_gradient( + tf.reduce_sum(target_qvalues * next_selected_actions_onehot, reduction_indices=[1, ])) + else: + next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, reduction_indices=1)) + + ##MIXING: + current_action_qvalues_mix = tf.reduce_sum(current_action_qvalues, axis=0) + target_action_qvalues_mix = tf.reduce_sum(next_obs_values_target, axis=0) + + return current_action_qvalues_mix, target_action_qvalues_mix diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 901793e9..34aa6668 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -110,44 +110,18 @@ def update_epoch(self): def setup_qvalues(self, actions_num): config = { - 'name' : 'agent', - 'inputs' : self.input_obs, + 'input_obs' : self.input_obs, + 'input_next_obs': self.input_next_obs, 'actions_num' : actions_num, + 'is_double': self.config['is_double'], + 'actions_ph': self.actions_ph } - #(n_agents, n_actions) - self.qvalues = self.network(config, reuse=False) - config = { - 'name' : 'target', - 'inputs' : self.input_next_obs, - 'actions_num' : actions_num, - } - self.target_qvalues = tf.stop_gradient(self.network(config, reuse=False)) - - if self.config['is_double'] == True: - config = { - 'name' : 'agent', - 'inputs' : self.input_next_obs, - 'actions_num' : actions_num, - } - self.next_qvalues = tf.stop_gradient(self.network(config, reuse=True)) + + self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') - - #(n_agents, 1) - self.current_action_qvalues = tf.reduce_sum(tf.one_hot(self.actions_ph, actions_num) * self.qvalues, reduction_indices = 1) - - if self.config['is_double'] == True: - self.next_selected_actions = tf.argmax(self.next_qvalues, axis = 1) - self.next_selected_actions_onehot = tf.one_hot(self.next_selected_actions, actions_num) - self.next_obs_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) - else: - self.next_obs_values_target = tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) - - ##MIXING: - self.current_action_qvalues_mix = tf.reduce_sum(self.current_action_qvalues, axis=0) - self.target_action_qvalues_mix = tf.reduce_sum(self.next_obs_values_target, axis=0) - + self.reference_qvalues = self.rewards_ph + self.gamma_step *self.is_not_done * self.target_action_qvalues_mix if 
self.is_prioritized: From 55b60bfe9426b3bb9d9f0a2adcc41b097daf2838 Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 01:08:07 +0100 Subject: [PATCH 07/14] vdn conf --- configs/vdn_3s5z_vs_3s6z.yaml | 62 +++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 configs/vdn_3s5z_vs_3s6z.yaml diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml new file mode 100644 index 00000000..2ea47834 --- /dev/null +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -0,0 +1,62 @@ +label: "" +name: "" +params: + algo: + name: vdn + + model: + name: vdn + + load_checkpoint: False + load_path: "" + + network: + name: dqn + dueling: True + atoms: 1 + noisy: False + mlp: + units: [256] + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + + config: + reward_shaper: + scale_value: 0.1 + gamma: 0.99 + learning_rate: 0.0005 + steps_per_epoch: 4 + batch_size: 128 + epsilon: 1.0 + min_epsilon: 0.05 + epsilon_decay_frames: 100000 + num_epochs_to_copy: 10000 + env_name: smac_cnn + name: 3s5z_vs_3s6z + is_double: True + score_to_win: 20 + num_steps_fill_buffer: 10000 + replay_buffer_type: 'normal' + replay_buffer_size: 100000 + priority_beta: 0.4 + priority_alpha: 0.6 + beta_decay_frames: 100000 + max_beta: 1 + steps_num: 128 +# episodes_to_log: 10 + atoms_num: 1 + games_to_track: 20 + lr_schedule: None + max_epochs: 100000 + grad_norm: 0.5 + mix_with_state: False + + env_config: + name: 3s5z_vs_3s6z + frames: 4 + transpose: True + random_invalid_step: False \ No newline at end of file From b22648f3c14489e4a934dddd02c91c1039be5967 Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 01:24:20 +0100 Subject: [PATCH 08/14] env config use in vdn --- algos_tf14/vdnagent.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 34aa6668..b0e45b49 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -48,7 +48,8 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) if self.env_name: - self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) + self.env_config = config.get('env_config', {}) + self.env = env_configurations.configurations[self.env_name]['env_creator'](**self.env_config) self.sess = sess self.steps_num = self.config['steps_num'] @@ -94,16 +95,16 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.saver = tf.train.Saver() self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] - self.variables = TensorFlowVariables(self.qvalues, self.sess) + # self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() - def get_weights(self): - return self.variables.get_flat() - - def set_weights(self, weights): - return self.variables.set_flat(weights) + # def get_weights(self): + # return self.variables.get_flat() + # + # def set_weights(self, weights): + # return self.variables.set_flat(weights) def update_epoch(self): return self.sess.run([self.update_epoch_op])[0] From 
fb0161d09ee7f8b3f115b1536a70f03e9669a6ad Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 01:26:00 +0100 Subject: [PATCH 09/14] bug correct --- algos_tf14/vdnagent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index b0e45b49..b2bbd693 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -325,7 +325,7 @@ def train(self): return last_mean_rewards, epoch_num if frame % num_epochs_to_copy == 0: - self.load_weigths_into_target_network() + self.load_weights_into_target_network() if epoch_num >= self.max_epochs: print('Max epochs reached') From cbf85febad74acbf5bfe8dae187f6b8fd5952a9c Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 20:00:11 +0100 Subject: [PATCH 10/14] grad norm with truncate option and a bug update --- algos_tf14/models.py | 2 +- algos_tf14/vdnagent.py | 23 ++++++++++++++--------- configs/vdn_3s5z_vs_3s6z.yaml | 1 + 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/algos_tf14/models.py b/algos_tf14/models.py index 70f84a6a..81e88c4b 100644 --- a/algos_tf14/models.py +++ b/algos_tf14/models.py @@ -281,4 +281,4 @@ def __call__(self, dict): current_action_qvalues_mix = tf.reduce_sum(current_action_qvalues, axis=0) target_action_qvalues_mix = tf.reduce_sum(next_obs_values_target, axis=0) - return current_action_qvalues_mix, target_action_qvalues_mix + return qvalues, current_action_qvalues_mix, target_action_qvalues_mix diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index b2bbd693..9f184556 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -27,7 +27,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.game_rewards = deque([], maxlen=self.games_to_track) self.game_lengths = deque([], maxlen=self.games_to_track) - self.epoch_num = tf.Variable( tf.constant(0, shape=(), dtype=tf.float32), trainable=False) + self.epoch_num = tf.Variable(tf.constant(0, shape=(), dtype=tf.float32), trainable=False) self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1) self.current_lr = self.learning_rate_ph @@ -91,20 +91,25 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.reg_loss = tf.losses.get_regularization_loss() self.td_loss_mean += self.reg_loss self.learning_rate = self.config['learning_rate'] - self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier)#.minimize(self.td_loss_mean, var_list=self.weights) + grads = tf.gradients(self.td_loss_mean, self.weights) + if self.config['truncate_grads']: + grads, _ = tf.clip_by_global_norm(grads, self.grad_norm) + grads = list(zip(grads, self.weights)) + self.train_op = self.train_step.apply_gradients(grads) self.saver = tf.train.Saver() self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] - # self.variables = TensorFlowVariables(self.qvalues, self.sess) + self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() - # def get_weights(self): - # return self.variables.get_flat() - # - # def set_weights(self, weights): - # return self.variables.set_flat(weights) + def get_weights(self): + return self.variables.get_flat() + + def set_weights(self, weights): + return self.variables.set_flat(weights) def 
update_epoch(self): return self.sess.run([self.update_epoch_op])[0] @@ -118,7 +123,7 @@ def setup_qvalues(self, actions_num): 'actions_ph': self.actions_ph } - self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) + self.qvalues, self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml index 2ea47834..75e6f51a 100644 --- a/configs/vdn_3s5z_vs_3s6z.yaml +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -54,6 +54,7 @@ params: max_epochs: 100000 grad_norm: 0.5 mix_with_state: False + truncate_grads: True env_config: name: 3s5z_vs_3s6z From 279bf6a00d33f64d5f1fce6a461b51f5f0540a0f Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 20:09:45 +0100 Subject: [PATCH 11/14] bug correct --- algos_tf14/vdnagent.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 9f184556..084a12c0 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -281,11 +281,12 @@ def train(self): t_start = time.time() if self.is_prioritized: batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) - _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) + _, loss_t, errors_update, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) self.exp_buffer.update_priorities(idxes, errors_update) else: batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) - _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) + print(self.sess.run(self.qvalues, batch).shape) + _, loss_t, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.lr_multiplier], batch) losses.append(loss_t) t_end = time.time() From e8e8f7e832f11bcee5d46f2a355152ae82a445dd Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 21:32:38 +0100 Subject: [PATCH 12/14] major changes --- algos_tf14/models.py | 37 +++++-- algos_tf14/vdnagent.py | 202 ++++++++++++++++++++-------------- common/experience.py | 2 +- configs/vdn_3s5z_vs_3s6z.yaml | 27 ++++- 4 files changed, 176 insertions(+), 92 deletions(-) diff --git a/algos_tf14/models.py b/algos_tf14/models.py index 81e88c4b..b0e531e4 100644 --- a/algos_tf14/models.py +++ b/algos_tf14/models.py @@ -256,29 +256,50 @@ def __call__(self, dict): input_next_obs = dict['input_next_obs'] actions_num = dict['actions_num'] is_double = dict['is_double'] + # (bs * n_agents, 1) actions_ph = dict['actions_ph'] + batch_size_ph = dict['batch_size_ph'] + n_agents = dict['n_agents'] ''' TODO: fix is_train ''' # is_train = name == 'agent' - # (n_agents, n_actions) + # (bs * n_agents, n_actions) qvalues = self.network(name='agent', inputs=input_obs, actions_num=actions_num, is_train=True, reuse=False) + # (bs, n_agents, n_actions) + qvalues = tf.reshape(qvalues, [batch_size_ph, n_agents, actions_num]) + # (bs * n_agents, n_actions) target_qvalues = tf.stop_gradient(self.network(name='target', inputs=input_next_obs, actions_num=actions_num, is_train=False, reuse=False)) - current_action_qvalues = tf.reduce_sum(tf.one_hot(actions_ph, actions_num) * qvalues, - reduction_indices=1) + # (bs, 
n_agents, n_actions) + target_qvalues = tf.reshape(target_qvalues, [batch_size_ph, n_agents, actions_num]) + + # (bs * n_agents, 1, actions_num) + # (bs, n_agents, actions_num) + one_hot_actions = tf.reshape(tf.one_hot(actions_ph, actions_num), [batch_size_ph, n_agents, actions_num]) + # (bs, n_agents, 1) + current_action_qvalues = tf.reduce_sum(one_hot_actions * qvalues, axis=2) + if is_double: + # (bs * n_agents, n_actions) next_qvalues = tf.stop_gradient(self.network(name='agent', inputs=input_next_obs, actions_num=actions_num, is_train=True, reuse=True)) + # (bs * n_agents, 1) next_selected_actions = tf.argmax(next_qvalues, axis=1) - next_selected_actions_onehot = tf.one_hot(next_selected_actions, actions_num) + # (bs*n_agents, 1, n_actions) + # (bs, n_agents, actions_num) + next_selected_actions_onehot = tf.reshape(tf.one_hot(next_selected_actions, actions_num), [batch_size_ph, n_agents, actions_num]) + # (bs, n_agents, 1) next_obs_values_target = tf.stop_gradient( - tf.reduce_sum(target_qvalues * next_selected_actions_onehot, reduction_indices=[1, ])) + tf.reduce_sum(target_qvalues * next_selected_actions_onehot, axis=2)) else: - next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, reduction_indices=1)) + # (bs, n_agents, 1) + next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, axis=2)) ##MIXING: - current_action_qvalues_mix = tf.reduce_sum(current_action_qvalues, axis=0) - target_action_qvalues_mix = tf.reduce_sum(next_obs_values_target, axis=0) + # (bs, 1) + current_action_qvalues_mix = tf.reshape(tf.reduce_sum(current_action_qvalues, axis=1), [batch_size_ph, 1]) + # (bs, 1, 1) + target_action_qvalues_mix = tf.reshape(tf.reduce_sum(next_obs_values_target, axis=1), [batch_size_ph, 1]) return qvalues, current_action_qvalues_mix, target_action_qvalues_mix diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 084a12c0..bce361e3 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -11,6 +11,7 @@ from common.categorical import CategoricalQ import tensorflow_probability as tfp + class VDNAgent: def __init__(self, sess, base_name, observation_space, action_space, config, logger): observation_shape = observation_space.shape @@ -20,7 +21,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay' self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay' self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32) - self.learning_rate_ph = tf.placeholder('float32', (), name = 'lr_ph') + self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph') self.games_to_track = tr_helpers.get_or_default(config, 'games_to_track', 100) self.max_epochs = tr_helpers.get_or_default(self.config, 'max_epochs', 1e6) @@ -34,51 +35,60 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log if self.is_adaptive_lr: self.lr_threshold = config['lr_threshold'] if self.is_polynom_decay_lr: - self.lr_multiplier = tf.train.polynomial_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, end_learning_rate=0.001, power=tr_helpers.get_or_default(config, 'decay_power', 1.0)) + self.lr_multiplier = tf.train.polynomial_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, + end_learning_rate=0.001, + power=tr_helpers.get_or_default(config, 'decay_power', 1.0)) if self.is_exp_decay_lr: - self.lr_multiplier = tf.train.exponential_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, 
decay_rate = config['decay_rate']) - + self.lr_multiplier = tf.train.exponential_decay(1.0, global_step=self.epoch_num, + decay_steps=self.max_epochs, + decay_rate=config['decay_rate']) + self.env_name = config['env_name'] self.network = config['network'] + self.batch_size = self.config['batch_size'] + self.obs_shape = observation_shape self.actions_num = actions_num self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("%d, %H:%M:%S")) self.epsilon = self.config['epsilon'] self.rewards_shaper = self.config['reward_shaper'] - self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) - self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) + self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], + self.config['epsilon_decay_frames']) + self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], + self.config['beta_decay_frames']) if self.env_name: self.env_config = config.get('env_config', {}) self.env = env_configurations.configurations[self.env_name]['env_creator'](**self.env_config) self.sess = sess self.steps_num = self.config['steps_num'] - + self.obs_act_rew = deque([], maxlen=self.steps_num) - + self.is_prioritized = config['replay_buffer_type'] != 'normal' self.atoms_num = self.config['atoms_num'] assert self.atoms_num == 1 - + self.state_shape = (self.env.env_info['state_shape'],) self.n_agents = self.env.env_info['n_agents'] - + if not self.is_prioritized: self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size']) - else: + else: self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha']) - self.sample_weights_ph = tf.placeholder(tf.float32, shape= [None,] , name='sample_weights') - - self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'obs_ph') - self.state_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape , name = 'state_ph') - self.actions_ph = tf.placeholder(tf.int32, shape=[None,], name = 'actions_ph') - self.rewards_ph = tf.placeholder(tf.float32, shape=[None,], name = 'rewards_ph') - self.next_obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'next_obs_ph') - self.is_done_ph = tf.placeholder(tf.float32, shape=[None,], name = 'is_done_ph') + self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights') + + self.batch_size_ph = tf.placeholder(tf.int32, name='batch_size_ph') + self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape, name='obs_ph') + self.state_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape, name='state_ph') + self.actions_ph = tf.placeholder(tf.int32, shape=[None, 1], name='actions_ph') + self.rewards_ph = tf.placeholder(tf.float32, shape=[None, 1], name='rewards_ph') + self.next_obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape, name='next_obs_ph') + self.is_done_ph = tf.placeholder(tf.float32, shape=[None, 1], name='is_done_ph') self.is_not_done = 1 - self.is_done_ph self.name = base_name - + self.gamma = self.config['gamma'] - self.gamma_step = self.gamma**self.steps_num + self.gamma_step = self.gamma ** self.steps_num self.grad_norm = config['grad_norm'] self.input_obs = 
self.obs_ph self.input_next_obs = self.next_obs_ph @@ -87,11 +97,12 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.input_obs = tf.to_float(self.input_obs) / 255.0 self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 self.setup_qvalues(actions_num) - + self.reg_loss = tf.losses.get_regularization_loss() self.td_loss_mean += self.reg_loss self.learning_rate = self.config['learning_rate'] - self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier)#.minimize(self.td_loss_mean, var_list=self.weights) + self.train_step = tf.train.AdamOptimizer( + self.learning_rate * self.lr_multiplier) # .minimize(self.td_loss_mean, var_list=self.weights) grads = tf.gradients(self.td_loss_mean, self.weights) if self.config['truncate_grads']: grads, _ = tf.clip_by_global_norm(grads, self.grad_norm) @@ -99,58 +110,65 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.train_op = self.train_step.apply_gradients(grads) self.saver = tf.train.Saver() - self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] + self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in + zip(self.weights, self.target_weights)] self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() - + def get_weights(self): return self.variables.get_flat() def set_weights(self, weights): return self.variables.set_flat(weights) - + def update_epoch(self): return self.sess.run([self.update_epoch_op])[0] - + def setup_qvalues(self, actions_num): config = { - 'input_obs' : self.input_obs, + 'input_obs': self.input_obs, 'input_next_obs': self.input_next_obs, - 'actions_num' : actions_num, + 'actions_num': actions_num, 'is_double': self.config['is_double'], - 'actions_ph': self.actions_ph + 'actions_ph': self.actions_ph, + 'batch_size_ph': self.batch_size_ph, + 'n_agents': self.n_agents } + # (bs, n_agents, n_actions), (bs, 1), (bs, 1) self.qvalues, self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') - self.reference_qvalues = self.rewards_ph + self.gamma_step *self.is_not_done * self.target_action_qvalues_mix - + self.reference_qvalues = self.rewards_ph + self.gamma_step * self.is_not_done * self.target_action_qvalues_mix + if self.is_prioritized: # we need to return l1 loss to update priority buffer self.abs_errors = tf.abs(self.current_action_qvalues_mix - self.reference_qvalues) + 1e-5 # the same as multiply gradients later (other way is used in different examples over internet) - self.td_loss = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.NONE) * self.sample_weights_ph - self.td_loss_mean = tf.reduce_mean(self.td_loss) + self.td_loss = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, + reduction=tf.losses.Reduction.NONE) * self.sample_weights_ph + self.td_loss_mean = tf.reduce_mean(self.td_loss) else: - self.td_loss_mean = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.MEAN) - + self.td_loss_mean = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, + 
reduction=tf.losses.Reduction.MEAN) + self.reg_loss = tf.losses.get_regularization_loss() self.td_loss_mean += self.reg_loss self.learning_rate = self.config['learning_rate'] if self.env_name: - self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) - + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize( + self.td_loss_mean, var_list=self.weights) + def save(self, fn): self.saver.save(self.sess, fn) def restore(self, fn): self.saver.restore(self.sess, fn) - + def _reset(self): self.obs_act_rew.clear() if self.env_name: @@ -158,19 +176,21 @@ def _reset(self): self.total_reward = 0.0 self.total_shaped_reward = 0.0 self.step_count = 0 - + def get_action(self, obs, avail_acts, epsilon=0.0): if np.random.random() < epsilon: action = tfp.distributions.Categorical(probs=avail_acts.astype(float)).sample().eval(session=self.sess) else: - qvals = self.get_qvalues(obs) + obs = obs.reshape((self.n_agents,) + self.obs_shape) + # (n_agents, num_actions) + qvals = self.get_qvalues(obs).squeeze(0) qvals[avail_acts == False] = -9999999 action = np.argmax(qvals, axis=1) - return action - + return action + def get_qvalues(self, obs): - return self.sess.run(self.qvalues, {self.obs_ph: obs}) - + return self.sess.run(self.qvalues, {self.obs_ph: obs, self.batch_size_ph: 1}) + def play_steps(self, steps, epsilon=0.0): done_reward = None done_shaped_reward = None @@ -190,8 +210,12 @@ def play_steps(self, steps, epsilon=0.0): action = self.get_action(obs, self.env.get_action_mask(), epsilon) new_obs, reward, is_done, _ = self.env.step(action) - #reward = reward * (1 - is_done) - + # reward = reward * (1 - is_done) + + # Same reward, done for all agents + reward = reward[0] + is_done = all(is_done) + self.step_count += 1 self.total_reward += reward shaped_reward = self.rewards_shaper(reward) @@ -210,29 +234,48 @@ def play_steps(self, steps, epsilon=0.0): self.exp_buffer.add(self.current_obs, current_action, current_st, steps_rewards, new_obs, is_done) self.current_obs = next_obs break - - if all(is_done): + + if is_done: done_reward = self.total_reward done_steps = self.step_count done_shaped_reward = self.total_shaped_reward self._reset() return done_reward, done_shaped_reward, done_steps - + def load_weights_into_target_network(self): self.sess.run(self.assigns_op) def sample_batch(self, exp_replay, batch_size): - obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size) + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size) + obs_batch = obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + act_batch = act_batch.reshape((batch_size * self.n_agents, 1)) + st_batch = st_batch.reshape((batch_size,) + self.state_shape) + next_obs_batch = next_obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + reward_batch = reward_batch.reshape((batch_size, 1)) + is_done_batch = is_done_batch.reshape((batch_size, 1)) + return { - self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, - self.rewards_ph:reward_batch, self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch + self.obs_ph: obs_batch, self.actions_ph: act_batch, self.state_ph: st_batch, + self.rewards_ph: reward_batch, self.is_done_ph: is_done_batch, self.next_obs_ph: next_obs_batch, + self.batch_size_ph: batch_size } def sample_prioritized_batch(self, exp_replay, batch_size, beta): - obs_batch, 
act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch, sample_weights, sample_idxes = exp_replay.sample(batch_size, beta) - batch = { self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, self.rewards_ph:reward_batch, - self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch, self.sample_weights_ph: sample_weights } - return [batch , sample_idxes] + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch, sample_weights, sample_idxes = exp_replay.sample( + batch_size, beta) + obs_batch = obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + act_batch = act_batch.reshape((batch_size * self.n_agents, 1)) + st_batch = st_batch.reshape((batch_size,) + self.state_shape) + next_obs_batch = next_obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + reward_batch = reward_batch.reshape((batch_size, 1)) + is_done_batch = is_done_batch.reshape((batch_size, 1)) + sample_weights = sample_weights.reshape((batch_size, 1)) + batch = {self.obs_ph: obs_batch, self.actions_ph: act_batch, self.state_ph: st_batch, + self.rewards_ph: reward_batch, + self.is_done_ph: is_done_batch, self.next_obs_ph: next_obs_batch, + self.sample_weights_ph: sample_weights, + self.batch_size_ph: batch_size} + return [batch, sample_idxes] def train(self): mem_free_steps = 0 @@ -249,9 +292,6 @@ def train(self): self.play_steps(self.steps_num, self.epsilon) steps_per_epoch = self.config['steps_per_epoch'] num_epochs_to_copy = self.config['num_epochs_to_copy'] - batch_size = self.config['batch_size'] - lives_reward = self.config['lives_reward'] - episodes_to_log = self.config['episodes_to_log'] frame = 0 play_time = 0 update_time = 0 @@ -259,7 +299,7 @@ def train(self): shaped_rewards = [] steps = [] losses = deque([], maxlen=100) - + while True: epoch_num = self.update_epoch() t_play_start = time.time() @@ -271,30 +311,32 @@ def train(self): if reward != None: self.game_lengths.append(step) self.game_rewards.append(reward) - #shaped_rewards.append(shaped_reward) + # shaped_rewards.append(shaped_reward) t_play_end = time.time() play_time += t_play_end - t_play_start - + # train frame = frame + steps_per_epoch t_start = time.time() if self.is_prioritized: - batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) - _, loss_t, errors_update, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) + batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=self.batch_size, + beta=self.beta) + _, loss_t, errors_update, lr_mul = self.sess.run( + [self.train_op, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) self.exp_buffer.update_priorities(idxes, errors_update) else: - batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) - print(self.sess.run(self.qvalues, batch).shape) - _, loss_t, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.lr_multiplier], batch) - + batch = self.sample_batch(self.exp_buffer, batch_size=self.batch_size) + _, loss_t, lr_mul = self.sess.run( + [self.train_op, self.td_loss_mean, self.lr_multiplier], batch) + losses.append(loss_t) t_end = time.time() update_time += t_end - t_start total_time += update_time if frame % 1000 == 0: - mem_free_steps += 1 - if mem_free_steps == 10: + mem_free_steps += 1 + if mem_free_steps == 10: mem_free_steps = 0 tr_helpers.free_mem() sum_time = update_time + play_time @@ -304,19 +346,18 @@ def train(self): 
self.writer.add_scalar('performance/play_time', play_time, frame) self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) self.writer.add_scalar('info/lr_mul', lr_mul, frame) - self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) + self.writer.add_scalar('info/lr', self.learning_rate * lr_mul, frame) self.writer.add_scalar('info/epochs', epoch_num, frame) self.writer.add_scalar('info/epsilon', self.epsilon, frame) if self.is_prioritized: self.writer.add_scalar('beta', self.beta, frame) - + update_time = 0 play_time = 0 num_games = len(self.game_rewards) if num_games > 10: - d = num_games / lives_reward - mean_rewards = np.sum(self.game_rewards) / d - mean_lengths = np.sum(self.game_lengths) / d + mean_rewards = np.sum(self.game_rewards) / num_games + mean_lengths = np.sum(self.game_lengths) / num_games self.writer.add_scalar('rewards/mean', mean_rewards, frame) self.writer.add_scalar('rewards/time', mean_rewards, total_time) self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) @@ -329,11 +370,12 @@ def train(self): if last_mean_rewards > self.config['score_to_win']: print('network won!') return last_mean_rewards, epoch_num - + if frame % num_epochs_to_copy == 0: self.load_weights_into_target_network() - + if epoch_num >= self.max_epochs: print('Max epochs reached') - self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) - return last_mean_rewards, epoch_num + self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str( + np.sum(self.game_rewards) / len(self.game_rewards))) + return last_mean_rewards, epoch_num diff --git a/common/experience.py b/common/experience.py index 2765db79..9312fb23 100644 --- a/common/experience.py +++ b/common/experience.py @@ -30,7 +30,7 @@ def add(self, obs_t, action, state_t, reward, obs_tp1, done): self._next_idx = (self._next_idx + 1) % self._maxsize def _encode_sample(self, idxes): - obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [] + obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [], [] for i in idxes: data = self._storage[i] obs_t, action, state_t, reward, obs_tp1, done = data diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml index 75e6f51a..dcf9b358 100644 --- a/configs/vdn_3s5z_vs_3s6z.yaml +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -15,6 +15,27 @@ params: dueling: True atoms: 1 noisy: False + cnn: + type: conv1d + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + convs: + - filters: 64 + kernel_size: 3 + strides: 2 + padding: 'same' + - filters: 128 + kernel_size: 3 + strides: 1 + padding: 'valid' + - filters: 256 + kernel_size: 3 + strides: 1 + padding: 'valid' mlp: units: [256] activation: relu @@ -30,7 +51,7 @@ params: gamma: 0.99 learning_rate: 0.0005 steps_per_epoch: 4 - batch_size: 128 + batch_size: 64 epsilon: 1.0 min_epsilon: 0.05 epsilon_decay_frames: 100000 @@ -39,7 +60,7 @@ params: name: 3s5z_vs_3s6z is_double: True score_to_win: 20 - num_steps_fill_buffer: 10000 + num_steps_fill_buffer: 200 replay_buffer_type: 'normal' replay_buffer_size: 100000 priority_beta: 0.4 @@ -52,7 +73,7 @@ params: games_to_track: 20 lr_schedule: None max_epochs: 100000 - grad_norm: 0.5 + grad_norm: 10 mix_with_state: False truncate_grads: True From 650c741ca6d933fea82909e5df3ad641868162bf Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 22:53:55 +0100 
Subject: [PATCH 13/14] final update --- configs/vdn_3s5z_vs_3s6z.yaml | 6 +-- configs/vdn_3s_vs_5z.yaml | 84 +++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 configs/vdn_3s_vs_5z.yaml diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml index dcf9b358..240f18e3 100644 --- a/configs/vdn_3s5z_vs_3s6z.yaml +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -60,14 +60,14 @@ params: name: 3s5z_vs_3s6z is_double: True score_to_win: 20 - num_steps_fill_buffer: 200 + num_steps_fill_buffer: 100000 replay_buffer_type: 'normal' - replay_buffer_size: 100000 + replay_buffer_size: 1000000 priority_beta: 0.4 priority_alpha: 0.6 beta_decay_frames: 100000 max_beta: 1 - steps_num: 128 + steps_num: 10 # episodes_to_log: 10 atoms_num: 1 games_to_track: 20 diff --git a/configs/vdn_3s_vs_5z.yaml b/configs/vdn_3s_vs_5z.yaml new file mode 100644 index 00000000..4fe035b0 --- /dev/null +++ b/configs/vdn_3s_vs_5z.yaml @@ -0,0 +1,84 @@ +label: "" +name: "" +params: + algo: + name: vdn + + model: + name: vdn + + load_checkpoint: False + load_path: "" + + network: + name: dqn + dueling: True + atoms: 1 + noisy: False + cnn: + type: conv1d + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + convs: + - filters: 64 + kernel_size: 3 + strides: 2 + padding: 'same' + - filters: 128 + kernel_size: 3 + strides: 1 + padding: 'valid' + - filters: 256 + kernel_size: 3 + strides: 1 + padding: 'valid' + mlp: + units: [256] + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + + config: + reward_shaper: + scale_value: 0.1 + gamma: 0.99 + learning_rate: 0.0005 + steps_per_epoch: 4 + batch_size: 64 + epsilon: 1.0 + min_epsilon: 0.05 + epsilon_decay_frames: 100000 + num_epochs_to_copy: 10000 + env_name: smac_cnn + name: 3s_vs_5z + is_double: True + score_to_win: 20 + num_steps_fill_buffer: 100000 + replay_buffer_type: 'normal' + replay_buffer_size: 1000000 + priority_beta: 0.4 + priority_alpha: 0.6 + beta_decay_frames: 100000 + max_beta: 1 + steps_num: 10 +# episodes_to_log: 10 + atoms_num: 1 + games_to_track: 20 + lr_schedule: None + max_epochs: 100000 + grad_norm: 10 + mix_with_state: False + truncate_grads: True + + env_config: + name: 3s_vs_5z + frames: 4 + transpose: True + random_invalid_step: False \ No newline at end of file From 072d2ac300cd13162fd11cfbd8d102b80347ee0e Mon Sep 17 00:00:00 2001 From: Christian Schroeder Date: Tue, 23 Jun 2020 20:31:01 +0100 Subject: [PATCH 14/14] created ReplayBufferCentralState --- algos_tf14/vdnagent.py | 7 +++-- common/experience.py | 62 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index bce361e3..6d5b1ef6 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -72,10 +72,11 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.n_agents = self.env.env_info['n_agents'] if not self.is_prioritized: - self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size']) + self.exp_buffer = experience.ReplayBufferCentralState(config['replay_buffer_size']) else: - self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha']) - self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights') + raise NotImplementedError("Not implemented! 
PrioritizedReplayBuffer with CentralState") + #self.exp_buffer = experience.PrioritizedReplayBufferCentralState(config['replay_buffer_size'], config['priority_alpha']) + #self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights') self.batch_size_ph = tf.placeholder(tf.int32, name='batch_size_ph') self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape, name='obs_ph') diff --git a/common/experience.py b/common/experience.py index 9312fb23..d8670bd5 100644 --- a/common/experience.py +++ b/common/experience.py @@ -4,7 +4,7 @@ from common.segment_tree import SumSegmentTree, MinSegmentTree -class ReplayBuffer(object): +class ReplayBufferCentralState(object): def __init__(self, size): """Create Replay buffer. Parameters @@ -65,6 +65,66 @@ def sample(self, batch_size): idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] return self._encode_sample(idxes) +class ReplayBuffer(object): + def __init__(self, size): + """Create Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + """ + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, obs_t, action, reward, obs_tp1, done): + data = (obs_t, action, reward, obs_tp1, done) + + if self._next_idx >= len(self._storage): + self._storage.append(data) + else: + self._storage[self._next_idx] = data + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] + for i in idxes: + data = self._storage[i] + obs_t, action, reward, obs_tp1, done = data + obses_t.append(np.array(obs_t, copy=False)) + actions.append(np.array(action, copy=False)) + rewards.append(reward) + obses_tp1.append(np.array(obs_tp1, copy=False)) + dones.append(done) + return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) + + def sample(self, batch_size): + """Sample a batch of experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. + """ + idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] + return self._encode_sample(idxes) + class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha):