Vdn #1

Merged 14 commits on Jun 23, 2020
4 changes: 2 additions & 2 deletions algos_tf14/dqnagent.py
@@ -11,7 +11,7 @@
from common.categorical import CategoricalQ

class DQNAgent:
def __init__(self, sess, base_name, observation_space, action_space, config):
def __init__(self, sess, base_name, observation_space, action_space, config, logger):
observation_shape = observation_space.shape
actions_num = action_space.n
self.config = config
@@ -47,7 +47,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config):
self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames'])
self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames'])
if self.env_name:
self.env = env_configurations.configurations[self.env_name]['env_creator']()
self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name'])
self.sess = sess
self.steps_num = self.config['steps_num']
self.states = deque([], maxlen=self.steps_num)
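
The two changes in this file add a logger argument to the agent constructor and forward the config name to the environment factory. Below is a minimal sketch of how a caller might wire up the new signature; the MyMultiAgentEnv and MyLogger names, the 'my_env' key, and the import paths are assumptions for illustration, only the extra logger and name arguments come from the diff:

import tensorflow as tf
# assumed import paths for illustration
from algos_tf14.dqnagent import DQNAgent
from common import env_configurations

def env_creator(name=None):
    # hypothetical factory; the diff only shows that it now receives name=config['name']
    return MyMultiAgentEnv(name)

env_configurations.configurations['my_env'] = {'env_creator': env_creator}

sess = tf.Session()
logger = MyLogger()  # hypothetical logging/summary wrapper passed through to the agent
agent = DQNAgent(sess, 'vdn_run', observation_space, action_space, config, logger)
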
1 change: 1 addition & 0 deletions algos_tf14/model_builder.py
@@ -13,6 +13,7 @@ def __init__(self):
self.model_factory.register_builder('continuous_a2c_lstm', lambda network, **kwargs : models.LSTMModelA2CContinuous(network))
self.model_factory.register_builder('continuous_a2c_lstm_logstd', lambda network, **kwargs : models.LSTMModelA2CContinuousLogStd(network))
self.model_factory.register_builder('dqn', lambda network, **kwargs : models.AtariDQN(network))
self.model_factory.register_builder('vdn', lambda network, **kwargs : models.VDN_DQN(network))


self.network_factory = object_factory.ObjectFactory()
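
The registration above makes 'vdn' resolvable by name next to the existing 'dqn' entry. A rough sketch of the lookup; the enclosing class name (shown here as ModelBuilder), the config key layout, and a create(name, **kwargs) method that dispatches to the registered lambda are assumptions suggested by the neighbouring registrations:

builder = ModelBuilder()
network = builder.network_factory.create(config['network']['name'])  # assumed config layout
model = builder.model_factory.create('vdn', network=network)         # equivalent to models.VDN_DQN(network)
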
58 changes: 58 additions & 0 deletions algos_tf14/models.py
@@ -245,3 +245,61 @@ def __call__(self, dict, reuse=False):
'''
is_train = name == 'agent'
return self.network(name=name, inputs=inputs, actions_num=actions_num, is_train=is_train, reuse=reuse)


class VDN_DQN(BaseModel):
def __init__(self, network):
self.network = network

def __call__(self, dict):
input_obs = dict['input_obs']
input_next_obs = dict['input_next_obs']
actions_num = dict['actions_num']
is_double = dict['is_double']
# (bs * n_agents, 1)
actions_ph = dict['actions_ph']
batch_size_ph = dict['batch_size_ph']
n_agents = dict['n_agents']

'''
TODO: fix is_train
'''
# is_train = name == 'agent'

# (bs * n_agents, n_actions)
qvalues = self.network(name='agent', inputs=input_obs, actions_num=actions_num, is_train=True, reuse=False)
# (bs, n_agents, n_actions)
qvalues = tf.reshape(qvalues, [batch_size_ph, n_agents, actions_num])
# (bs * n_agents, n_actions)
target_qvalues = tf.stop_gradient(self.network(name='target', inputs=input_next_obs, actions_num=actions_num, is_train=False, reuse=False))
# (bs, n_agents, n_actions)
target_qvalues = tf.reshape(target_qvalues, [batch_size_ph, n_agents, actions_num])

        # tf.one_hot gives (bs * n_agents, 1, actions_num); reshape to (bs, n_agents, actions_num)
        one_hot_actions = tf.reshape(tf.one_hot(actions_ph, actions_num), [batch_size_ph, n_agents, actions_num])
        # Q-values of the chosen actions: (bs, n_agents)
        current_action_qvalues = tf.reduce_sum(one_hot_actions * qvalues, axis=2)

if is_double:
# (bs * n_agents, n_actions)
next_qvalues = tf.stop_gradient(self.network(name='agent', inputs=input_next_obs, actions_num=actions_num, is_train=True, reuse=True))
            # greedy actions from the online network: (bs * n_agents,)
            next_selected_actions = tf.argmax(next_qvalues, axis=1)
            # tf.one_hot gives (bs * n_agents, actions_num); reshape to (bs, n_agents, actions_num)
            next_selected_actions_onehot = tf.reshape(tf.one_hot(next_selected_actions, actions_num), [batch_size_ph, n_agents, actions_num])
            # target Q-values of the actions selected by the online network: (bs, n_agents)
            next_obs_values_target = tf.stop_gradient(
                tf.reduce_sum(target_qvalues * next_selected_actions_onehot, axis=2))
else:
            # max target Q-value per agent: (bs, n_agents)
            next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, axis=2))

        # VDN mixing: sum per-agent Q-values over the agent axis into a single joint value
        # (bs, 1)
        current_action_qvalues_mix = tf.reshape(tf.reduce_sum(current_action_qvalues, axis=1), [batch_size_ph, 1])
        # (bs, 1)
        target_action_qvalues_mix = tf.reshape(tf.reduce_sum(next_obs_values_target, axis=1), [batch_size_ph, 1])

return qvalues, current_action_qvalues_mix, target_action_qvalues_mix
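
The mixing step at the end implements the VDN assumption Q_tot(s, a_1..a_n) = sum_i Q_i(o_i, a_i): chosen-action Q-values are summed over the agent axis into one joint value, and the target is built the same way from the target network. A small NumPy illustration of the same shapes and reductions; the sizes bs=2, n_agents=3, actions_num=4 are arbitrary example values, not taken from the code:

import numpy as np

bs, n_agents, actions_num = 2, 3, 4
qvalues = np.random.rand(bs, n_agents, actions_num)            # per-agent Q(o_i, .)
actions = np.random.randint(actions_num, size=(bs, n_agents))  # chosen action per agent

one_hot = np.eye(actions_num)[actions]                         # (bs, n_agents, actions_num)
current_action_q = (one_hot * qvalues).sum(axis=2)             # (bs, n_agents)
q_tot = current_action_q.sum(axis=1, keepdims=True)            # (bs, 1): joint VDN value
print(q_tot.shape)  # (2, 1)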