Vdn #1

Merged 14 commits on Jun 23, 2020
4 changes: 2 additions & 2 deletions algos_tf14/dqnagent.py
@@ -11,7 +11,7 @@
from common.categorical import CategoricalQ

class DQNAgent:
def __init__(self, sess, base_name, observation_space, action_space, config):
def __init__(self, sess, base_name, observation_space, action_space, config, logger):
observation_shape = observation_space.shape
actions_num = action_space.n
self.config = config
@@ -47,7 +47,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config):
self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames'])
self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames'])
if self.env_name:
self.env = env_configurations.configurations[self.env_name]['env_creator']()
self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name'])
self.sess = sess
self.steps_num = self.config['steps_num']
self.states = deque([], maxlen=self.steps_num)
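
The two changes in this file add a logger argument to the agent constructor and forward the config name to the environment factory. Below is a minimal sketch of how a caller might wire up the new signature; the MyMultiAgentEnv and MyLogger names, the 'my_env' key, and the import paths are assumptions for illustration, only the extra logger and name arguments come from the diff:

import tensorflow as tf
# assumed import paths for illustration
from algos_tf14.dqnagent import DQNAgent
from common import env_configurations

def env_creator(name=None):
    # hypothetical factory; the diff only shows that it now receives name=config['name']
    return MyMultiAgentEnv(name)

env_configurations.configurations['my_env'] = {'env_creator': env_creator}

sess = tf.Session()
logger = MyLogger()  # hypothetical logging/summary wrapper passed through to the agent
agent = DQNAgent(sess, 'vdn_run', observation_space, action_space, config, logger)
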
1 change: 1 addition & 0 deletions algos_tf14/model_builder.py
@@ -13,6 +13,7 @@ def __init__(self):
self.model_factory.register_builder('continuous_a2c_lstm', lambda network, **kwargs : models.LSTMModelA2CContinuous(network))
self.model_factory.register_builder('continuous_a2c_lstm_logstd', lambda network, **kwargs : models.LSTMModelA2CContinuousLogStd(network))
self.model_factory.register_builder('dqn', lambda network, **kwargs : models.AtariDQN(network))
self.model_factory.register_builder('vdn', lambda network, **kwargs : models.VDN_DQN(network))


self.network_factory = object_factory.ObjectFactory()
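
The registration above makes 'vdn' resolvable by name next to the existing 'dqn' entry. A rough sketch of the lookup; the enclosing class name (shown here as ModelBuilder), the config key layout, and a create(name, **kwargs) method that dispatches to the registered lambda are assumptions suggested by the neighbouring registrations:

builder = ModelBuilder()
network = builder.network_factory.create(config['network']['name'])  # assumed config layout
model = builder.model_factory.create('vdn', network=network)         # equivalent to models.VDN_DQN(network)
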
58 changes: 58 additions & 0 deletions algos_tf14/models.py
@@ -245,3 +245,61 @@ def __call__(self, dict, reuse=False):
'''
is_train = name == 'agent'
return self.network(name=name, inputs=inputs, actions_num=actions_num, is_train=is_train, reuse=reuse)


class VDN_DQN(BaseModel):
def __init__(self, network):
self.network = network

def __call__(self, dict):
input_obs = dict['input_obs']
input_next_obs = dict['input_next_obs']
actions_num = dict['actions_num']
is_double = dict['is_double']
# (bs * n_agents, 1)
actions_ph = dict['actions_ph']
batch_size_ph = dict['batch_size_ph']
n_agents = dict['n_agents']

'''
TODO: fix is_train
'''
# is_train = name == 'agent'

# (bs * n_agents, n_actions)
qvalues = self.network(name='agent', inputs=input_obs, actions_num=actions_num, is_train=True, reuse=False)
# (bs, n_agents, n_actions)
qvalues = tf.reshape(qvalues, [batch_size_ph, n_agents, actions_num])
# (bs * n_agents, n_actions)
target_qvalues = tf.stop_gradient(self.network(name='target', inputs=input_next_obs, actions_num=actions_num, is_train=False, reuse=False))
# (bs, n_agents, n_actions)
target_qvalues = tf.reshape(target_qvalues, [batch_size_ph, n_agents, actions_num])

        # tf.one_hot gives (bs * n_agents, 1, actions_num); reshape to (bs, n_agents, actions_num)
        one_hot_actions = tf.reshape(tf.one_hot(actions_ph, actions_num), [batch_size_ph, n_agents, actions_num])
        # Q-values of the chosen actions: (bs, n_agents)
        current_action_qvalues = tf.reduce_sum(one_hot_actions * qvalues, axis=2)

if is_double:
# (bs * n_agents, n_actions)
next_qvalues = tf.stop_gradient(self.network(name='agent', inputs=input_next_obs, actions_num=actions_num, is_train=True, reuse=True))
            # greedy actions from the online network: (bs * n_agents,)
            next_selected_actions = tf.argmax(next_qvalues, axis=1)
            # tf.one_hot gives (bs * n_agents, actions_num); reshape to (bs, n_agents, actions_num)
            next_selected_actions_onehot = tf.reshape(tf.one_hot(next_selected_actions, actions_num), [batch_size_ph, n_agents, actions_num])
            # target Q-values of the actions selected by the online network: (bs, n_agents)
            next_obs_values_target = tf.stop_gradient(
                tf.reduce_sum(target_qvalues * next_selected_actions_onehot, axis=2))
else:
            # max target Q-value per agent: (bs, n_agents)
            next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, axis=2))

        # VDN mixing: sum per-agent Q-values over the agent axis into a single joint value
        # (bs, 1)
        current_action_qvalues_mix = tf.reshape(tf.reduce_sum(current_action_qvalues, axis=1), [batch_size_ph, 1])
        # (bs, 1)
        target_action_qvalues_mix = tf.reshape(tf.reduce_sum(next_obs_values_target, axis=1), [batch_size_ph, 1])

return qvalues, current_action_qvalues_mix, target_action_qvalues_mix
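
The mixing step at the end implements the VDN assumption Q_tot(s, a_1..a_n) = sum_i Q_i(o_i, a_i): chosen-action Q-values are summed over the agent axis into one joint value, and the target is built the same way from the target network. A small NumPy illustration of the same shapes and reductions; the sizes bs=2, n_agents=3, actions_num=4 are arbitrary example values, not taken from the code:

import numpy as np

bs, n_agents, actions_num = 2, 3, 4
qvalues = np.random.rand(bs, n_agents, actions_num)            # per-agent Q(o_i, .)
actions = np.random.randint(actions_num, size=(bs, n_agents))  # chosen action per agent

one_hot = np.eye(actions_num)[actions]                         # (bs, n_agents, actions_num)
current_action_q = (one_hot * qvalues).sum(axis=2)             # (bs, n_agents)
q_tot = current_action_q.sum(axis=1, keepdims=True)            # (bs, 1): joint VDN value
print(q_tot.shape)  # (2, 1)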