From f7a54c60ba27f098c56c6f9335abe8da6fc0b17a Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 02:01:18 +0100 Subject: [PATCH 01/14] vdn start --- algos_tf14/dqnagent.py | 4 +- algos_tf14/vdnagent.py | 148 +++++++++++++++++++++++++++++++++++++++++ envs/smac_env.py | 3 + tf14_runner.py | 2 + 4 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 algos_tf14/vdnagent.py diff --git a/algos_tf14/dqnagent.py b/algos_tf14/dqnagent.py index bf7b10f6..007bd0ad 100644 --- a/algos_tf14/dqnagent.py +++ b/algos_tf14/dqnagent.py @@ -11,7 +11,7 @@ from common.categorical import CategoricalQ class DQNAgent: - def __init__(self, sess, base_name, observation_space, action_space, config): + def __init__(self, sess, base_name, observation_space, action_space, config, logger): observation_shape = observation_space.shape actions_num = action_space.n self.config = config @@ -47,7 +47,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config): self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) if self.env_name: - self.env = env_configurations.configurations[self.env_name]['env_creator']() + self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) self.sess = sess self.steps_num = self.config['steps_num'] self.states = deque([], maxlen=self.steps_num) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py new file mode 100644 index 00000000..41a6dcd0 --- /dev/null +++ b/algos_tf14/vdnagent.py @@ -0,0 +1,148 @@ +import tensorflow as tf +import algos_tf14.models +from common import tr_helpers, experience, env_configurations +import numpy as np +import collections +import time +from collections import deque +from tensorboardX import SummaryWriter +from datetime import datetime +from algos_tf14.tensorflow_utils import TensorFlowVariables +from common.categorical import CategoricalQ + +class VDNAgent: + def __init__(self, sess, base_name, observation_space, action_space, config, logger): + observation_shape = observation_space.shape + actions_num = action_space.n + self.config = config + self.is_adaptive_lr = config['lr_schedule'] == 'adaptive' + self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay' + self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay' + self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32) + self.learning_rate_ph = tf.placeholder('float32', (), name = 'lr_ph') + self.games_to_track = tr_helpers.get_or_default(config, 'games_to_track', 100) + self.max_epochs = tr_helpers.get_or_default(self.config, 'max_epochs', 1e6) + + self.game_rewards = deque([], maxlen=self.games_to_track) + self.game_lengths = deque([], maxlen=self.games_to_track) + + self.epoch_num = tf.Variable( tf.constant(0, shape=(), dtype=tf.float32), trainable=False) + self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1) + self.current_lr = self.learning_rate_ph + + if self.is_adaptive_lr: + self.lr_threshold = config['lr_threshold'] + if self.is_polynom_decay_lr: + self.lr_multiplier = tf.train.polynomial_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, end_learning_rate=0.001, power=tr_helpers.get_or_default(config, 'decay_power', 1.0)) + if self.is_exp_decay_lr: + self.lr_multiplier = tf.train.exponential_decay(1.0, global_step=self.epoch_num, 
decay_steps=self.max_epochs, decay_rate = config['decay_rate']) + + self.env_name = config['env_name'] + self.network = config['network'] + self.obs_shape = observation_shape + self.actions_num = actions_num + self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("%d, %H:%M:%S")) + self.epsilon = self.config['epsilon'] + self.rewards_shaper = self.config['reward_shaper'] + self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) + self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) + if self.env_name: + self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) + self.sess = sess + self.steps_num = self.config['steps_num'] + self.states = deque([], maxlen=self.steps_num) + self.is_prioritized = config['replay_buffer_type'] != 'normal' + self.atoms_num = self.config['atoms_num'] + assert self.atoms_num == 1 + + self.state_shape = (self.env.env_info['state_shape'],) + self.n_agents = self.env.env_info['n_agents'] + + if not self.is_prioritized: + self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size']) + else: + self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha']) + self.sample_weights_ph = tf.placeholder(tf.float32, shape= [None,] , name='sample_weights') + + self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'obs_ph') + self.state_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape , name = 'state_ph') + self.actions_ph = tf.placeholder(tf.int32, shape=[None,], name = 'actions_ph') + self.rewards_ph = tf.placeholder(tf.float32, shape=[None,], name = 'rewards_ph') + self.next_obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'next_obs_ph') + self.is_done_ph = tf.placeholder(tf.float32, shape=[None,], name = 'is_done_ph') + self.is_not_done = 1 - self.is_done_ph + self.name = base_name + + self.gamma = self.config['gamma'] + self.gamma_step = self.gamma**self.steps_num + self.grad_norm = config['grad_norm'] + self.input_obs = self.obs_ph + self.input_next_obs = self.next_obs_ph + if observation_space.dtype == np.uint8: + print('scaling obs') + self.input_obs = tf.to_float(self.input_obs) / 255.0 + self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 + self.setup_qvalues(actions_num) + self.sess.run(tf.global_variables_initializer()) + + def setup_qvalues(self, actions_num): + config = { + 'name' : 'agent', + 'inputs' : self.input_obs, + 'actions_num' : actions_num, + } + self.qvalues = self.network(config, reuse=False) + config = { + 'name' : 'target', + 'inputs' : self.input_next_obs, + 'actions_num' : actions_num, + } + self.target_qvalues = tf.stop_gradient(self.network(config, reuse=False)) + + def play_episode(self, epsilon=0.0): + mb_obs = [] + mb_rewards = [] + mb_actions = [] + mb_avail_actions = [] + mb_dones = [] + mb_states = [] + + obs = self.env.reset() + obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) + mb_obs.append(obs) + mb_states.append(self.env.get_state()) + mb_avail_actions.append(self.env.get_action_mask()) + + while True: + step_act = self.get_action(obs, epsilon) + print(step_act) + break + + + + + def get_action(self, obs, epsilon=0.0): + print(obs.shape) + if np.random.random() < epsilon: + action = self.env.action_space.sample() + 
else: + qvals = self.get_qvalues(obs) + print(qvals.shape) + action = np.argmax(qvals) + return action + + def get_qvalues(self, obs): + print(obs.shape) + return self.sess.run(self.qvalues, {self.obs_ph: obs}) + + def train(self): + self.play_episode() + + + + + + + + + diff --git a/envs/smac_env.py b/envs/smac_env.py index 178a9cba..edbc7090 100644 --- a/envs/smac_env.py +++ b/envs/smac_env.py @@ -44,6 +44,9 @@ def _preproc_actions(self, actions): actions[ind] = np.random.choice(avail_actions) #rewards[ind] = -0.05 return actions, rewards + + def get_state(self): + return self.env.get_state() def step(self, actions): fixed_rewards = None diff --git a/tf14_runner.py b/tf14_runner.py index 3fd847b7..d24d4f78 100644 --- a/tf14_runner.py +++ b/tf14_runner.py @@ -9,6 +9,7 @@ import algos_tf14.a2c_continuous as a2c_continuous import algos_tf14.a2c_discrete as a2c_discrete import algos_tf14.dqnagent as dqnagent +import algos_tf14.vdnagent as vdnagent import common.tr_helpers as tr_helpers import yaml @@ -49,6 +50,7 @@ def __init__(self, logger): self.algo_factory.register_builder('a2c_continuous', lambda **kwargs : a2c_continuous.A2CAgent(**kwargs)) self.algo_factory.register_builder('a2c_discrete', lambda **kwargs : a2c_discrete.A2CAgent(**kwargs)) self.algo_factory.register_builder('dqn', lambda **kwargs : dqnagent.DQNAgent(**kwargs)) + self.algo_factory.register_builder('vdn', lambda **kwargs : vdnagent.VDNAgent(**kwargs)) self.player_factory = common.object_factory.ObjectFactory() self.player_factory.register_builder('a2c_continuous', lambda **kwargs : players.PpoPlayerContinuous(**kwargs)) From 8b9c3db519a4941b9d6328e5a4e3fc02d3d9eb1e Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 02:36:51 +0100 Subject: [PATCH 02/14] up --- algos_tf14/vdnagent.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 41a6dcd0..1fad7e17 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -106,33 +106,44 @@ def play_episode(self, epsilon=0.0): mb_avail_actions = [] mb_dones = [] mb_states = [] + step_count = 0 obs = self.env.reset() obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) mb_obs.append(obs) mb_states.append(self.env.get_state()) - mb_avail_actions.append(self.env.get_action_mask()) - + avail_acts = self.env.get_action_mask() + mb_avail_actions.append(avail_acts) while True: - step_act = self.get_action(obs, epsilon) - print(step_act) - break + step_count += 1 + step_act = self.get_action(obs, avail_acts, epsilon) + next_obs, rewards, dones, _ = self.env.step(step_act) + mb_actions.append(step_act) + mb_obs.append(next_obs) + mb_rewards.append(rewards) + mb_dones.append(dones) + mb_states.append(self.env.get_state()) + obs = next_obs + obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) + avail_acts = self.env.get_action_mask() + mb_avail_actions.append(avail_acts) + if all(dones) or self.steps_num < step_count: + break - def get_action(self, obs, epsilon=0.0): + def get_action(self, obs, avail_acts, epsilon=0.0): print(obs.shape) if np.random.random() < epsilon: action = self.env.action_space.sample() else: qvals = self.get_qvalues(obs) - print(qvals.shape) - action = np.argmax(qvals) + qvals[avail_acts == False] = -9999999 + action = np.argmax(qvals, axis=1) return action def get_qvalues(self, obs): - print(obs.shape) return self.sess.run(self.qvalues, {self.obs_ph: obs}) def train(self): From d7e0d9150eda09ea33a9e2eb86bc12aa940c303d Mon Sep 17 
00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 20:03:11 +0100 Subject: [PATCH 03/14] updates --- algos_tf14/vdnagent.py | 129 ++++++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 34 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 1fad7e17..c1dbf5ea 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -50,7 +50,9 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) self.sess = sess self.steps_num = self.config['steps_num'] - self.states = deque([], maxlen=self.steps_num) + + self.obs_act_rew = deque([], maxlen=self.steps_num) + self.is_prioritized = config['replay_buffer_type'] != 'normal' self.atoms_num = self.config['atoms_num'] assert self.atoms_num == 1 @@ -83,7 +85,20 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.input_obs = tf.to_float(self.input_obs) / 255.0 self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 self.setup_qvalues(actions_num) - self.sess.run(tf.global_variables_initializer()) + + if self.env_name: + self.sess.run(tf.global_variables_initializer()) +# self.reg_loss = tf.losses.get_regularization_loss() +# self.td_loss_mean += self.reg_loss +# self.learning_rate = self.config['learning_rate'] +# self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + +# self.saver = tf.train.Saver() +# self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] +# self.variables = TensorFlowVariables(self.qvalues, self.sess) + if self.env_name: + sess.run(tf.global_variables_initializer()) + self._reset() def setup_qvalues(self, actions_num): config = { @@ -91,6 +106,7 @@ def setup_qvalues(self, actions_num): 'inputs' : self.input_obs, 'actions_num' : actions_num, } + #(n_agents, n_actions) self.qvalues = self.network(config, reuse=False) config = { 'name' : 'target', @@ -99,42 +115,85 @@ def setup_qvalues(self, actions_num): } self.target_qvalues = tf.stop_gradient(self.network(config, reuse=False)) - def play_episode(self, epsilon=0.0): - mb_obs = [] - mb_rewards = [] - mb_actions = [] - mb_avail_actions = [] - mb_dones = [] - mb_states = [] - step_count = 0 - - obs = self.env.reset() - obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) - mb_obs.append(obs) - mb_states.append(self.env.get_state()) - avail_acts = self.env.get_action_mask() - mb_avail_actions.append(avail_acts) + if self.config['is_double'] == True: + config = { + 'name' : 'agent', + 'inputs' : self.input_next_obs, + 'actions_num' : actions_num, + } + self.next_qvalues = tf.stop_gradient(self.network(config, reuse=True)) + + self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') + self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') + + #(n_agents, 1) + self.current_action_qvalues = tf.reduce_sum(tf.one_hot(self.actions_ph, actions_num) * self.qvalues, reduction_indices = 1) + + if self.config['is_double'] == True: + self.next_selected_actions = tf.argmax(self.next_qvalues, axis = 1) + self.next_selected_actions_onehot = tf.one_hot(self.next_selected_actions, actions_num) + self.next_state_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) + else: + 
self.next_state_values_target = tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) + + def play_steps(self, steps, epsilon=0.0): + done_reward = None + done_shaped_reward = None + done_steps = None + steps_rewards = 0 + cur_gamma = 1 + cur_obs_act_rew_len = len(self.obs_act_rew) + + # always break after one while True: - step_count += 1 - step_act = self.get_action(obs, avail_acts, epsilon) - next_obs, rewards, dones, _ = self.env.step(step_act) - mb_actions.append(step_act) - mb_obs.append(next_obs) - mb_rewards.append(rewards) - mb_dones.append(dones) - mb_states.append(self.env.get_state()) - - obs = next_obs + if cur_obs_act_rew_len > 0: + obs = self.obs_act_rew[-1][0] + else: + obs = self.current_obs obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) - avail_acts = self.env.get_action_mask() - mb_avail_actions.append(avail_acts) - - if all(dones) or self.steps_num < step_count: + + action = self.get_action(obs, self.env.get_action_mask(), epsilon) + print(action) + print(self.sess.run(self.qvalues, {self.obs_ph: obs})) + print(self.sess.run(self.next_state_values_target, {self.obs_ph: obs, self.actions_ph: action})) + new_obs, reward, is_done, _ = self.env.step(action) + #reward = reward * (1 - is_done) + + self.step_count += 1 + self.total_reward += reward + shaped_reward = self.rewards_shaper(reward) + self.total_shaped_reward += shaped_reward + self.obs_act_rew.append([new_obs, action, shaped_reward]) + + if len(self.obs_act_rew) < steps: break + + for i in range(steps): + sreward = self.obs_act_rew[i][2] + steps_rewards += sreward * cur_gamma + cur_gamma = cur_gamma * self.gamma + + next_obs, current_action, _ = self.obs_act_rew[0] + self.exp_buffer.add(self.current_obs, current_action, steps_rewards, new_obs, is_done) + self.current_obs = next_obs + break - + if all(is_done): + done_reward = self.total_reward + done_steps = self.step_count + done_shaped_reward = self.total_shaped_reward + self._reset() + return done_reward, done_shaped_reward, done_steps + + def _reset(self): + self.obs_act_rew.clear() + if self.env_name: + self.current_obs = self.env.reset() + self.total_reward = 0.0 + self.total_shaped_reward = 0.0 + self.step_count = 0 + def get_action(self, obs, avail_acts, epsilon=0.0): - print(obs.shape) if np.random.random() < epsilon: action = self.env.action_space.sample() else: @@ -147,7 +206,9 @@ def get_qvalues(self, obs): return self.sess.run(self.qvalues, {self.obs_ph: obs}) def train(self): - self.play_episode() + for _ in range(5): + self.play_steps(steps=3) + From 76fbc524b355f1162b792f8128f6f15ddf459a05 Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 20:50:11 +0100 Subject: [PATCH 04/14] state added to exp replay --- algos_tf14/vdnagent.py | 233 ++++++++++++++++++++++++++++++++--------- common/experience.py | 11 +- 2 files changed, 191 insertions(+), 53 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index c1dbf5ea..f356a14c 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -86,20 +86,27 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 self.setup_qvalues(actions_num) - if self.env_name: - self.sess.run(tf.global_variables_initializer()) -# self.reg_loss = tf.losses.get_regularization_loss() -# self.td_loss_mean += self.reg_loss -# self.learning_rate = self.config['learning_rate'] -# self.train_step = tf.train.AdamOptimizer(self.learning_rate * 
self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) - -# self.saver = tf.train.Saver() -# self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] -# self.variables = TensorFlowVariables(self.qvalues, self.sess) + self.reg_loss = tf.losses.get_regularization_loss() + self.td_loss_mean += self.reg_loss + self.learning_rate = self.config['learning_rate'] + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + + self.saver = tf.train.Saver() + self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] + self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() + def get_weights(self): + return self.variables.get_flat() + + def set_weights(self, weights): + return self.variables.set_flat(weights) + + def update_epoch(self): + return self.sess.run([self.update_epoch_op])[0] + def setup_qvalues(self, actions_num): config = { 'name' : 'agent', @@ -132,9 +139,56 @@ def setup_qvalues(self, actions_num): if self.config['is_double'] == True: self.next_selected_actions = tf.argmax(self.next_qvalues, axis = 1) self.next_selected_actions_onehot = tf.one_hot(self.next_selected_actions, actions_num) - self.next_state_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) + self.next_obs_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) + else: + self.next_obs_values_target = tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) + + ##MIXING: + self.current_action_qvalues_mix = tf.reduce_sum(self.current_action_qvalues, axis=0) + self.target_action_qvalues_mix = tf.reduce_sum(self.next_obs_values_target, axis=0) + + self.reference_qvalues = self.rewards_ph + self.gamma_step *self.is_not_done * self.target_action_qvalues_mix + + if self.is_prioritized: + # we need to return l1 loss to update priority buffer + self.abs_errors = tf.abs(self.current_action_qvalues_mix - self.reference_qvalues) + 1e-5 + # the same as multiply gradients later (other way is used in different examples over internet) + self.td_loss = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.NONE) * self.sample_weights_ph + self.td_loss_mean = tf.reduce_mean(self.td_loss) + else: + self.td_loss_mean = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.MEAN) + + self.reg_loss = tf.losses.get_regularization_loss() + self.td_loss_mean += self.reg_loss + self.learning_rate = self.config['learning_rate'] + if self.env_name: + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + + def save(self, fn): + self.saver.save(self.sess, fn) + + def restore(self, fn): + self.saver.restore(self.sess, fn) + + def _reset(self): + self.obs_act_rew.clear() + if self.env_name: + self.current_obs = self.env.reset() + self.total_reward = 0.0 + self.total_shaped_reward = 0.0 + self.step_count = 0 + + def get_action(self, obs, avail_acts, epsilon=0.0): + if np.random.random() < epsilon: + action = self.env.action_space.sample() else: - self.next_state_values_target = 
tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) + qvals = self.get_qvalues(obs) + qvals[avail_acts == False] = -9999999 + action = np.argmax(qvals, axis=1) + return action + + def get_qvalues(self, obs): + return self.sess.run(self.qvalues, {self.obs_ph: obs}) def play_steps(self, steps, epsilon=0.0): done_reward = None @@ -151,11 +205,9 @@ def play_steps(self, steps, epsilon=0.0): else: obs = self.current_obs obs = np.reshape(obs, ((self.n_agents,) + self.obs_shape)) + state = self.env.get_state() action = self.get_action(obs, self.env.get_action_mask(), epsilon) - print(action) - print(self.sess.run(self.qvalues, {self.obs_ph: obs})) - print(self.sess.run(self.next_state_values_target, {self.obs_ph: obs, self.actions_ph: action})) new_obs, reward, is_done, _ = self.env.step(action) #reward = reward * (1 - is_done) @@ -163,7 +215,7 @@ def play_steps(self, steps, epsilon=0.0): self.total_reward += reward shaped_reward = self.rewards_shaper(reward) self.total_shaped_reward += shaped_reward - self.obs_act_rew.append([new_obs, action, shaped_reward]) + self.obs_act_rew.append([new_obs, action, shaped_reward, state]) if len(self.obs_act_rew) < steps: break @@ -173,8 +225,8 @@ def play_steps(self, steps, epsilon=0.0): steps_rewards += sreward * cur_gamma cur_gamma = cur_gamma * self.gamma - next_obs, current_action, _ = self.obs_act_rew[0] - self.exp_buffer.add(self.current_obs, current_action, steps_rewards, new_obs, is_done) + next_obs, current_action, _, current_st = self.obs_act_rew[0] + self.exp_buffer.add(self.current_obs, current_action, current_st, steps_rewards, new_obs, is_done) self.current_obs = next_obs break @@ -184,37 +236,122 @@ def play_steps(self, steps, epsilon=0.0): done_shaped_reward = self.total_shaped_reward self._reset() return done_reward, done_shaped_reward, done_steps - - def _reset(self): - self.obs_act_rew.clear() - if self.env_name: - self.current_obs = self.env.reset() - self.total_reward = 0.0 - self.total_shaped_reward = 0.0 - self.step_count = 0 - - def get_action(self, obs, avail_acts, epsilon=0.0): - if np.random.random() < epsilon: - action = self.env.action_space.sample() - else: - qvals = self.get_qvalues(obs) - qvals[avail_acts == False] = -9999999 - action = np.argmax(qvals, axis=1) - return action - def get_qvalues(self, obs): - return self.sess.run(self.qvalues, {self.obs_ph: obs}) - + def load_weights_into_target_network(self): + self.sess.run(self.assigns_op) + + def sample_batch(self, exp_replay, batch_size): + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size) + return { + self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, + self.rewards_ph:reward_batch, self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch + } + + def sample_prioritized_batch(self, exp_replay, batch_size, beta): + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch, sample_weights, sample_idxes = exp_replay.sample(batch_size, beta) + batch = { self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, self.rewards_ph:reward_batch, + self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch, self.sample_weights_ph: sample_weights } + return [batch , sample_idxes] + def train(self): - for _ in range(5): - self.play_steps(steps=3) - - - - - - - - + mem_free_steps = 0 + last_mean_rewards = -100500 + epoch_num = 0 + frame = 0 + update_time = 0 + play_time = 0 + + start_time = time.time() + total_time = 0 + 
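# Aside: a minimal NumPy sketch of the VDN target that setup_qvalues builds above,
# assuming the same shapes as the surrounding code ((n_agents, actions_num) Q-tables,
# a shared scalar reward and a shared done flag). The joint Q-value is the sum of the
# per-agent chosen Q-values, and the n-step target is
# r + gamma**steps * (1 - done) * sum over agents of max_a Q_target.
# The helper name and the sample numbers below are illustrative only, not part of the patch.
import numpy as np

def vdn_td_target(agent_q, target_q, actions, reward, done, gamma, steps):
    # agent_q, target_q: (n_agents, actions_num); actions: (n_agents,) chosen action indices
    n_agents = agent_q.shape[0]
    current_mix = agent_q[np.arange(n_agents), actions].sum()  # summed chosen Q-values (joint Q)
    target_mix = target_q.max(axis=1).sum()                    # summed greedy target Q-values
    reference_q = reward + (gamma ** steps) * (1.0 - done) * target_mix
    return reference_q, current_mix - reference_q              # TD target and TD error

# Example with 3 agents and 5 actions:
q = np.random.rand(3, 5)
tq = np.random.rand(3, 5)
ref_q, td_err = vdn_td_target(q, tq, actions=np.array([0, 2, 4]), reward=1.0, done=0.0, gamma=0.99, steps=3)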
self.load_weights_into_target_network() + for _ in range(0, self.config['num_steps_fill_buffer']): + self.play_steps(self.steps_num, self.epsilon) + steps_per_epoch = self.config['steps_per_epoch'] + num_epochs_to_copy = self.config['num_epochs_to_copy'] + batch_size = self.config['batch_size'] + lives_reward = self.config['lives_reward'] + episodes_to_log = self.config['episodes_to_log'] + frame = 0 + play_time = 0 + update_time = 0 + rewards = [] + shaped_rewards = [] + steps = [] + losses = deque([], maxlen=100) +# while True: +# epoch_num = self.update_epoch() +# t_play_start = time.time() +# self.epsilon = self.epsilon_processor(frame) +# self.beta = self.beta_processor(frame) + +# for _ in range(0, steps_per_epoch): +# reward, shaped_reward, step = self.play_steps(self.steps_num, self.epsilon) +# if reward != None: +# self.game_lengths.append(step) +# self.game_rewards.append(reward) +# #shaped_rewards.append(shaped_reward) +# t_play_end = time.time() +# play_time += t_play_end - t_play_start + +# # train +# frame = frame + steps_per_epoch +# t_start = time.time() +# if self.is_prioritized: +# batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) +# _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) +# self.exp_buffer.update_priorities(idxes, errors_update) +# else: +# batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) +# _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) + +# losses.append(loss_t) +# t_end = time.time() +# update_time += t_end - t_start +# total_time += update_time +# if frame % 1000 == 0: +# mem_free_steps += 1 +# if mem_free_steps == 10: +# mem_free_steps = 0 +# tr_helpers.free_mem() +# sum_time = update_time + play_time +# print('frames per seconds: ', 1000 / (sum_time)) +# self.writer.add_scalar('performance/fps', 1000 / sum_time, frame) +# self.writer.add_scalar('performance/upd_time', update_time, frame) +# self.writer.add_scalar('performance/play_time', play_time, frame) +# self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) +# self.writer.add_scalar('info/lr_mul', lr_mul, frame) +# self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) +# self.writer.add_scalar('info/epochs', epoch_num, frame) +# self.writer.add_scalar('info/epsilon', self.epsilon, frame) +# if self.is_prioritized: +# self.writer.add_scalar('beta', self.beta, frame) + +# update_time = 0 +# play_time = 0 +# num_games = len(self.game_rewards) +# if num_games > 10: +# d = num_games / lives_reward +# mean_rewards = np.sum(self.game_rewards) / d +# mean_lengths = np.sum(self.game_lengths) / d +# self.writer.add_scalar('rewards/mean', mean_rewards, frame) +# self.writer.add_scalar('rewards/time', mean_rewards, total_time) +# self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) +# self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) + +# if mean_rewards > last_mean_rewards: +# print('saving next best rewards: ', mean_rewards) +# last_mean_rewards = mean_rewards +# self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards)) +# if last_mean_rewards > self.config['score_to_win']: +# print('network won!') +# return last_mean_rewards, epoch_num + +# if frame % num_epochs_to_copy == 0: +# self.load_weigths_into_target_network() + +# if epoch_num >= self.max_epochs: +# print('Max epochs reached') +# self.save("./nn/" + 
'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) +# return last_mean_rewards, epoch_num diff --git a/common/experience.py b/common/experience.py index 28885e91..2765db79 100644 --- a/common/experience.py +++ b/common/experience.py @@ -20,8 +20,8 @@ def __init__(self, size): def __len__(self): return len(self._storage) - def add(self, obs_t, action, reward, obs_tp1, done): - data = (obs_t, action, reward, obs_tp1, done) + def add(self, obs_t, action, state_t, reward, obs_tp1, done): + data = (obs_t, action, state_t, reward, obs_tp1, done) if self._next_idx >= len(self._storage): self._storage.append(data) @@ -30,16 +30,17 @@ def add(self, obs_t, action, reward, obs_tp1, done): self._next_idx = (self._next_idx + 1) % self._maxsize def _encode_sample(self, idxes): - obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] + obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [] for i in idxes: data = self._storage[i] - obs_t, action, reward, obs_tp1, done = data + obs_t, action, state_t, reward, obs_tp1, done = data obses_t.append(np.array(obs_t, copy=False)) actions.append(np.array(action, copy=False)) + states_t.append(np.array(state_t, copy=False)) rewards.append(reward) obses_tp1.append(np.array(obs_tp1, copy=False)) dones.append(done) - return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) + return np.array(obses_t), np.array(actions), np.array(states_t), np.array(rewards), np.array(obses_tp1), np.array(dones) def sample(self, batch_size): """Sample a batch of experiences. From 8dc9c6127e15a811adf52f181254fa99e79b8122 Mon Sep 17 00:00:00 2001 From: Tarun Gupta Date: Sun, 21 Jun 2020 21:54:54 +0100 Subject: [PATCH 05/14] rudimentary vdn ready --- algos_tf14/vdnagent.py | 139 +++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 69 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index f356a14c..901793e9 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -9,6 +9,7 @@ from datetime import datetime from algos_tf14.tensorflow_utils import TensorFlowVariables from common.categorical import CategoricalQ +import tensorflow_probability as tfp class VDNAgent: def __init__(self, sess, base_name, observation_space, action_space, config, logger): @@ -180,7 +181,7 @@ def _reset(self): def get_action(self, obs, avail_acts, epsilon=0.0): if np.random.random() < epsilon: - action = self.env.action_space.sample() + action = tfp.distributions.Categorical(probs=avail_acts.astype(float)).sample().eval(session=self.sess) else: qvals = self.get_qvalues(obs) qvals[avail_acts == False] = -9999999 @@ -279,79 +280,79 @@ def train(self): steps = [] losses = deque([], maxlen=100) -# while True: -# epoch_num = self.update_epoch() -# t_play_start = time.time() -# self.epsilon = self.epsilon_processor(frame) -# self.beta = self.beta_processor(frame) + while True: + epoch_num = self.update_epoch() + t_play_start = time.time() + self.epsilon = self.epsilon_processor(frame) + self.beta = self.beta_processor(frame) -# for _ in range(0, steps_per_epoch): -# reward, shaped_reward, step = self.play_steps(self.steps_num, self.epsilon) -# if reward != None: -# self.game_lengths.append(step) -# self.game_rewards.append(reward) -# #shaped_rewards.append(shaped_reward) + for _ in range(0, steps_per_epoch): + reward, shaped_reward, step = self.play_steps(self.steps_num, self.epsilon) + if reward != None: + 
self.game_lengths.append(step) + self.game_rewards.append(reward) + #shaped_rewards.append(shaped_reward) -# t_play_end = time.time() -# play_time += t_play_end - t_play_start + t_play_end = time.time() + play_time += t_play_end - t_play_start -# # train -# frame = frame + steps_per_epoch -# t_start = time.time() -# if self.is_prioritized: -# batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) -# _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) -# self.exp_buffer.update_priorities(idxes, errors_update) -# else: -# batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) -# _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) + # train + frame = frame + steps_per_epoch + t_start = time.time() + if self.is_prioritized: + batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) + _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) + self.exp_buffer.update_priorities(idxes, errors_update) + else: + batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) + _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) -# losses.append(loss_t) -# t_end = time.time() -# update_time += t_end - t_start -# total_time += update_time -# if frame % 1000 == 0: -# mem_free_steps += 1 -# if mem_free_steps == 10: -# mem_free_steps = 0 -# tr_helpers.free_mem() -# sum_time = update_time + play_time -# print('frames per seconds: ', 1000 / (sum_time)) -# self.writer.add_scalar('performance/fps', 1000 / sum_time, frame) -# self.writer.add_scalar('performance/upd_time', update_time, frame) -# self.writer.add_scalar('performance/play_time', play_time, frame) -# self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) -# self.writer.add_scalar('info/lr_mul', lr_mul, frame) -# self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) -# self.writer.add_scalar('info/epochs', epoch_num, frame) -# self.writer.add_scalar('info/epsilon', self.epsilon, frame) -# if self.is_prioritized: -# self.writer.add_scalar('beta', self.beta, frame) + losses.append(loss_t) + t_end = time.time() + update_time += t_end - t_start + total_time += update_time + if frame % 1000 == 0: + mem_free_steps += 1 + if mem_free_steps == 10: + mem_free_steps = 0 + tr_helpers.free_mem() + sum_time = update_time + play_time + print('frames per seconds: ', 1000 / (sum_time)) + self.writer.add_scalar('performance/fps', 1000 / sum_time, frame) + self.writer.add_scalar('performance/upd_time', update_time, frame) + self.writer.add_scalar('performance/play_time', play_time, frame) + self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) + self.writer.add_scalar('info/lr_mul', lr_mul, frame) + self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) + self.writer.add_scalar('info/epochs', epoch_num, frame) + self.writer.add_scalar('info/epsilon', self.epsilon, frame) + if self.is_prioritized: + self.writer.add_scalar('beta', self.beta, frame) -# update_time = 0 -# play_time = 0 -# num_games = len(self.game_rewards) -# if num_games > 10: -# d = num_games / lives_reward -# mean_rewards = np.sum(self.game_rewards) / d -# mean_lengths = np.sum(self.game_lengths) / d -# self.writer.add_scalar('rewards/mean', mean_rewards, frame) -# self.writer.add_scalar('rewards/time', 
mean_rewards, total_time) -# self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) -# self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) + update_time = 0 + play_time = 0 + num_games = len(self.game_rewards) + if num_games > 10: + d = num_games / lives_reward + mean_rewards = np.sum(self.game_rewards) / d + mean_lengths = np.sum(self.game_lengths) / d + self.writer.add_scalar('rewards/mean', mean_rewards, frame) + self.writer.add_scalar('rewards/time', mean_rewards, total_time) + self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) + self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) -# if mean_rewards > last_mean_rewards: -# print('saving next best rewards: ', mean_rewards) -# last_mean_rewards = mean_rewards -# self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards)) -# if last_mean_rewards > self.config['score_to_win']: -# print('network won!') -# return last_mean_rewards, epoch_num + if mean_rewards > last_mean_rewards: + print('saving next best rewards: ', mean_rewards) + last_mean_rewards = mean_rewards + self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards)) + if last_mean_rewards > self.config['score_to_win']: + print('network won!') + return last_mean_rewards, epoch_num -# if frame % num_epochs_to_copy == 0: -# self.load_weigths_into_target_network() + if frame % num_epochs_to_copy == 0: + self.load_weigths_into_target_network() -# if epoch_num >= self.max_epochs: -# print('Max epochs reached') -# self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) -# return last_mean_rewards, epoch_num + if epoch_num >= self.max_epochs: + print('Max epochs reached') + self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) + return last_mean_rewards, epoch_num From b3a2870ee6d9dc48016a87a52a191f09f9275ff1 Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 00:24:24 +0100 Subject: [PATCH 06/14] vdn as a model --- algos_tf14/model_builder.py | 1 + algos_tf14/models.py | 37 ++++++++++++++++++++++++++++++++++ algos_tf14/vdnagent.py | 40 +++++++------------------------------ 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/algos_tf14/model_builder.py b/algos_tf14/model_builder.py index dd58a9dd..ade95ba2 100644 --- a/algos_tf14/model_builder.py +++ b/algos_tf14/model_builder.py @@ -13,6 +13,7 @@ def __init__(self): self.model_factory.register_builder('continuous_a2c_lstm', lambda network, **kwargs : models.LSTMModelA2CContinuous(network)) self.model_factory.register_builder('continuous_a2c_lstm_logstd', lambda network, **kwargs : models.LSTMModelA2CContinuousLogStd(network)) self.model_factory.register_builder('dqn', lambda network, **kwargs : models.AtariDQN(network)) + self.model_factory.register_builder('vdn', lambda network, **kwargs : models.VDN_DQN(network)) self.network_factory = object_factory.ObjectFactory() diff --git a/algos_tf14/models.py b/algos_tf14/models.py index 117b1047..70f84a6a 100644 --- a/algos_tf14/models.py +++ b/algos_tf14/models.py @@ -245,3 +245,40 @@ def __call__(self, dict, reuse=False): ''' is_train = name == 'agent' return self.network(name=name, inputs=inputs, actions_num=actions_num, is_train=is_train, reuse=reuse) + + +class VDN_DQN(BaseModel): + def __init__(self, network): + self.network = 
network + + def __call__(self, dict): + input_obs = dict['input_obs'] + input_next_obs = dict['input_next_obs'] + actions_num = dict['actions_num'] + is_double = dict['is_double'] + actions_ph = dict['actions_ph'] + + ''' + TODO: fix is_train + ''' + # is_train = name == 'agent' + + # (n_agents, n_actions) + qvalues = self.network(name='agent', inputs=input_obs, actions_num=actions_num, is_train=True, reuse=False) + target_qvalues = tf.stop_gradient(self.network(name='target', inputs=input_next_obs, actions_num=actions_num, is_train=False, reuse=False)) + current_action_qvalues = tf.reduce_sum(tf.one_hot(actions_ph, actions_num) * qvalues, + reduction_indices=1) + if is_double: + next_qvalues = tf.stop_gradient(self.network(name='agent', inputs=input_next_obs, actions_num=actions_num, is_train=True, reuse=True)) + next_selected_actions = tf.argmax(next_qvalues, axis=1) + next_selected_actions_onehot = tf.one_hot(next_selected_actions, actions_num) + next_obs_values_target = tf.stop_gradient( + tf.reduce_sum(target_qvalues * next_selected_actions_onehot, reduction_indices=[1, ])) + else: + next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, reduction_indices=1)) + + ##MIXING: + current_action_qvalues_mix = tf.reduce_sum(current_action_qvalues, axis=0) + target_action_qvalues_mix = tf.reduce_sum(next_obs_values_target, axis=0) + + return current_action_qvalues_mix, target_action_qvalues_mix diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 901793e9..34aa6668 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -110,44 +110,18 @@ def update_epoch(self): def setup_qvalues(self, actions_num): config = { - 'name' : 'agent', - 'inputs' : self.input_obs, + 'input_obs' : self.input_obs, + 'input_next_obs': self.input_next_obs, 'actions_num' : actions_num, + 'is_double': self.config['is_double'], + 'actions_ph': self.actions_ph } - #(n_agents, n_actions) - self.qvalues = self.network(config, reuse=False) - config = { - 'name' : 'target', - 'inputs' : self.input_next_obs, - 'actions_num' : actions_num, - } - self.target_qvalues = tf.stop_gradient(self.network(config, reuse=False)) - - if self.config['is_double'] == True: - config = { - 'name' : 'agent', - 'inputs' : self.input_next_obs, - 'actions_num' : actions_num, - } - self.next_qvalues = tf.stop_gradient(self.network(config, reuse=True)) + + self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') - - #(n_agents, 1) - self.current_action_qvalues = tf.reduce_sum(tf.one_hot(self.actions_ph, actions_num) * self.qvalues, reduction_indices = 1) - - if self.config['is_double'] == True: - self.next_selected_actions = tf.argmax(self.next_qvalues, axis = 1) - self.next_selected_actions_onehot = tf.one_hot(self.next_selected_actions, actions_num) - self.next_obs_values_target = tf.stop_gradient( tf.reduce_sum( self.target_qvalues * self.next_selected_actions_onehot , reduction_indices=[1,] )) - else: - self.next_obs_values_target = tf.stop_gradient(tf.reduce_max(self.target_qvalues, reduction_indices=1)) - - ##MIXING: - self.current_action_qvalues_mix = tf.reduce_sum(self.current_action_qvalues, axis=0) - self.target_action_qvalues_mix = tf.reduce_sum(self.next_obs_values_target, axis=0) - + self.reference_qvalues = self.rewards_ph + self.gamma_step *self.is_not_done * self.target_action_qvalues_mix if 
self.is_prioritized: From 55b60bfe9426b3bb9d9f0a2adcc41b097daf2838 Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 01:08:07 +0100 Subject: [PATCH 07/14] vdn conf --- configs/vdn_3s5z_vs_3s6z.yaml | 62 +++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 configs/vdn_3s5z_vs_3s6z.yaml diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml new file mode 100644 index 00000000..2ea47834 --- /dev/null +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -0,0 +1,62 @@ +label: "" +name: "" +params: + algo: + name: vdn + + model: + name: vdn + + load_checkpoint: False + load_path: "" + + network: + name: dqn + dueling: True + atoms: 1 + noisy: False + mlp: + units: [256] + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + + config: + reward_shaper: + scale_value: 0.1 + gamma: 0.99 + learning_rate: 0.0005 + steps_per_epoch: 4 + batch_size: 128 + epsilon: 1.0 + min_epsilon: 0.05 + epsilon_decay_frames: 100000 + num_epochs_to_copy: 10000 + env_name: smac_cnn + name: 3s5z_vs_3s6z + is_double: True + score_to_win: 20 + num_steps_fill_buffer: 10000 + replay_buffer_type: 'normal' + replay_buffer_size: 100000 + priority_beta: 0.4 + priority_alpha: 0.6 + beta_decay_frames: 100000 + max_beta: 1 + steps_num: 128 +# episodes_to_log: 10 + atoms_num: 1 + games_to_track: 20 + lr_schedule: None + max_epochs: 100000 + grad_norm: 0.5 + mix_with_state: False + + env_config: + name: 3s5z_vs_3s6z + frames: 4 + transpose: True + random_invalid_step: False \ No newline at end of file From b22648f3c14489e4a934dddd02c91c1039be5967 Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 01:24:20 +0100 Subject: [PATCH 08/14] env config use in vdn --- algos_tf14/vdnagent.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 34aa6668..b0e45b49 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -48,7 +48,8 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) if self.env_name: - self.env = env_configurations.configurations[self.env_name]['env_creator'](name=config['name']) + self.env_config = config.get('env_config', {}) + self.env = env_configurations.configurations[self.env_name]['env_creator'](**self.env_config) self.sess = sess self.steps_num = self.config['steps_num'] @@ -94,16 +95,16 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.saver = tf.train.Saver() self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] - self.variables = TensorFlowVariables(self.qvalues, self.sess) + # self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() - def get_weights(self): - return self.variables.get_flat() - - def set_weights(self, weights): - return self.variables.set_flat(weights) + # def get_weights(self): + # return self.variables.get_flat() + # + # def set_weights(self, weights): + # return self.variables.set_flat(weights) def update_epoch(self): return self.sess.run([self.update_epoch_op])[0] From 
fb0161d09ee7f8b3f115b1536a70f03e9669a6ad Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 01:26:00 +0100 Subject: [PATCH 09/14] bug correct --- algos_tf14/vdnagent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index b0e45b49..b2bbd693 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -325,7 +325,7 @@ def train(self): return last_mean_rewards, epoch_num if frame % num_epochs_to_copy == 0: - self.load_weigths_into_target_network() + self.load_weights_into_target_network() if epoch_num >= self.max_epochs: print('Max epochs reached') From cbf85febad74acbf5bfe8dae187f6b8fd5952a9c Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 20:00:11 +0100 Subject: [PATCH 10/14] grad norm with truncate option and a bug update --- algos_tf14/models.py | 2 +- algos_tf14/vdnagent.py | 23 ++++++++++++++--------- configs/vdn_3s5z_vs_3s6z.yaml | 1 + 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/algos_tf14/models.py b/algos_tf14/models.py index 70f84a6a..81e88c4b 100644 --- a/algos_tf14/models.py +++ b/algos_tf14/models.py @@ -281,4 +281,4 @@ def __call__(self, dict): current_action_qvalues_mix = tf.reduce_sum(current_action_qvalues, axis=0) target_action_qvalues_mix = tf.reduce_sum(next_obs_values_target, axis=0) - return current_action_qvalues_mix, target_action_qvalues_mix + return qvalues, current_action_qvalues_mix, target_action_qvalues_mix diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index b2bbd693..9f184556 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -27,7 +27,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.game_rewards = deque([], maxlen=self.games_to_track) self.game_lengths = deque([], maxlen=self.games_to_track) - self.epoch_num = tf.Variable( tf.constant(0, shape=(), dtype=tf.float32), trainable=False) + self.epoch_num = tf.Variable(tf.constant(0, shape=(), dtype=tf.float32), trainable=False) self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1) self.current_lr = self.learning_rate_ph @@ -91,20 +91,25 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.reg_loss = tf.losses.get_regularization_loss() self.td_loss_mean += self.reg_loss self.learning_rate = self.config['learning_rate'] - self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier)#.minimize(self.td_loss_mean, var_list=self.weights) + grads = tf.gradients(self.td_loss_mean, self.weights) + if self.config['truncate_grads']: + grads, _ = tf.clip_by_global_norm(grads, self.grad_norm) + grads = list(zip(grads, self.weights)) + self.train_op = self.train_step.apply_gradients(grads) self.saver = tf.train.Saver() self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] - # self.variables = TensorFlowVariables(self.qvalues, self.sess) + self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() - # def get_weights(self): - # return self.variables.get_flat() - # - # def set_weights(self, weights): - # return self.variables.set_flat(weights) + def get_weights(self): + return self.variables.get_flat() + + def set_weights(self, weights): + return self.variables.set_flat(weights) def 
update_epoch(self): return self.sess.run([self.update_epoch_op])[0] @@ -118,7 +123,7 @@ def setup_qvalues(self, actions_num): 'actions_ph': self.actions_ph } - self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) + self.qvalues, self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml index 2ea47834..75e6f51a 100644 --- a/configs/vdn_3s5z_vs_3s6z.yaml +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -54,6 +54,7 @@ params: max_epochs: 100000 grad_norm: 0.5 mix_with_state: False + truncate_grads: True env_config: name: 3s5z_vs_3s6z From 279bf6a00d33f64d5f1fce6a461b51f5f0540a0f Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 20:09:45 +0100 Subject: [PATCH 11/14] bug correct --- algos_tf14/vdnagent.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 9f184556..084a12c0 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -281,11 +281,12 @@ def train(self): t_start = time.time() if self.is_prioritized: batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) - _, loss_t, errors_update, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) + _, loss_t, errors_update, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) self.exp_buffer.update_priorities(idxes, errors_update) else: batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) - _, loss_t, lr_mul = self.sess.run([self.train_step, self.td_loss_mean, self.lr_multiplier], batch) + print(self.sess.run(self.qvalues, batch).shape) + _, loss_t, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.lr_multiplier], batch) losses.append(loss_t) t_end = time.time() From e8e8f7e832f11bcee5d46f2a355152ae82a445dd Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 21:32:38 +0100 Subject: [PATCH 12/14] major changes --- algos_tf14/models.py | 37 +++++-- algos_tf14/vdnagent.py | 202 ++++++++++++++++++++-------------- common/experience.py | 2 +- configs/vdn_3s5z_vs_3s6z.yaml | 27 ++++- 4 files changed, 176 insertions(+), 92 deletions(-) diff --git a/algos_tf14/models.py b/algos_tf14/models.py index 81e88c4b..b0e531e4 100644 --- a/algos_tf14/models.py +++ b/algos_tf14/models.py @@ -256,29 +256,50 @@ def __call__(self, dict): input_next_obs = dict['input_next_obs'] actions_num = dict['actions_num'] is_double = dict['is_double'] + # (bs * n_agents, 1) actions_ph = dict['actions_ph'] + batch_size_ph = dict['batch_size_ph'] + n_agents = dict['n_agents'] ''' TODO: fix is_train ''' # is_train = name == 'agent' - # (n_agents, n_actions) + # (bs * n_agents, n_actions) qvalues = self.network(name='agent', inputs=input_obs, actions_num=actions_num, is_train=True, reuse=False) + # (bs, n_agents, n_actions) + qvalues = tf.reshape(qvalues, [batch_size_ph, n_agents, actions_num]) + # (bs * n_agents, n_actions) target_qvalues = tf.stop_gradient(self.network(name='target', inputs=input_next_obs, actions_num=actions_num, is_train=False, reuse=False)) - current_action_qvalues = tf.reduce_sum(tf.one_hot(actions_ph, actions_num) * qvalues, - reduction_indices=1) + # (bs, 
n_agents, n_actions) + target_qvalues = tf.reshape(target_qvalues, [batch_size_ph, n_agents, actions_num]) + + # (bs * n_agents, 1, actions_num) + # (bs, n_agents, actions_num) + one_hot_actions = tf.reshape(tf.one_hot(actions_ph, actions_num), [batch_size_ph, n_agents, actions_num]) + # (bs, n_agents, 1) + current_action_qvalues = tf.reduce_sum(one_hot_actions * qvalues, axis=2) + if is_double: + # (bs * n_agents, n_actions) next_qvalues = tf.stop_gradient(self.network(name='agent', inputs=input_next_obs, actions_num=actions_num, is_train=True, reuse=True)) + # (bs * n_agents, 1) next_selected_actions = tf.argmax(next_qvalues, axis=1) - next_selected_actions_onehot = tf.one_hot(next_selected_actions, actions_num) + # (bs*n_agents, 1, n_actions) + # (bs, n_agents, actions_num) + next_selected_actions_onehot = tf.reshape(tf.one_hot(next_selected_actions, actions_num), [batch_size_ph, n_agents, actions_num]) + # (bs, n_agents, 1) next_obs_values_target = tf.stop_gradient( - tf.reduce_sum(target_qvalues * next_selected_actions_onehot, reduction_indices=[1, ])) + tf.reduce_sum(target_qvalues * next_selected_actions_onehot, axis=2)) else: - next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, reduction_indices=1)) + # (bs, n_agents, 1) + next_obs_values_target = tf.stop_gradient(tf.reduce_max(target_qvalues, axis=2)) ##MIXING: - current_action_qvalues_mix = tf.reduce_sum(current_action_qvalues, axis=0) - target_action_qvalues_mix = tf.reduce_sum(next_obs_values_target, axis=0) + # (bs, 1) + current_action_qvalues_mix = tf.reshape(tf.reduce_sum(current_action_qvalues, axis=1), [batch_size_ph, 1]) + # (bs, 1, 1) + target_action_qvalues_mix = tf.reshape(tf.reduce_sum(next_obs_values_target, axis=1), [batch_size_ph, 1]) return qvalues, current_action_qvalues_mix, target_action_qvalues_mix diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index 084a12c0..bce361e3 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -11,6 +11,7 @@ from common.categorical import CategoricalQ import tensorflow_probability as tfp + class VDNAgent: def __init__(self, sess, base_name, observation_space, action_space, config, logger): observation_shape = observation_space.shape @@ -20,7 +21,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay' self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay' self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32) - self.learning_rate_ph = tf.placeholder('float32', (), name = 'lr_ph') + self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph') self.games_to_track = tr_helpers.get_or_default(config, 'games_to_track', 100) self.max_epochs = tr_helpers.get_or_default(self.config, 'max_epochs', 1e6) @@ -34,51 +35,60 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log if self.is_adaptive_lr: self.lr_threshold = config['lr_threshold'] if self.is_polynom_decay_lr: - self.lr_multiplier = tf.train.polynomial_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, end_learning_rate=0.001, power=tr_helpers.get_or_default(config, 'decay_power', 1.0)) + self.lr_multiplier = tf.train.polynomial_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, + end_learning_rate=0.001, + power=tr_helpers.get_or_default(config, 'decay_power', 1.0)) if self.is_exp_decay_lr: - self.lr_multiplier = tf.train.exponential_decay(1.0, global_step=self.epoch_num, decay_steps=self.max_epochs, 
decay_rate = config['decay_rate']) - + self.lr_multiplier = tf.train.exponential_decay(1.0, global_step=self.epoch_num, + decay_steps=self.max_epochs, + decay_rate=config['decay_rate']) + self.env_name = config['env_name'] self.network = config['network'] + self.batch_size = self.config['batch_size'] + self.obs_shape = observation_shape self.actions_num = actions_num self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("%d, %H:%M:%S")) self.epsilon = self.config['epsilon'] self.rewards_shaper = self.config['reward_shaper'] - self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], self.config['epsilon_decay_frames']) - self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], self.config['beta_decay_frames']) + self.epsilon_processor = tr_helpers.LinearValueProcessor(self.config['epsilon'], self.config['min_epsilon'], + self.config['epsilon_decay_frames']) + self.beta_processor = tr_helpers.LinearValueProcessor(self.config['priority_beta'], self.config['max_beta'], + self.config['beta_decay_frames']) if self.env_name: self.env_config = config.get('env_config', {}) self.env = env_configurations.configurations[self.env_name]['env_creator'](**self.env_config) self.sess = sess self.steps_num = self.config['steps_num'] - + self.obs_act_rew = deque([], maxlen=self.steps_num) - + self.is_prioritized = config['replay_buffer_type'] != 'normal' self.atoms_num = self.config['atoms_num'] assert self.atoms_num == 1 - + self.state_shape = (self.env.env_info['state_shape'],) self.n_agents = self.env.env_info['n_agents'] - + if not self.is_prioritized: self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size']) - else: + else: self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha']) - self.sample_weights_ph = tf.placeholder(tf.float32, shape= [None,] , name='sample_weights') - - self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'obs_ph') - self.state_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape , name = 'state_ph') - self.actions_ph = tf.placeholder(tf.int32, shape=[None,], name = 'actions_ph') - self.rewards_ph = tf.placeholder(tf.float32, shape=[None,], name = 'rewards_ph') - self.next_obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape , name = 'next_obs_ph') - self.is_done_ph = tf.placeholder(tf.float32, shape=[None,], name = 'is_done_ph') + self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights') + + self.batch_size_ph = tf.placeholder(tf.int32, name='batch_size_ph') + self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape, name='obs_ph') + self.state_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape, name='state_ph') + self.actions_ph = tf.placeholder(tf.int32, shape=[None, 1], name='actions_ph') + self.rewards_ph = tf.placeholder(tf.float32, shape=[None, 1], name='rewards_ph') + self.next_obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape, name='next_obs_ph') + self.is_done_ph = tf.placeholder(tf.float32, shape=[None, 1], name='is_done_ph') self.is_not_done = 1 - self.is_done_ph self.name = base_name - + self.gamma = self.config['gamma'] - self.gamma_step = self.gamma**self.steps_num + self.gamma_step = self.gamma ** self.steps_num self.grad_norm = config['grad_norm'] self.input_obs = 
self.obs_ph self.input_next_obs = self.next_obs_ph @@ -87,11 +97,12 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.input_obs = tf.to_float(self.input_obs) / 255.0 self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0 self.setup_qvalues(actions_num) - + self.reg_loss = tf.losses.get_regularization_loss() self.td_loss_mean += self.reg_loss self.learning_rate = self.config['learning_rate'] - self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier)#.minimize(self.td_loss_mean, var_list=self.weights) + self.train_step = tf.train.AdamOptimizer( + self.learning_rate * self.lr_multiplier) # .minimize(self.td_loss_mean, var_list=self.weights) grads = tf.gradients(self.td_loss_mean, self.weights) if self.config['truncate_grads']: grads, _ = tf.clip_by_global_norm(grads, self.grad_norm) @@ -99,58 +110,65 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.train_op = self.train_step.apply_gradients(grads) self.saver = tf.train.Saver() - self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in zip(self.weights, self.target_weights)] + self.assigns_op = [tf.assign(w_target, w_self, validate_shape=True) for w_self, w_target in + zip(self.weights, self.target_weights)] self.variables = TensorFlowVariables(self.qvalues, self.sess) if self.env_name: sess.run(tf.global_variables_initializer()) self._reset() - + def get_weights(self): return self.variables.get_flat() def set_weights(self, weights): return self.variables.set_flat(weights) - + def update_epoch(self): return self.sess.run([self.update_epoch_op])[0] - + def setup_qvalues(self, actions_num): config = { - 'input_obs' : self.input_obs, + 'input_obs': self.input_obs, 'input_next_obs': self.input_next_obs, - 'actions_num' : actions_num, + 'actions_num': actions_num, 'is_double': self.config['is_double'], - 'actions_ph': self.actions_ph + 'actions_ph': self.actions_ph, + 'batch_size_ph': self.batch_size_ph, + 'n_agents': self.n_agents } + # (bs, n_agents, n_actions), (bs, 1), (bs, 1) self.qvalues, self.current_action_qvalues_mix, self.target_action_qvalues_mix = self.network(config) self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent') self.target_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target') - self.reference_qvalues = self.rewards_ph + self.gamma_step *self.is_not_done * self.target_action_qvalues_mix - + self.reference_qvalues = self.rewards_ph + self.gamma_step * self.is_not_done * self.target_action_qvalues_mix + if self.is_prioritized: # we need to return l1 loss to update priority buffer self.abs_errors = tf.abs(self.current_action_qvalues_mix - self.reference_qvalues) + 1e-5 # the same as multiply gradients later (other way is used in different examples over internet) - self.td_loss = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.NONE) * self.sample_weights_ph - self.td_loss_mean = tf.reduce_mean(self.td_loss) + self.td_loss = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, + reduction=tf.losses.Reduction.NONE) * self.sample_weights_ph + self.td_loss_mean = tf.reduce_mean(self.td_loss) else: - self.td_loss_mean = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, reduction=tf.losses.Reduction.MEAN) - + self.td_loss_mean = tf.losses.huber_loss(self.current_action_qvalues_mix, self.reference_qvalues, + 
reduction=tf.losses.Reduction.MEAN) + self.reg_loss = tf.losses.get_regularization_loss() self.td_loss_mean += self.reg_loss self.learning_rate = self.config['learning_rate'] if self.env_name: - self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize(self.td_loss_mean, var_list=self.weights) - + self.train_step = tf.train.AdamOptimizer(self.learning_rate * self.lr_multiplier).minimize( + self.td_loss_mean, var_list=self.weights) + def save(self, fn): self.saver.save(self.sess, fn) def restore(self, fn): self.saver.restore(self.sess, fn) - + def _reset(self): self.obs_act_rew.clear() if self.env_name: @@ -158,19 +176,21 @@ def _reset(self): self.total_reward = 0.0 self.total_shaped_reward = 0.0 self.step_count = 0 - + def get_action(self, obs, avail_acts, epsilon=0.0): if np.random.random() < epsilon: action = tfp.distributions.Categorical(probs=avail_acts.astype(float)).sample().eval(session=self.sess) else: - qvals = self.get_qvalues(obs) + obs = obs.reshape((self.n_agents,) + self.obs_shape) + # (n_agents, num_actions) + qvals = self.get_qvalues(obs).squeeze(0) qvals[avail_acts == False] = -9999999 action = np.argmax(qvals, axis=1) - return action - + return action + def get_qvalues(self, obs): - return self.sess.run(self.qvalues, {self.obs_ph: obs}) - + return self.sess.run(self.qvalues, {self.obs_ph: obs, self.batch_size_ph: 1}) + def play_steps(self, steps, epsilon=0.0): done_reward = None done_shaped_reward = None @@ -190,8 +210,12 @@ def play_steps(self, steps, epsilon=0.0): action = self.get_action(obs, self.env.get_action_mask(), epsilon) new_obs, reward, is_done, _ = self.env.step(action) - #reward = reward * (1 - is_done) - + # reward = reward * (1 - is_done) + + # Same reward, done for all agents + reward = reward[0] + is_done = all(is_done) + self.step_count += 1 self.total_reward += reward shaped_reward = self.rewards_shaper(reward) @@ -210,29 +234,48 @@ def play_steps(self, steps, epsilon=0.0): self.exp_buffer.add(self.current_obs, current_action, current_st, steps_rewards, new_obs, is_done) self.current_obs = next_obs break - - if all(is_done): + + if is_done: done_reward = self.total_reward done_steps = self.step_count done_shaped_reward = self.total_shaped_reward self._reset() return done_reward, done_shaped_reward, done_steps - + def load_weights_into_target_network(self): self.sess.run(self.assigns_op) def sample_batch(self, exp_replay, batch_size): - obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size) + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size) + obs_batch = obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + act_batch = act_batch.reshape((batch_size * self.n_agents, 1)) + st_batch = st_batch.reshape((batch_size,) + self.state_shape) + next_obs_batch = next_obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + reward_batch = reward_batch.reshape((batch_size, 1)) + is_done_batch = is_done_batch.reshape((batch_size, 1)) + return { - self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, - self.rewards_ph:reward_batch, self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch + self.obs_ph: obs_batch, self.actions_ph: act_batch, self.state_ph: st_batch, + self.rewards_ph: reward_batch, self.is_done_ph: is_done_batch, self.next_obs_ph: next_obs_batch, + self.batch_size_ph: batch_size } def sample_prioritized_batch(self, exp_replay, batch_size, beta): - obs_batch, 
act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch, sample_weights, sample_idxes = exp_replay.sample(batch_size, beta) - batch = { self.obs_ph:obs_batch, self.actions_ph:act_batch, self.state_ph: st_batch, self.rewards_ph:reward_batch, - self.is_done_ph:is_done_batch, self.next_obs_ph:next_obs_batch, self.sample_weights_ph: sample_weights } - return [batch , sample_idxes] + obs_batch, act_batch, st_batch, reward_batch, next_obs_batch, is_done_batch, sample_weights, sample_idxes = exp_replay.sample( + batch_size, beta) + obs_batch = obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + act_batch = act_batch.reshape((batch_size * self.n_agents, 1)) + st_batch = st_batch.reshape((batch_size,) + self.state_shape) + next_obs_batch = next_obs_batch.reshape((batch_size * self.n_agents,) + self.obs_shape) + reward_batch = reward_batch.reshape((batch_size, 1)) + is_done_batch = is_done_batch.reshape((batch_size, 1)) + sample_weights = sample_weights.reshape((batch_size, 1)) + batch = {self.obs_ph: obs_batch, self.actions_ph: act_batch, self.state_ph: st_batch, + self.rewards_ph: reward_batch, + self.is_done_ph: is_done_batch, self.next_obs_ph: next_obs_batch, + self.sample_weights_ph: sample_weights, + self.batch_size_ph: batch_size} + return [batch, sample_idxes] def train(self): mem_free_steps = 0 @@ -249,9 +292,6 @@ def train(self): self.play_steps(self.steps_num, self.epsilon) steps_per_epoch = self.config['steps_per_epoch'] num_epochs_to_copy = self.config['num_epochs_to_copy'] - batch_size = self.config['batch_size'] - lives_reward = self.config['lives_reward'] - episodes_to_log = self.config['episodes_to_log'] frame = 0 play_time = 0 update_time = 0 @@ -259,7 +299,7 @@ def train(self): shaped_rewards = [] steps = [] losses = deque([], maxlen=100) - + while True: epoch_num = self.update_epoch() t_play_start = time.time() @@ -271,30 +311,32 @@ def train(self): if reward != None: self.game_lengths.append(step) self.game_rewards.append(reward) - #shaped_rewards.append(shaped_reward) + # shaped_rewards.append(shaped_reward) t_play_end = time.time() play_time += t_play_end - t_play_start - + # train frame = frame + steps_per_epoch t_start = time.time() if self.is_prioritized: - batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=batch_size, beta = self.beta) - _, loss_t, errors_update, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) + batch, idxes = self.sample_prioritized_batch(self.exp_buffer, batch_size=self.batch_size, + beta=self.beta) + _, loss_t, errors_update, lr_mul = self.sess.run( + [self.train_op, self.td_loss_mean, self.abs_errors, self.lr_multiplier], batch) self.exp_buffer.update_priorities(idxes, errors_update) else: - batch = self.sample_batch(self.exp_buffer, batch_size=batch_size) - print(self.sess.run(self.qvalues, batch).shape) - _, loss_t, lr_mul = self.sess.run([self.train_op, self.train_step, self.td_loss_mean, self.lr_multiplier], batch) - + batch = self.sample_batch(self.exp_buffer, batch_size=self.batch_size) + _, loss_t, lr_mul = self.sess.run( + [self.train_op, self.td_loss_mean, self.lr_multiplier], batch) + losses.append(loss_t) t_end = time.time() update_time += t_end - t_start total_time += update_time if frame % 1000 == 0: - mem_free_steps += 1 - if mem_free_steps == 10: + mem_free_steps += 1 + if mem_free_steps == 10: mem_free_steps = 0 tr_helpers.free_mem() sum_time = update_time + play_time @@ -304,19 +346,18 @@ def train(self): 
self.writer.add_scalar('performance/play_time', play_time, frame) self.writer.add_scalar('losses/td_loss', np.mean(losses), frame) self.writer.add_scalar('info/lr_mul', lr_mul, frame) - self.writer.add_scalar('info/lr', self.learning_rate*lr_mul, frame) + self.writer.add_scalar('info/lr', self.learning_rate * lr_mul, frame) self.writer.add_scalar('info/epochs', epoch_num, frame) self.writer.add_scalar('info/epsilon', self.epsilon, frame) if self.is_prioritized: self.writer.add_scalar('beta', self.beta, frame) - + update_time = 0 play_time = 0 num_games = len(self.game_rewards) if num_games > 10: - d = num_games / lives_reward - mean_rewards = np.sum(self.game_rewards) / d - mean_lengths = np.sum(self.game_lengths) / d + mean_rewards = np.sum(self.game_rewards) / num_games + mean_lengths = np.sum(self.game_lengths) / num_games self.writer.add_scalar('rewards/mean', mean_rewards, frame) self.writer.add_scalar('rewards/time', mean_rewards, total_time) self.writer.add_scalar('episode_lengths/mean', mean_lengths, frame) @@ -329,11 +370,12 @@ def train(self): if last_mean_rewards > self.config['score_to_win']: print('network won!') return last_mean_rewards, epoch_num - + if frame % num_epochs_to_copy == 0: self.load_weights_into_target_network() - + if epoch_num >= self.max_epochs: print('Max epochs reached') - self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(np.sum(self.game_rewards) * lives_reward / len(self.game_rewards))) - return last_mean_rewards, epoch_num + self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str( + np.sum(self.game_rewards) / len(self.game_rewards))) + return last_mean_rewards, epoch_num diff --git a/common/experience.py b/common/experience.py index 2765db79..9312fb23 100644 --- a/common/experience.py +++ b/common/experience.py @@ -30,7 +30,7 @@ def add(self, obs_t, action, state_t, reward, obs_tp1, done): self._next_idx = (self._next_idx + 1) % self._maxsize def _encode_sample(self, idxes): - obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [] + obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [], [] for i in idxes: data = self._storage[i] obs_t, action, state_t, reward, obs_tp1, done = data diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml index 75e6f51a..dcf9b358 100644 --- a/configs/vdn_3s5z_vs_3s6z.yaml +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -15,6 +15,27 @@ params: dueling: True atoms: 1 noisy: False + cnn: + type: conv1d + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + convs: + - filters: 64 + kernel_size: 3 + strides: 2 + padding: 'same' + - filters: 128 + kernel_size: 3 + strides: 1 + padding: 'valid' + - filters: 256 + kernel_size: 3 + strides: 1 + padding: 'valid' mlp: units: [256] activation: relu @@ -30,7 +51,7 @@ params: gamma: 0.99 learning_rate: 0.0005 steps_per_epoch: 4 - batch_size: 128 + batch_size: 64 epsilon: 1.0 min_epsilon: 0.05 epsilon_decay_frames: 100000 @@ -39,7 +60,7 @@ params: name: 3s5z_vs_3s6z is_double: True score_to_win: 20 - num_steps_fill_buffer: 10000 + num_steps_fill_buffer: 200 replay_buffer_type: 'normal' replay_buffer_size: 100000 priority_beta: 0.4 @@ -52,7 +73,7 @@ params: games_to_track: 20 lr_schedule: None max_epochs: 100000 - grad_norm: 0.5 + grad_norm: 10 mix_with_state: False truncate_grads: True From 650c741ca6d933fea82909e5df3ad641868162bf Mon Sep 17 00:00:00 2001 From: Tarun Date: Mon, 22 Jun 2020 22:53:55 +0100 
Subject: [PATCH 13/14] final update --- configs/vdn_3s5z_vs_3s6z.yaml | 6 +-- configs/vdn_3s_vs_5z.yaml | 84 +++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 configs/vdn_3s_vs_5z.yaml diff --git a/configs/vdn_3s5z_vs_3s6z.yaml b/configs/vdn_3s5z_vs_3s6z.yaml index dcf9b358..240f18e3 100644 --- a/configs/vdn_3s5z_vs_3s6z.yaml +++ b/configs/vdn_3s5z_vs_3s6z.yaml @@ -60,14 +60,14 @@ params: name: 3s5z_vs_3s6z is_double: True score_to_win: 20 - num_steps_fill_buffer: 200 + num_steps_fill_buffer: 100000 replay_buffer_type: 'normal' - replay_buffer_size: 100000 + replay_buffer_size: 1000000 priority_beta: 0.4 priority_alpha: 0.6 beta_decay_frames: 100000 max_beta: 1 - steps_num: 128 + steps_num: 10 # episodes_to_log: 10 atoms_num: 1 games_to_track: 20 diff --git a/configs/vdn_3s_vs_5z.yaml b/configs/vdn_3s_vs_5z.yaml new file mode 100644 index 00000000..4fe035b0 --- /dev/null +++ b/configs/vdn_3s_vs_5z.yaml @@ -0,0 +1,84 @@ +label: "" +name: "" +params: + algo: + name: vdn + + model: + name: vdn + + load_checkpoint: False + load_path: "" + + network: + name: dqn + dueling: True + atoms: 1 + noisy: False + cnn: + type: conv1d + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + convs: + - filters: 64 + kernel_size: 3 + strides: 2 + padding: 'same' + - filters: 128 + kernel_size: 3 + strides: 1 + padding: 'valid' + - filters: 256 + kernel_size: 3 + strides: 1 + padding: 'valid' + mlp: + units: [256] + activation: relu + initializer: + name: variance_scaling_initializer + scale: 2 + regularizer: + name: 'None' + + config: + reward_shaper: + scale_value: 0.1 + gamma: 0.99 + learning_rate: 0.0005 + steps_per_epoch: 4 + batch_size: 64 + epsilon: 1.0 + min_epsilon: 0.05 + epsilon_decay_frames: 100000 + num_epochs_to_copy: 10000 + env_name: smac_cnn + name: 3s_vs_5z + is_double: True + score_to_win: 20 + num_steps_fill_buffer: 100000 + replay_buffer_type: 'normal' + replay_buffer_size: 1000000 + priority_beta: 0.4 + priority_alpha: 0.6 + beta_decay_frames: 100000 + max_beta: 1 + steps_num: 10 +# episodes_to_log: 10 + atoms_num: 1 + games_to_track: 20 + lr_schedule: None + max_epochs: 100000 + grad_norm: 10 + mix_with_state: False + truncate_grads: True + + env_config: + name: 3s_vs_5z + frames: 4 + transpose: True + random_invalid_step: False \ No newline at end of file From 072d2ac300cd13162fd11cfbd8d102b80347ee0e Mon Sep 17 00:00:00 2001 From: Christian Schroeder Date: Tue, 23 Jun 2020 20:31:01 +0100 Subject: [PATCH 14/14] created ReplayBufferCentralState --- algos_tf14/vdnagent.py | 7 +++-- common/experience.py | 62 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/algos_tf14/vdnagent.py b/algos_tf14/vdnagent.py index bce361e3..6d5b1ef6 100644 --- a/algos_tf14/vdnagent.py +++ b/algos_tf14/vdnagent.py @@ -72,10 +72,11 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log self.n_agents = self.env.env_info['n_agents'] if not self.is_prioritized: - self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size']) + self.exp_buffer = experience.ReplayBufferCentralState(config['replay_buffer_size']) else: - self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha']) - self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights') + raise NotImplementedError("Not implemented! 
PrioritizedReplayBuffer with CentralState") + #self.exp_buffer = experience.PrioritizedReplayBufferCentralState(config['replay_buffer_size'], config['priority_alpha']) + #self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights') self.batch_size_ph = tf.placeholder(tf.int32, name='batch_size_ph') self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.obs_shape, name='obs_ph') diff --git a/common/experience.py b/common/experience.py index 9312fb23..d8670bd5 100644 --- a/common/experience.py +++ b/common/experience.py @@ -4,7 +4,7 @@ from common.segment_tree import SumSegmentTree, MinSegmentTree -class ReplayBuffer(object): +class ReplayBufferCentralState(object): def __init__(self, size): """Create Replay buffer. Parameters @@ -65,6 +65,66 @@ def sample(self, batch_size): idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] return self._encode_sample(idxes) +class ReplayBuffer(object): + def __init__(self, size): + """Create Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + """ + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, obs_t, action, reward, obs_tp1, done): + data = (obs_t, action, reward, obs_tp1, done) + + if self._next_idx >= len(self._storage): + self._storage.append(data) + else: + self._storage[self._next_idx] = data + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] + for i in idxes: + data = self._storage[i] + obs_t, action, reward, obs_tp1, done = data + obses_t.append(np.array(obs_t, copy=False)) + actions.append(np.array(action, copy=False)) + rewards.append(reward) + obses_tp1.append(np.array(obs_tp1, copy=False)) + dones.append(done) + return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) + + def sample(self, batch_size): + """Sample a batch of experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. + """ + idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] + return self._encode_sample(idxes) + class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha):