Merge branch 'master' of github.com:schroederdewitt/rl_games into master
Christian Schroeder de Witt committed Oct 18, 2020
2 parents bf11583 + bfaee18 commit 438a4a5
Showing 28 changed files with 1,753 additions and 456 deletions.
292 changes: 139 additions & 153 deletions algos_tf14/a2c_discrete.py

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions algos_tf14/dqnagent.py
@@ -11,7 +11,7 @@
from common.categorical import CategoricalQ

class DQNAgent:
def __init__(self, sess, base_name, observation_space, action_space, config, logger):
def __init__(self, sess, base_name, observation_space, action_space, config, logger, central_state_space=None):
observation_shape = observation_space.shape
actions_num = action_space.n
self.config = config
@@ -60,12 +60,14 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log
self.v_max = self.config['v_max']
self.delta_z = (self.v_max - self.v_min) / (self.atoms_num - 1)
self.all_z = tf.range(self.v_min, self.v_max + self.delta_z, self.delta_z)
self.categorical = CategoricalQ(self.atoms_num, self.v_min, self.v_max)
self.categorical = CategoricalQ(self.atoms_num, self.v_min, self.v_max)

self.n_agents = self.env.env_info['n_agents']

if not self.is_prioritized:
self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size'])
self.exp_buffer = experience.ReplayBuffer(config['replay_buffer_size'], observation_space, self.n_agents)
else:
self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha'])
self.exp_buffer = experience.PrioritizedReplayBuffer(config['replay_buffer_size'], config['priority_alpha'], observation_space, self.n_agents)
self.sample_weights_ph = tf.placeholder(tf.float32, shape= [None,] , name='sample_weights')

self.obs_ph = tf.placeholder(observation_space.dtype, shape=(None,) + self.state_shape , name = 'obs_ph')
415 changes: 415 additions & 0 deletions algos_tf14/iqlagent.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions algos_tf14/model_builder.py
@@ -14,6 +14,7 @@ def __init__(self):
self.model_factory.register_builder('continuous_a2c_lstm_logstd', lambda network, **kwargs : models.LSTMModelA2CContinuousLogStd(network))
self.model_factory.register_builder('dqn', lambda network, **kwargs : models.AtariDQN(network))
self.model_factory.register_builder('vdn', lambda network, **kwargs : models.VDN_DQN(network))
self.model_factory.register_builder('iql', lambda network, **kwargs : models.IQL_DQN(network))


self.network_factory = object_factory.ObjectFactory()
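
The 'iql' entry mirrors the existing 'dqn' and 'vdn' registrations: each maps a model name to a lambda that wraps a network in the corresponding model class. A minimal resolution sketch, assuming ObjectFactory exposes a create(name, **kwargs) method (only register_builder appears in this diff) and using a placeholder network object:

import algos_tf14.models as models
from algos_tf14.model_builder import ModelBuilder

builder = ModelBuilder()
network = None  # placeholder: normally produced by builder.network_factory from the config
# Assumed factory API: create() dispatches to the lambda registered above,
# i.e. models.IQL_DQN(network).
iql_model = builder.model_factory.create('iql', network=network)
assert isinstance(iql_model, models.IQL_DQN)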
169 changes: 120 additions & 49 deletions algos_tf14/models.py

Large diffs are not rendered by default.

296 changes: 174 additions & 122 deletions algos_tf14/network_builder.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion algos_tf14/vdnagent.py
@@ -77,7 +77,7 @@ def __init__(self, sess, base_name, observation_space, action_space, config, log
self.n_agents = self.env.env_info['n_agents']

if not self.is_prioritized:
self.exp_buffer = experience.ReplayBufferCentralState(config['replay_buffer_size'])
self.exp_buffer = experience.ReplayBufferCentralState(config['replay_buffer_size'], observation_space, central_state_space, self.n_agents)
else:
raise NotImplementedError("Not implemented! PrioritizedReplayBuffer with CentralState")
#self.exp_buffer = experience.PrioritizedReplayBufferCentralState(config['replay_buffer_size'], config['priority_alpha'])
17 changes: 17 additions & 0 deletions common/env_configurations.py
@@ -192,6 +192,12 @@ def create_flex(path):

return env

def create_staghunt(name, **kwargs):
from envs.stag_hunt import StagHuntEnv
frames = kwargs.pop('frames', 1)
print(kwargs)
return wrappers.BatchedFrameStack(StagHuntEnv(1, **kwargs), frames, transpose=False, flatten=True)

def create_smac(name, **kwargs):
from envs.smac_env import SMACEnv
frames = kwargs.pop('frames', 1)
@@ -212,6 +218,13 @@ def create_smac_cnn(name, **kwargs):
env = wrappers.BatchedFrameStack(env, frames, transpose=transpose)
return env

def create_staghunt_cnn(name, **kwargs):
from envs.stag_hunt import StagHuntEnv
env = StagHuntEnv(1, **kwargs)
frames = kwargs.pop('frames', 4)
transpose = kwargs.pop('transpose', False)
env = wrappers.BatchedFrameStack(env, frames, transpose=transpose)
return env

configurations = {
'CartPole-v1' : {
@@ -314,6 +327,10 @@ def create_smac_cnn(name, **kwargs):
'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid_hard.yaml'),
'vecenv_type' : 'ISAAC'
},
'staghunt': {
'env_creator': lambda **kwargs: create_staghunt_cnn(**kwargs),
'vecenv_type': 'RAY_SMAC'
},
'smac' : {
'env_creator' : lambda **kwargs : create_smac(**kwargs),
'vecenv_type' : 'RAY_SMAC'
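
The new 'staghunt' entry is resolved the same way every other registry entry is: RayWorker (in common/vecenv.py further down) looks up configurations[config_name]['env_creator'] and calls it with the experiment's env kwargs. A minimal sketch of that lookup; the kwargs are illustrative only, since StagHuntEnv's constructor is not part of this diff:

from common.env_configurations import configurations

def make_env(config_name, **config):
    # Same resolution pattern RayWorker.__init__ uses below.
    creator = configurations[config_name]['env_creator']
    return creator(**config)

# env = make_env('staghunt', name='staghunt', frames=4)  # kwargs are placeholders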
118 changes: 77 additions & 41 deletions common/experience.py
@@ -5,41 +5,60 @@


class ReplayBufferCentralState(object):
def __init__(self, size):
def __init__(self, size, ob_space, st_space, n_agents):
"""Create Replay buffer.
Parameters
----------
size: int
Max number of transitions to store in the buffer. When the buffer
overflows the old memories are dropped.
"""
self._storage = []

self._obses = np.zeros((size,) + (n_agents,) + ob_space.shape, dtype=ob_space.dtype)
self._next_obses = np.zeros((size,) + (n_agents,) + ob_space.shape, dtype=ob_space.dtype)
self._rewards = np.zeros(size)
self._actions = np.zeros((size,) + (n_agents,), dtype=np.int32)
self._dones = np.zeros(size, dtype=np.bool)
self._states = np.zeros((size,) + st_space.shape, dtype=st_space.dtype)

self._maxsize = size
self._next_idx = 0
self._curr_size = 0

def __len__(self):
return len(self._storage)
return self._curr_size

def add(self, obs_t, action, state_t, reward, obs_tp1, done):
data = (obs_t, action, state_t, reward, obs_tp1, done)
# print("CAlled")
self._curr_size = min(self._curr_size + 1, self._maxsize)

self._obses[self._next_idx] = obs_t
self._next_obses[self._next_idx] = obs_tp1
self._rewards[self._next_idx] = reward
self._actions[self._next_idx] = action
self._dones[self._next_idx] = done
self._states[self._next_idx] = state_t

if self._next_idx >= len(self._storage):
self._storage.append(data)
else:
self._storage[self._next_idx] = data
self._next_idx = (self._next_idx + 1) % self._maxsize
# print(self._curr_size)

def _get(self, idx):
return self._obses[idx], self._actions[idx], self._states[idx], self._rewards[idx], self._next_obses[idx], self._dones[idx]

def _encode_sample(self, idxes):
obses_t, actions, states_t, rewards, obses_tp1, dones = [], [], [], [], [], []
batch_size = len(idxes)
obses_t, actions, states_t, rewards, obses_tp1, dones = [None] * batch_size, [None] * batch_size, [None] * batch_size, [None] * batch_size, [None] * batch_size, [None] * batch_size
it = 0
for i in idxes:
data = self._storage[i]
obs_t, action, state_t, reward, obs_tp1, done = data
obses_t.append(np.array(obs_t, copy=False))
actions.append(np.array(action, copy=False))
states_t.append(np.array(state_t, copy=False))
rewards.append(reward)
obses_tp1.append(np.array(obs_tp1, copy=False))
dones.append(done)
data = self._get(i)
obs_t, action, state, reward, obs_tp1, done = data
obses_t[it] = np.array(obs_t, copy=False)
actions[it] = np.array(action, copy=False)
states_t[it] = np.array(state, copy=False)
rewards[it] = reward
obses_tp1[it] = np.array(obs_tp1, copy=False)
dones[it] = done
it = it + 1
return np.array(obses_t), np.array(actions), np.array(states_t), np.array(rewards), np.array(obses_tp1), np.array(dones)

def sample(self, batch_size):
@@ -62,44 +81,61 @@ def sample(self, batch_size):
done_mask[i] = 1 if executing act_batch[i] resulted in
the end of an episode and 0 otherwise.
"""
idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
# print(self._curr_size)
idxes = [random.randint(0, self._curr_size - 1) for _ in range(batch_size)]
return self._encode_sample(idxes)
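
With the Python-list storage replaced by preallocated numpy arrays, the memory layout of the central-state buffer is fixed at construction time. A minimal sketch of the resulting shapes, assuming gym-style Box spaces (the buffer itself only needs objects exposing .shape and .dtype):

import numpy as np
from gym import spaces
from common.experience import ReplayBufferCentralState

ob_space = spaces.Box(low=0.0, high=1.0, shape=(24,), dtype=np.float32)  # per-agent observation
st_space = spaces.Box(low=0.0, high=1.0, shape=(48,), dtype=np.float32)  # central state
buf = ReplayBufferCentralState(size=1000, ob_space=ob_space, st_space=st_space, n_agents=3)

# Preallocated storage, one row per transition:
#   _obses / _next_obses : (1000, 3, 24)
#   _actions             : (1000, 3)  int32, one discrete action per agent
#   _states              : (1000, 48)
#   _rewards / _dones    : (1000,)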


class ReplayBuffer(object):
def __init__(self, size):
def __init__(self, size, ob_space, n_agents):
"""Create Replay buffer.
Parameters
----------
size: int
Max number of transitions to store in the buffer. When the buffer
overflows the old memories are dropped.
"""
self._storage = []
self._obses = np.zeros((size,) + (n_agents,) + ob_space.shape, dtype=ob_space.dtype)
self._next_obses = np.zeros((size,) + (n_agents,) + ob_space.shape, dtype=ob_space.dtype)
self._rewards = np.zeros(size)
self._actions = np.zeros((size,) + (n_agents,), dtype=np.int32)
self._dones = np.zeros(size, dtype=np.bool)

self._maxsize = size
self._next_idx = 0
self._curr_size = 0

def __len__(self):
return len(self._storage)
return self._curr_size

def add(self, obs_t, action, reward, obs_tp1, done):
data = (obs_t, action, reward, obs_tp1, done)

if self._next_idx >= len(self._storage):
self._storage.append(data)
else:
self._storage[self._next_idx] = data
self._curr_size = min(self._curr_size + 1, self._maxsize )

self._obses[self._next_idx] = obs_t
self._next_obses[self._next_idx] = obs_tp1
self._rewards[self._next_idx] = reward
self._actions[self._next_idx] = action
self._dones[self._next_idx] = done

self._next_idx = (self._next_idx + 1) % self._maxsize

def _get(self, idx):
return self._obses[idx], self._actions[idx], self._rewards[idx], self._next_obses[idx], self._dones[idx]

def _encode_sample(self, idxes):
obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
batch_size = len(idxes)
obses_t, actions, rewards, obses_tp1, dones = [None] * batch_size, [None] * batch_size, [None] * batch_size, [None] * batch_size, [None] * batch_size
it = 0
for i in idxes:
data = self._storage[i]
data = self._get(i)
obs_t, action, reward, obs_tp1, done = data
obses_t.append(np.array(obs_t, copy=False))
actions.append(np.array(action, copy=False))
rewards.append(reward)
obses_tp1.append(np.array(obs_tp1, copy=False))
dones.append(done)
obses_t[it] = np.array(obs_t, copy=False)
actions[it] = np.array(action, copy=False)
rewards[it] = reward
obses_tp1[it] = np.array(obs_tp1, copy=False)
dones[it] = done
it = it + 1
return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)

def sample(self, batch_size):
@@ -122,12 +158,12 @@ def sample(self, batch_size):
done_mask[i] = 1 if executing act_batch[i] resulted in
the end of an episode and 0 otherwise.
"""
idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
idxes = [random.randint(0, self._curr_size - 1) for _ in range(batch_size)]
return self._encode_sample(idxes)
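
The per-agent ReplayBuffer uses the same circular-index scheme without the central state: _curr_size grows until it reaches _maxsize, after which _next_idx wraps around and the oldest transitions are overwritten. A minimal add/sample round trip with dummy data, again assuming gym-style spaces:

import numpy as np
from gym import spaces
from common.experience import ReplayBuffer

ob_space = spaces.Box(low=0.0, high=1.0, shape=(24,), dtype=np.float32)
n_agents = 3
buf = ReplayBuffer(size=100, ob_space=ob_space, n_agents=n_agents)

for _ in range(10):
    obs = np.random.rand(n_agents, 24).astype(np.float32)
    next_obs = np.random.rand(n_agents, 24).astype(np.float32)
    actions = np.zeros(n_agents, dtype=np.int32)
    buf.add(obs, actions, 1.0, next_obs, False)

obs_b, act_b, rew_b, next_obs_b, done_b = buf.sample(4)
# obs_b: (4, 3, 24), act_b: (4, 3), rew_b and done_b: (4,)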


class PrioritizedReplayBuffer(ReplayBuffer):
def __init__(self, size, alpha):
def __init__(self, size, alpha, ob_space, n_agents):
"""Create Prioritized Replay buffer.
Parameters
----------
@@ -141,7 +177,7 @@ def __init__(self, size, alpha):
--------
ReplayBuffer.__init__
"""
super(PrioritizedReplayBuffer, self).__init__(size)
super(PrioritizedReplayBuffer, self).__init__(size, ob_space, n_agents)
assert alpha >= 0
self._alpha = alpha

@@ -162,7 +198,7 @@ def add(self, *args, **kwargs):

def _sample_proportional(self, batch_size):
res = []
p_total = self._it_sum.sum(0, len(self._storage) - 1)
p_total = self._it_sum.sum(0, self._curr_size - 1)
every_range_len = p_total / batch_size
for i in range(batch_size):
mass = random.random() * every_range_len + i * every_range_len
@@ -208,11 +244,11 @@ def sample(self, batch_size, beta):

weights = []
p_min = self._it_min.min() / self._it_sum.sum()
max_weight = (p_min * len(self._storage)) ** (-beta)
max_weight = (p_min * self._curr_size) ** (-beta)

for idx in idxes:
p_sample = self._it_sum[idx] / self._it_sum.sum()
weight = (p_sample * len(self._storage)) ** (-beta)
weight = (p_sample * self._curr_size) ** (-beta)
weights.append(weight / max_weight)
weights = np.array(weights)
encoded_sample = self._encode_sample(idxes)
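
The loop above is the standard prioritized-replay importance-sampling correction, now using self._curr_size as the buffer length N: each weight is (N * P(i)) ** (-beta), normalized by the largest possible weight (the one belonging to the minimum-probability transition) so all corrections lie in (0, 1]. A small numeric sketch with toy probabilities:

import numpy as np

beta = 0.4
n = 4                               # current buffer size
p = np.array([0.1, 0.2, 0.3, 0.4])  # sampling probabilities P(i) after the alpha exponent
w = (n * p) ** (-beta)              # raw importance weights
w_max = (n * p.min()) ** (-beta)    # weight of the least likely transition
print(w / w_max)                    # normalized weights, matching the code above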
@@ -234,8 +270,8 @@ def update_priorities(self, idxes, priorities):
assert len(idxes) == len(priorities)
for idx, priority in zip(idxes, priorities):
assert priority > 0
assert 0 <= idx < len(self._storage)
assert 0 <= idx < self._curr_size
self._it_sum[idx] = priority ** self._alpha
self._it_min[idx] = priority ** self._alpha

self._max_priority = max(self._max_priority, priority)
self._max_priority = max(self._max_priority, priority)
18 changes: 16 additions & 2 deletions common/vecenv.py
@@ -34,7 +34,12 @@ def reset(self):
class RayWorker:
def __init__(self, config_name, config):
self.env = configurations[config_name]['env_creator'](**config)
self.obs = self.env.reset()

res = self.env.reset()
if isinstance(res, tuple):
self.obs, self.central_state = res
else:
self.obs = res

def step(self, action):
next_state, reward, is_done, info = self.env.step(action)
@@ -136,7 +141,16 @@ def step(self, actions):
newrewards.append(crewards)
newdones.append(cdones)
newinfos.append(cinfos)
return np.concatenate(newobs, axis=0), np.concatenate(newrewards, axis=0), np.concatenate(newdones, axis=0), newinfos
#print("newobs: ", newobs)
#print("newrewards: ", newrewards)
#print("newdones: ", newdones)
#print("newinfos:", newinfos)
#raise Exception()
ro = np.concatenate(newobs, axis=0)
rr = np.concatenate(newrewards, axis=0)
rd = np.concatenate(newdones, axis=0)
ri = newinfos
return ro, rr, rd, ri

def has_action_masks(self):
return True
Remaining changed files are not shown.
