Commit
Merge pull request #3 from asokraju/crc-v7
Crc v7
asokraju authored Sep 27, 2020
2 parents 36e4f6b + b79f265 commit 13961bd
Showing 28 changed files with 12,117 additions and 46 deletions.
learner.py: 63 changes (43 additions & 20 deletions)
@@ -56,7 +56,7 @@ def _get_state(self):
self.state = np.array(random.sample(range(self.total_states), self.n_agents))

def _get_desired(self):
self.desired_state = np.array([0, 4, 14, 24, 32])#np.array(random.sample(range(self.total_states), self.n_agents) )
self.desired_state = np.array([0,5 , 30, 35, 32])#np.array(random.sample(range(self.total_states), self.n_agents) )

def _set_state(self, state):
self.state = state
@@ -128,6 +128,7 @@ def reset(self):
"""
self._get_state()
#self._get_desired()
self.done=False
return self.state

def step(self, a):
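
The functional change in this hunk is clearing `self.done` on every reset; presumably `step()` reports that flag back to the training loop, so a stale `True` left over from a finished episode would otherwise cut the next rollout short. A minimal illustration of how the pieces fit together, using only names visible in this diff (the class name and `total_states=36` are assumptions):

import random
import numpy as np

class EnvResetSketch:
    """Illustrative only, not the repository's environment class."""
    def __init__(self, total_states=36, n_agents=5):
        self.total_states = total_states
        self.n_agents = n_agents
        self.done = False

    def _get_state(self):
        # one distinct random cell per agent, mirroring _get_state in the diff
        self.state = np.array(random.sample(range(self.total_states), self.n_agents))

    def reset(self):
        self._get_state()
        self.done = False  # the flag added by this commit
        return self.state
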
@@ -237,6 +238,7 @@ def train_multi_agent(env, args, actors, critics, rew_approx, reward_result):
obs, obs_scaled, actions, rewards = [[] for _ in range(nodes)], [[] for _ in range(nodes)], [[] for _ in range(nodes)], [[] for _ in range(nodes)]
predicted_rewards_all = [[] for _ in range(nodes)]
rewards_mean = [[] for _ in range(nodes)]

#rewards_predicted = [[] for _ in range(nodes)]
done = False
#running the episode
@@ -252,6 +254,7 @@ def train_multi_agent(env, args, actors, critics, rew_approx, reward_result):
if done:
print(t,j)
while not done:
#for j in range(args['max_episode_len']):
for node in range(nodes):
if args['scaling']:
a = get_action(actors[node], obs_scaled[node][-1], eps = eps)
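
Actions come from `get_action(actors[node], ..., eps=eps)`, which is defined outside this hunk. From the call site it looks like an epsilon-softened draw from the actor's output distribution; the sketch below is only a guess at that shape, and the exploration branch and array shapes are assumptions:

import numpy as np

def get_action_sketch(actor, observation, eps=0.0, n_actions=5):
    """Hypothetical epsilon-greedy sampler over a Keras softmax policy."""
    if np.random.rand() < eps:
        return np.random.randint(n_actions)            # explore uniformly
    probs = actor.predict(np.reshape(observation, (1, -1)))[0]
    probs = probs / probs.sum()                        # guard against float rounding
    return int(np.random.choice(n_actions, p=probs))   # sample from the policy
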
@@ -283,7 +286,10 @@ def train_multi_agent(env, args, actors, critics, rew_approx, reward_result):
act_loss, crit_loss, rew_loss = [], [], []
#training the reward network
for node in range(nodes):
states = np.vstack(obs_scaled[node][:-1])
if args['scaling']:
states = np.vstack(obs_scaled[node][:-1])
else:
states = np.vstack(obs[node][:-1])

for _ in range(100):
r_loss = rew_approx[node].train_on_batch(states, np.reshape(rewards[node], (-1,1)))
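
The new `if args['scaling']` branch lets the reward network be fit on either the raw or the scaled observations; the fit itself is a plain regression of per-step rewards on states via 100 repeated `train_on_batch` calls. A Keras model compatible with that call could look like the following (layer sizes and optimizer are assumptions; only the input/output shapes are fixed by the loop above):

import numpy as np
import tensorflow as tf

def build_reward_approx_sketch(state_dim, lr=0.01):
    """Hypothetical reward approximator: state vector in, scalar reward estimate out."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(state_dim,)),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='mse')
    return model

# usage mirroring the loop above, for one node's episode of length T:
#   states: (T, state_dim), rewards_node: length T
# for _ in range(100):
#     r_loss = model.train_on_batch(states, np.reshape(rewards_node, (-1, 1)))
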
@@ -298,21 +304,37 @@ def train_multi_agent(env, args, actors, critics, rew_approx, reward_result):
rew_approx[node].trainable_variables[i].assign(temp/env.n_agents)

#training the Actor and Critic networks
node_true_returns = []
node_predicted_returns = []
for node in range(nodes):
states = np.vstack(obs_scaled[node][:-1])
final_state = np.vstack(obs_scaled[node][-1])
if args['scaling']:
states = np.vstack(obs_scaled[node][:-1])
final_state = np.vstack(obs_scaled[node][-1])
else:
states = np.vstack(obs[node][:-1])
final_state = np.vstack(obs[node][-1])
predicted_rewards = rew_approx[node].predict(states)
predicted_rewards_all[node].append(predicted_rewards)
if args['adversary']:
if node==0:
returns = discount_reward(rewards[node], GAMMA=args['gamma'])
else:
returns = discount_reward(predicted_rewards, GAMMA=args['gamma'])
else:
returns = discount_reward(predicted_rewards, GAMMA=args['gamma'])
# if args['adversary']:
# if node==0:
# returns = discount_reward(rewards[node], GAMMA=args['gamma'])
# else:
# returns = discount_reward(predicted_rewards, GAMMA=args['gamma'])
# else:
# returns = discount_reward(predicted_rewards, GAMMA=args['gamma'])
# returns -= np.mean(returns)
# returns /= np.std(returns)

predicted_returns = discount_reward(predicted_rewards, GAMMA=args['gamma'])
predicted_returns -= np.mean(predicted_returns)
predicted_returns /= np.std(predicted_returns)
node_predicted_returns.append(predicted_returns[0])

returns = discount_reward(rewards_mean[node], GAMMA=args['gamma'])
returns -= np.mean(returns)
returns /= np.std(returns)

node_true_returns.append(returns[0])

targets_actions = np.array([[1 if a==i else 0 for i in range(env.n_actions)] for j, a in enumerate(actions[node])])

V_s0 = critics[node].predict(states)
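
`discount_reward` is used throughout this hunk but defined elsewhere in the file; given that its output is mean/std-normalised and later added to a terminal-value correction to form the critic targets, it is presumably the standard discounted cumulative sum, returned as a column vector. A minimal sketch under that assumption, together with the normalisation applied above:

import numpy as np

def discount_reward_sketch(rewards, GAMMA=0.95):
    """Presumed behaviour: G_t = r_t + GAMMA*r_{t+1} + GAMMA^2*r_{t+2} + ..."""
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + GAMMA * running
        returns[t] = running
    return returns.reshape(-1, 1)  # column shape matches its use with fin_discount above

# normalisation as applied to both the true and the predicted returns in this hunk:
# returns = discount_reward_sketch(rewards_mean_node, GAMMA=0.95)
# returns -= np.mean(returns)
# returns /= np.std(returns)
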
@@ -322,10 +344,10 @@ def train_multi_agent(env, args, actors, critics, rew_approx, reward_result):
td = returns + fin_discount.reshape((j,1)) - V_s0


for _ in range(100-np.min([np.int(t/1), 99])):
for _ in range(100):
c_loss = critics[node].train_on_batch(states, returns+fin_discount.reshape((j,1)))

for _ in range(10-np.min([np.int(t/10),9])):
for _ in range(10):
a_loss = actors[node].train_on_batch(states, targets_actions, sample_weight=td.reshape(-1, ))
act_loss.append(a_loss)
crit_loss.append(c_loss)
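
The actor update is the usual policy-gradient-as-classification pattern: one-hot action targets under a categorical cross-entropy loss, with the per-step TD error passed as `sample_weight` so each step's log-probability is scaled by its advantage estimate (`fin_discount` is computed outside this hunk). A compressed sketch of an actor compatible with that call (architecture and optimiser are assumptions):

import tensorflow as tf

def build_actor_sketch(state_dim, n_actions, lr=0.01):
    """Hypothetical actor for train_on_batch(states, one_hot_actions, sample_weight=td)."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(state_dim,)),
        tf.keras.layers.Dense(n_actions, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                  loss='categorical_crossentropy')
    return model

# with a one-hot target, categorical cross-entropy is -log pi(a_t | s_t), so passing
# sample_weight = td turns each step of the batch into a -td * log pi(a_t | s_t) update:
# actor.train_on_batch(states, targets_actions, sample_weight=td.reshape(-1,))
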
@@ -342,20 +364,21 @@ def train_multi_agent(env, args, actors, critics, rew_approx, reward_result):
with writer.as_default():
tf.summary.scalar("actor loss", np.mean(act_loss), step = t)
tf.summary.scalar("critic loss", np.mean(crit_loss), step = t)
tf.summary.scalar("critic loss", np.mean(rew_loss), step = t)
tf.summary.scalar("rew loss", np.mean(rew_loss), step = t)
writer.flush()
print('| Reward: {} | Episode: {} | actor loss: {} |critic loss: {} | reward loss: {} | done: {}'.format(ep_reward, t, np.mean(act_loss), np.mean(crit_loss),np.mean(rew_loss), done))
# fig, ax = plt.subplots(nrows=1, ncols=5, figsize = (24,4))
# for i in range(5):
# ax[i].plot(range(j), rewards[i])
# plt.show()
reward_result[t] = ep_reward.sum()

path = {
"Observation":obs,
"Action":actions,#np.concatenate(actions),
"Reward":rewards,#np.asarray(rewards)
"predicted_rewards_all":predicted_rewards_all
"predicted_rewards_all":predicted_rewards_all,
"true returns":np.mean(node_true_returns),
"predicted returns":np.mean(node_predicted_returns)
}
paths.append(path)
break
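
The `path` dictionary now also records the mean (normalised) first-step true and predicted returns across nodes, which makes it easy to check offline how closely the learned reward model tracks the real rewards. A small reader for those two new keys (only the key names come from the code above):

import numpy as np

def summarise_paths_sketch(paths):
    """Compare the logged mean true vs. predicted (normalised) returns per episode."""
    true_r = np.array([p["true returns"] for p in paths])
    pred_r = np.array([p["predicted returns"] for p in paths])
    print("mean |true - predicted| return over episodes:", np.abs(true_r - pred_r).mean())
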
@@ -510,11 +533,11 @@ def main(args, reward_result):
#parser.add_argument('--save_model', help='Saving model from summary_dir', type = bool, default=False)
#parser.add_argument('--load_model', help='Loading model from summary_dir', type = bool, default=True)
parser.add_argument('--random_seed', help='seeding the random number generator', type = int, default=1754)
parser.add_argument('--adversary', help='Is node 1 an adversary?', type = bool, default=True)
parser.add_argument('--adversary', help='Is node 1 an adversary?', type = bool, default=False)

#agent params
parser.add_argument('--max_episodes', help='max number of episodes', type = int, default=200)
parser.add_argument('--max_episode_len', help='Number of steps per epsiode', type = int, default=1000)
parser.add_argument('--max_episodes', help='max number of episodes', type = int, default=4)
parser.add_argument('--max_episode_len', help='Number of steps per epsiode', type = int, default=100)
parser.add_argument('--actor_lr', help='actor network learning rate',type =float, default=0.01)
parser.add_argument('--critic_lr', help='critic network learning rate',type =float, default=0.01)
parser.add_argument('--rew_lr', help='critic network learning rate',type =float, default=0.01)
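
One caveat on the boolean flag above: `argparse` with `type=bool` does not parse booleans the way it appears to, since any non-empty string (including the literal "False") is truthy; only the default written in the file is reliable, and `--adversary False` on the command line would still evaluate to True. If command-line control over the flag is wanted, a converter along these lines is the usual workaround (a sketch, not part of this commit):

import argparse

def str2bool(v):
    """Parse common true/false spellings instead of relying on bool('False') == True."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument('--adversary', help='Is node 1 an adversary?',
                    type=str2bool, default=False)
# or, for a plain on/off switch:
# parser.add_argument('--adversary', action='store_true')
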
