DDPG fixes #75

Closed
wants to merge 11 commits
1 change: 1 addition & 0 deletions rl/agent/base_agent.py
@@ -10,6 +10,7 @@ class Agent(object):
def __init__(self, env_spec,
**kwargs): # absorb generic param without breaking
self.env_spec = env_spec
+ self.type = 'keras'

def compile(self, memory, optimizer, policy, preprocessor):
# set 2 way references
78 changes: 50 additions & 28 deletions rl/agent/ddpg.py
@@ -1,5 +1,7 @@
from rl.agent.dqn import DQN
- from rl.util import logger, clone_model, clone_optimizer
+ from rl.util import logger, clone_model, clone_optimizer, ddpg_weight_init, tanh2, normal_02
+ import math



class DDPG(DQN):
@@ -12,16 +14,28 @@ class DDPG(DQN):

def __init__(self, *args, **kwargs):
# import only when needed to contain side-effects
+ import numpy as np
+ np.random.seed(1234)
from keras.layers import Dense, Merge
from keras.models import Sequential
from keras import backend as K
+ from keras.initializations import uniform
self.Dense = Dense
self.Merge = Merge
self.Sequential = Sequential
+ self.uniform = uniform
self.K = K
+ import tensorflow as tf
+ self.tf = tf
+ self.tf.set_random_seed(1234)
+ self.sess = tf.Session()
+ K.set_session(self.sess)

self.TAU = 0.001 # for target network updates
super(DDPG, self).__init__(*args, **kwargs)
+ self.lr_actor = 0.0001
+ # print("ACTOR LEARNING RATE")
+ # print(self.lr_actor)
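
Note (not part of the diff): self.TAU above configures DDPG's soft target-network update, which lives outside this hunk. A minimal sketch of that pattern, assuming Keras-style models exposing get_weights/set_weights:

```python
def soft_update(source, target, tau=0.001):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    source_weights = source.get_weights()
    target_weights = target.get_weights()
    target.set_weights(
        [tau * sw + (1.0 - tau) * tw
         for sw, tw in zip(source_weights, target_weights)])
```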

def compile(self, memory, optimizer, policy, preprocessor):
# override to make 4 optimizers
@@ -39,55 +53,62 @@ def compile(self, memory, optimizer, policy, preprocessor):

super(DDPG, self).compile(memory, self.optimizer, policy, preprocessor)

- def build_actor_models(self):
+ def build_actor_models(self, weight_init):
model = self.Sequential()
- self.build_hidden_layers(model)
+ self.build_hidden_layers(model, normal_02)
model.add(self.Dense(self.env_spec['action_dim'],
- init='lecun_uniform',
- activation=self.output_layer_activation))
+ init='uniform',
+ # activation=self.output_layer_activation))
+ activation='tanh2'))
logger.info('Actor model summary')
model.summary()
self.actor = model
self.target_actor = clone_model(self.actor)

- def build_critic_models(self):
+ def build_critic_models(self, weight_init):
state_branch = self.Sequential()
state_branch.add(self.Dense(
- self.hidden_layers[0],
+ self.hidden_layers[0] if len(self.hidden_layers) > 1 else math.floor(self.hidden_layers[0] * 1.25),
input_shape=(self.env_spec['state_dim'],),
activation=self.hidden_layers_activation,
- init='lecun_uniform'))
+ init='normal'))
+ state_branch.add(self.Dense(
+ self.hidden_layers[1] if len(self.hidden_layers) > 1 else self.hidden_layers[0],
+ activation=self.hidden_layers_activation,
+ init='normal'))

# add action branch to second layer of the network
action_branch = self.Sequential()
action_branch.add(self.Dense(
- self.hidden_layers[0],
+ self.hidden_layers[1] if len(self.hidden_layers) > 1 else self.hidden_layers[0],
input_shape=(self.env_spec['action_dim'],),
activation=self.hidden_layers_activation,
- init='lecun_uniform'))
+ init='normal'))

input_layer = self.Merge([state_branch, action_branch], mode='concat')

model = self.Sequential()
model.add(input_layer)

- if (len(self.hidden_layers) > 1):
- for i in range(1, len(self.hidden_layers)):
- model.add(self.Dense(
- self.hidden_layers[i],
- init='lecun_uniform',
- activation=self.hidden_layers_activation))
+ # if (len(self.hidden_layers) > 1):
+ # for i in range(2, len(self.hidden_layers)):
+ # model.add(self.Dense(
+ # self.hidden_layers[i],
+ # init=normal_02,
+ # # use_bias=True,
+ # activation=self.hidden_layers_activation))

model.add(self.Dense(1,
- init='lecun_uniform',
+ init='uniform',
activation=self.output_layer_activation))
logger.info('Critic model summary')
model.summary()
self.critic = model
self.target_critic = clone_model(self.critic)
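
Note (not part of the diff): the critic above merges a state branch and an action branch before the output layer, using the Keras 1 Merge layer. A rough sketch of the same two-branch Q(s, a) shape in the modern tf.keras functional API; layer sizes, activations, and names here are illustrative assumptions, not the PR's values:

```python
import tensorflow as tf
from tensorflow.keras import layers, Model

def build_two_branch_critic(state_dim, action_dim, hidden=(64, 64)):
    state_in = layers.Input(shape=(state_dim,), name='state')
    action_in = layers.Input(shape=(action_dim,), name='action')

    # State branch: two hidden layers.
    s = layers.Dense(hidden[0], activation='relu')(state_in)
    s = layers.Dense(hidden[1], activation='relu')(s)

    # Action branch joins at the second hidden layer, then both are concatenated.
    a = layers.Dense(hidden[1], activation='relu')(action_in)
    x = layers.Concatenate()([s, a])

    q_value = layers.Dense(1, activation='linear')(x)  # scalar Q(s, a)
    return Model(inputs=[state_in, action_in], outputs=q_value)
```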

def build_model(self):
- self.build_actor_models()
- self.build_critic_models()
+ self.build_actor_models(self.weight_init)
+ self.build_critic_models(self.weight_init)

def custom_critic_loss(self, y_true, y_pred):
return self.K.mean(self.K.square(y_true - y_pred))
@@ -100,17 +121,14 @@ def compile_model(self):
self.actor.output, self.actor.trainable_weights,
-self.action_gradient)
self.actor_optimize = self.K.tf.train.AdamOptimizer(
- self.lr).apply_gradients(
+ self.lr_actor).apply_gradients(
zip(self.actor_grads, self.actor.trainable_weights))

self.critic_state = self.critic.inputs[0]
self.critic_action = self.critic.inputs[1]
- self.critic_action_grads = self.K.tf.gradients(
+ self.critic_action_grads = self.tf.gradients(
self.critic.output, self.critic_action)

- # self.actor.compile(
- # loss='mse',
- # optimizer=self.optimizer.actor_keras_optimizer)
self.target_actor.compile(
loss='mse',
optimizer=self.optimizer.target_actor_keras_optimizer)
@@ -124,6 +142,10 @@ def compile_model(self):
optimizer=self.optimizer.target_critic_keras_optimizer)
logger.info("Critic Models compiled")

+ init_op = self.tf.global_variables_initializer()
+ self.sess.run(init_op)
+ logger.info("Tensorflow variables initialized")

def update(self, sys_vars):
'''Agent update apart from training the Q function'''
self.policy.update(sys_vars)
@@ -138,23 +160,23 @@ def train_critic(self, minibatch):
(1 - minibatch['terminals']) * Q_prime
critic_loss = self.critic.train_on_batch(
[minibatch['states'], minibatch['actions']], y)

return critic_loss
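
Note (not part of the diff): the y used above is the Bellman target r + gamma * (1 - done) * Q'(s', mu'(s')); the leading reward term of that expression falls outside the excerpt shown here. A tiny numpy sketch with made-up numbers:

```python
import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.0]])    # hypothetical minibatch of size 2
terminals = np.array([[0.0], [1.0]])  # second transition ends the episode
Q_prime = np.array([[2.5], [3.0]])    # target_critic([next_states, target_actor(next_states)])

y = rewards + gamma * (1.0 - terminals) * Q_prime
# y == [[3.475], [0.0]] -- terminal transitions keep only the immediate reward
```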

def train_actor(self, minibatch):
'''update actor network using sampled gradient'''
actions = self.actor.predict(minibatch['states'])
# critic_grads = critic.gradients(minibatch['states'], actions)
- critic_grads = self.K.get_session().run(
+ critic_grads = self.sess.run(
self.critic_action_grads, feed_dict={
self.critic_state: minibatch['states'],
self.critic_action: actions
})[0]

# actor.train(minibatch['states'], critic_grads)
- self.K.get_session().run(self.actor_optimize, feed_dict={
+ self.sess.run(self.actor_optimize, feed_dict={
self.actor_state: minibatch['states'],
self.action_gradient: critic_grads
})

actor_loss = 0
return actor_loss
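
Note (not part of the diff): the two sess.run calls in train_actor implement the sampled deterministic policy gradient: the critic supplies dQ/da at the actor's current actions, and the actor parameters then move along (dQ/da) * (dmu/dtheta). The actor_loss returned is a constant 0 because this update never materializes a scalar loss. A tiny numpy illustration of that chain rule for a hypothetical linear actor mu(s) = s @ W:

```python
import numpy as np

s = np.array([[1.0, 2.0]])     # one state, state_dim = 2
W = np.array([[0.5], [-0.3]])  # linear actor weights, action_dim = 1
dQ_da = np.array([[0.8]])      # critic gradient at a = mu(s), from the first sess.run

# Chain rule for a linear actor: dJ/dW = s^T @ dQ/da
dJ_dW = s.T @ dQ_da            # [[0.8], [1.6]]

lr = 0.0001
W_ascended = W + lr * dJ_dW    # ascend Q; the graph feeds -dQ/da so Adam's minimize step does this
```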
