ActorCritic and DDPG #118

Merged: 49 commits, Apr 19, 2017

Commits (changes from all 49 commits)
a665bf3  Working discrete actor critic model  (lgraesser, Apr 8, 2017)
c42d991  Uncommenting analyze data  (lgraesser, Apr 8, 2017)
6f76d42  style fix, scehdule ac experiment  (kengz, Apr 9, 2017)
2f5c642  stylefix  (kengz, Apr 9, 2017)
0c4bf3a  Adding AC specs  (lgraesser, Apr 10, 2017)
6041209  Merge remote-tracking branch 'origin/master' into policy-gradient  (kengz, Apr 10, 2017)
1be0cb3  add component locks for ActorCritic and DDPG  (kengz, Apr 10, 2017)
e8aab7c  schedule ac on cartpole and pendulum  (kengz, Apr 10, 2017)
28d3cc8  reorder component locks  (kengz, Apr 10, 2017)
1565a7a  add ac discrete component lock, fix and check all ac specs  (kengz, Apr 10, 2017)
e9ad662  add variance to pendulum gaussian search, narrow search space  (kengz, Apr 10, 2017)
c3c4ffb  add action bounds to env_spec  (kengz, Apr 11, 2017)
7eaed26  fix boundedpolicy to auto-bound from env-spec  (kengz, Apr 11, 2017)
34b1b58  Adding Acrobot specs  (lgraesser, Apr 11, 2017)
9b415c3  Merge branch 'policy-gradient' of https://github.com/kengz/openai_lab…  (lgraesser, Apr 11, 2017)
3bd227f  schedule other ac experiments  (kengz, Apr 11, 2017)
d11abfd  Merge branch 'policy-gradient' of https://github.com/kengz/openai_lab…  (kengz, Apr 11, 2017)
e8e6877  Fixing mem len param  (lgraesser, Apr 11, 2017)
3d553ee  Merge branch 'policy-gradient' of https://github.com/kengz/openai_lab…  (lgraesser, Apr 11, 2017)
75198e4  add ddpg fix attempt  (kengz, Apr 11, 2017)
edd2276  Merge remote-tracking branch 'origin/master' into policy-gradient  (kengz, Apr 16, 2017)
47a5f8c  ddpg with bounded actions  (kengz, Apr 16, 2017)
ee011b1  Merge remote-tracking branch 'origin/master' into policy-gradient  (kengz, Apr 16, 2017)
56e7f95  permami broken with reshape to len manually  (kengz, Apr 16, 2017)
6215058  fixing permami shape one at a time; absolutely disgusting code  (kengz, Apr 16, 2017)
434de8b  disgusting ddpg hack running  (kengz, Apr 16, 2017)
4fa1dbc  comment out print  (kengz, Apr 16, 2017)
a691401  fucking got it, culprit was predicted_q_val shape  (kengz, Apr 17, 2017)
5a827c3  fix permami typo  (kengz, Apr 17, 2017)
fd17088  runnable ddpg2 from permami, still not working yet  (kengz, Apr 17, 2017)
bc0e3f9  DDPG2 WORKING AT LAST  (kengz, Apr 18, 2017)
ea48d79  refactor ddpg and rename methods, variables properly  (kengz, Apr 18, 2017)
366f229  use tf losses; return critic_loss from run  (kengz, Apr 18, 2017)
1e79a98  restore critic_loss  (kengz, Apr 18, 2017)
c4d21b8  source ddpg main class from dqn; propagate some param settings properly  (kengz, Apr 18, 2017)
0be8381  add compatible spec  (kengz, Apr 18, 2017)
7e4c28e  externalize select action to policy  (kengz, Apr 18, 2017)
402fb71  refactor noise policies for ddpg  (kengz, Apr 18, 2017)
22c331f  separate critic_lr for Critic  (kengz, Apr 18, 2017)
9ab35e9  rename base to NoNoisePolicy as proper  (kengz, Apr 18, 2017)
fb6059d  remove DDPGBoundedPolicy, already built in to DDPG  (kengz, Apr 18, 2017)
a1524d1  remove useless ddpg examples  (kengz, Apr 18, 2017)
da54f53  rename ddpg2 to ddpg  (kengz, Apr 18, 2017)
d30fe83  stylefix  (kengz, Apr 18, 2017)
e469f72  warn instead of break for component lock  (kengz, Apr 19, 2017)
3b766a9  mute double dqn recompile both models till performance is fixed  (kengz, Apr 19, 2017)
c650b04  fix graph rendering on single trial by mpl backend switching  (kengz, Apr 19, 2017)
be6c2a9  rename noise policies properly  (kengz, Apr 19, 2017)
0b50a69  add ac, ddpg tests  (kengz, Apr 19, 2017)
139 changes: 139 additions & 0 deletions rl/agent/actor_critic.py
@@ -0,0 +1,139 @@
import numpy as np
from rl.agent.dqn import DQN
from rl.util import logger


class ActorCritic(DQN):

'''
    Actor-Critic algorithm. The actor's policy
    is adjusted in the direction that will lead to
    better actions, guided by the critic.
    Implementation adapted from
    http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html

    Assumes one of the policies in actor_critic.py is being used.
'''

def __init__(self, env_spec,
train_per_n_new_exp=1,
gamma=0.95, lr=0.1,
epi_change_lr=None,
batch_size=16, n_epoch=5, hidden_layers=None,
hidden_layers_activation='sigmoid',
output_layer_activation='linear',
auto_architecture=False,
num_hidden_layers=3,
first_hidden_layer_size=256,
num_initial_channels=16,
**kwargs): # absorb generic param without breaking
# import only when needed to contain side-effects
from keras.layers.core import Dense
from keras.models import Sequential, load_model
self.Dense = Dense
self.Sequential = Sequential
self.load_model = load_model

super(ActorCritic, self).__init__(env_spec,
train_per_n_new_exp,
gamma, lr,
epi_change_lr,
batch_size, n_epoch, hidden_layers,
hidden_layers_activation,
output_layer_activation,
auto_architecture,
num_hidden_layers,
first_hidden_layer_size,
num_initial_channels,
**kwargs)

def build_model(self):
self.build_actor()
self.build_critic()
logger.info("Actor and critic models built")

def build_actor(self):
actor = self.Sequential()
super(ActorCritic, self).build_hidden_layers(actor)
actor.add(self.Dense(self.env_spec['action_dim'],
init='lecun_uniform',
activation=self.output_layer_activation))
logger.info("Actor summary")
actor.summary()
self.actor = actor

def build_critic(self):
critic = self.Sequential()
super(ActorCritic, self).build_hidden_layers(critic)
critic.add(self.Dense(1,
init='lecun_uniform',
activation=self.output_layer_activation))
logger.info("Critic summary")
critic.summary()
self.critic = critic

def compile_model(self):
self.actor.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
self.critic.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
logger.info("Actor and critic compiled")

def recompile_model(self, sys_vars):
'''
Option to change model optimizer settings
Currently only used for changing the learning rate
Compiling does not affect the model weights
'''
if self.epi_change_lr is not None:
if (sys_vars['epi'] == self.epi_change_lr and
sys_vars['t'] == 0):
self.lr = self.lr / 10.0
self.optimizer.change_optim_param(**{'lr': self.lr})
self.actor.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
self.critic.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
logger.info(
'Actor and critic models recompiled with new settings: '
'Learning rate: {}'.format(self.lr))

    def train_critic(self, minibatch):
        Q_vals = np.clip(self.critic.predict(minibatch['states']),
                         -self.clip_val, self.clip_val)
        Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']),
                              -self.clip_val, self.clip_val)
        # TD target: r + gamma * V(s'), zeroed for terminal transitions
        Q_targets = minibatch['rewards'] + self.gamma * \
            (1 - minibatch['terminals']) * Q_next_vals.squeeze()
        Q_targets = np.expand_dims(Q_targets, axis=1)

        # delta passed to the actor: next-state minus current-state value
        actor_delta = Q_next_vals - Q_vals
        loss = self.critic.train_on_batch(minibatch['states'], Q_targets)

        errors = abs(np.sum(Q_vals - Q_targets, axis=1))
        self.memory.update(errors)
        return loss, actor_delta

    def train_actor(self, minibatch, actor_delta):
        old_vals = self.actor.predict(minibatch['states'])
        if self.env_spec['actions'] == 'continuous':
            # continuous actions: broadcast the delta to every action dim
            A_targets = np.zeros(
                (actor_delta.shape[0], self.env_spec['action_dim']))
            for j in range(A_targets.shape[1]):
                A_targets[:, j] = actor_delta.squeeze()
        else:
            # discrete actions: shift the taken (one-hot) action's output
            # by the delta, keep the other outputs at current predictions
            A_targets = minibatch['actions'] * actor_delta + \
                (1 - minibatch['actions']) * old_vals

        loss = self.actor.train_on_batch(minibatch['states'], A_targets)
        return loss

def train_an_epoch(self):
minibatch = self.memory.rand_minibatch(self.batch_size)
critic_loss, actor_delta = self.train_critic(minibatch)
actor_loss = self.train_actor(minibatch, actor_delta)
return critic_loss + actor_loss
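
For reference, the targets built in train_critic and train_actor are easier to follow with concrete numbers. The sketch below is not part of the diff: it mirrors those two methods for a hypothetical 2-action discrete environment and a batch of three transitions, with every value made up. The names (gamma, rewards, terminals, Q_vals, actor_delta) follow the code above.

# Standalone illustration of the target computations, toy data only.
import numpy as np

gamma = 0.95
rewards = np.array([1.0, 1.0, 0.0])
terminals = np.array([0, 0, 1])
# one-hot encoded actions taken, shape (batch, action_dim)
actions = np.array([[1, 0], [0, 1], [1, 0]], dtype=float)

# critic value estimates for s and s', shape (batch, 1)
Q_vals = np.array([[0.5], [0.2], [0.9]])
Q_next_vals = np.array([[0.6], [0.4], [0.1]])

# critic target: r + gamma * V(s'), zeroed at terminal states
Q_targets = rewards + gamma * (1 - terminals) * Q_next_vals.squeeze()
Q_targets = np.expand_dims(Q_targets, axis=1)   # shape (3, 1)

# delta handed to the actor
actor_delta = Q_next_vals - Q_vals              # shape (3, 1)

# discrete-action actor target: move the taken action's output by the
# delta, keep the other actions at the actor's current predictions
old_vals = np.array([[0.3, 0.7], [0.6, 0.4], [0.5, 0.5]])
A_targets = actions * actor_delta + (1 - actions) * old_vals

print(Q_targets.ravel())   # [1.57 1.38 0.  ]
print(A_targets)

In the continuous case, train_actor instead copies actor_delta into every column of A_targets, which is what the for-loop over action dimensions does.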