import numpy as np
from rl.agent.base_agent import Agent
from rl.util import logger, log_self


class DQN(Agent):

    '''
    The base class of DQNs, with the core methods.
    The simplest deep Q-network:
    epsilon-greedy action selection and
    the Bellman equation for value updates, using a neural net.
    '''
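
    # The value update implemented in compute_Q_targets() is the standard
    # Bellman target: Q_target(s, a) = r + gamma * max_a' Q(s', a'),
    # with the future term zeroed out on terminal transitions.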

    def __init__(self, env_spec,
                 train_per_n_new_exp=1,
                 gamma=0.95, lr=0.1,
                 epi_change_lr=None,
                 batch_size=16, n_epoch=5, hidden_layers=None,
                 hidden_layers_activation='sigmoid',
                 output_layer_activation='linear',
                 auto_architecture=False,
                 num_hidden_layers=3,
                 first_hidden_layer_size=256,
                 num_initial_channels=16,
                 **kwargs):  # absorb generic param without breaking
        # import only when needed to contain side-effects
        from keras.layers.core import Dense
        from keras.models import Sequential, load_model
        self.Dense = Dense
        self.Sequential = Sequential
        self.load_model = load_model

        super(DQN, self).__init__(env_spec)

        self.train_per_n_new_exp = train_per_n_new_exp
        self.gamma = gamma
        self.lr = lr
        self.epi_change_lr = epi_change_lr
        self.batch_size = batch_size
        self.n_epoch = 1
        self.final_n_epoch = n_epoch
        self.hidden_layers = hidden_layers or [4]
        self.hidden_layers_activation = hidden_layers_activation
        self.output_layer_activation = output_layer_activation
        self.clip_val = 10000
        self.auto_architecture = auto_architecture
        self.num_hidden_layers = num_hidden_layers
        self.first_hidden_layer_size = first_hidden_layer_size
        self.num_initial_channels = num_initial_channels
        log_self(self)
        self.build_model()

    def build_hidden_layers(self, model):
        '''
        Build the hidden layers into the model using self.hidden_layers
        '''
        # Auto architecture infers the sizes of the hidden layers from the
        # size of the first layer: each successive hidden layer is half the
        # size of the previous one.
        # This enables hyperparameter optimization over network architecture.
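        # e.g. first_hidden_layer_size=256 with num_hidden_layers=3 (the
        # defaults) gives hidden layers of 256, 128 and 64 units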
        if self.auto_architecture:
            curr_layer_size = self.first_hidden_layer_size
            model.add(self.Dense(curr_layer_size,
                                 input_shape=(self.env_spec['state_dim'],),
                                 activation=self.hidden_layers_activation,
                                 init='lecun_uniform'))
            curr_layer_size = int(curr_layer_size / 2)

            for i in range(1, self.num_hidden_layers):
                model.add(self.Dense(curr_layer_size,
                                     init='lecun_uniform',
                                     activation=self.hidden_layers_activation))
                curr_layer_size = int(curr_layer_size / 2)
        else:
            model.add(self.Dense(self.hidden_layers[0],
                                 input_shape=(self.env_spec['state_dim'],),
                                 activation=self.hidden_layers_activation,
                                 init='lecun_uniform'))
            # inner hidden layer: no specification of input shape
            if len(self.hidden_layers) > 1:
                for i in range(1, len(self.hidden_layers)):
                    model.add(self.Dense(
                        self.hidden_layers[i],
                        init='lecun_uniform',
                        activation=self.hidden_layers_activation))

        return model

    def build_model(self):
        model = self.Sequential()
        self.build_hidden_layers(model)
        model.add(self.Dense(self.env_spec['action_dim'],
                             init='lecun_uniform',
                             activation=self.output_layer_activation))
        logger.info("Model summary")
        model.summary()
        self.model = model

        logger.info("Model built")
        return self.model

    def compile_model(self):
        self.model.compile(
            loss='mse',
            optimizer=self.optimizer.keras_optimizer)
        logger.info("Model compiled")

    def recompile_model(self, sys_vars):
        '''
        Option to change model optimizer settings.
        Currently only used for changing the learning rate.
        Compiling does not affect the model weights.
        '''
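        # e.g. with lr=0.1 and epi_change_lr=50, the learning rate drops to
        # 0.01 at the start of episode 50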
        if self.epi_change_lr is not None:
            if (sys_vars['epi'] == self.epi_change_lr and
                    sys_vars['t'] == 0):
                self.lr = self.lr / 10.0
                self.optimizer.change_optim_param(**{'lr': self.lr})
                self.model.compile(
                    loss='mse',
                    optimizer=self.optimizer.keras_optimizer)
                logger.info('Model recompiled with new settings: '
                            'Learning rate: {}'.format(self.lr))
        return self.model

    def update_n_epoch(self, sys_vars):
        '''
        Increase the number of training epochs as the session progresses,
        so that later episodes, backed by more experience in memory,
        get more training per step.
        Best so far: increment n_epoch every 2 episodes, up to final_n_epoch
        '''
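        # e.g. with the default n_epoch=5, the epoch count grows from 1 by
        # one every other episode until it reaches 5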
        if (self.n_epoch < self.final_n_epoch and
                sys_vars['t'] == 0 and
                sys_vars['epi'] % 2 == 0):
            self.n_epoch += 1
        return self.n_epoch

    def select_action(self, state):
        '''epsilon-greedy method'''
        return self.policy.select_action(state)

    def update(self, sys_vars):
        '''
        Agent update apart from training the Q function
        '''
        self.policy.update(sys_vars)
        self.update_n_epoch(sys_vars)
        self.recompile_model(sys_vars)

    def to_train(self, sys_vars):
        '''
        Return a boolean for whether the agent should train now:
        wait to gather n NEW experiences before training the model,
        but always train at the episode's last timestep or when done
        '''
        t = sys_vars['t']
        done = sys_vars['done']
        timestep_limit = self.env_spec['timestep_limit']
        return (t > 0) and bool(
            t % self.train_per_n_new_exp == 0 or
            t == (timestep_limit - 1) or
            done)

    def compute_Q_states(self, minibatch):
        # note the computed values below are batched in array
        Q_states = np.clip(self.model.predict(minibatch['states']),
                           -self.clip_val, self.clip_val)
        Q_next_states = np.clip(self.model.predict(minibatch['next_states']),
                                -self.clip_val, self.clip_val)
        Q_next_states_max = np.amax(Q_next_states, axis=1)
        return (Q_states, Q_next_states, Q_next_states_max)

    def compute_Q_targets(self, minibatch, Q_states, Q_next_states_max):
        # make future reward 0 if exp is terminal
        Q_targets_a = minibatch['rewards'] + self.gamma * \
            (1 - minibatch['terminals']) * Q_next_states_max
        # set batch Q_targets of a as above, the rest as is
        # minibatch['actions'] is one-hot encoded
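        # e.g. for a 2-action env with one-hot action [1, 0], only column 0
        # gets the Bellman target; column 1 keeps its current prediction,
        # so it contributes zero error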
        Q_targets = minibatch['actions'] * Q_targets_a[:, np.newaxis] + \
            (1 - minibatch['actions']) * Q_states
        return Q_targets

    def train_an_epoch(self):
        minibatch = self.memory.rand_minibatch(self.batch_size)
        (Q_states, _states, Q_next_states_max) = self.compute_Q_states(
            minibatch)
        Q_targets = self.compute_Q_targets(
            minibatch, Q_states, Q_next_states_max)

        loss = self.model.train_on_batch(minibatch['states'], Q_targets)

        errors = abs(np.sum(Q_states - Q_targets, axis=1))
        assert Q_targets.shape == (
            self.batch_size, self.env_spec['action_dim'])
        assert errors.shape == (self.batch_size, )
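        # hand the per-sample errors back to the memory; memories that sample
        # by error (e.g. prioritized replay) can use them, simpler memories
        # may ignore them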
        self.memory.update(errors)
        return loss

    def train(self, sys_vars):
        '''
        Training is for the Q function (NN) only;
        for other updates (e.g. the policy) see self.update().
        Steps 1, 2, 3, 4 of the algo.
        '''
        loss_total = 0
        for _epoch in range(self.n_epoch):
            loss = self.train_an_epoch()
            loss_total += loss
        avg_loss = loss_total / self.n_epoch
        sys_vars['loss'].append(avg_loss)
        return avg_loss

    def save(self, model_path, global_step=None):
        logger.info('Saving model checkpoint')
        self.model.save_weights(model_path)

    def restore(self, model_path):
        self.model.load_weights(model_path, by_name=False)
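

# Note: besides the constructor arguments, this agent expects `self.policy`,
# `self.memory` and `self.optimizer` to be attached by the surrounding
# experiment framework before use; they are referenced in methods such as
# select_action(), train_an_epoch() and compile_model().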