policy.py
#!/usr/bin/python3
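'''Convolutional policy network for Chinese chess (cchess / xiangqi), written
against the TensorFlow 1.x graph API. PolicyNetwork maps board feature planes
to a softmax distribution over util.BOARD_SIZE board points;
StatisticsCollector aggregates per-batch accuracy/cost into TensorBoard
summaries.'''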
import math
import os
import sys
import tensorflow as tf
import cchess
import util
EPSILON = 1e-35
class PolicyNetwork(object):
    def __init__(self, k=128, num_int_conv_layers=11, use_cpu=False):
        self.num_input_planes = util.PIECE_SIZE
        self.k = k
        self.num_int_conv_layers = num_int_conv_layers
        self.test_summary_writer = None
        self.training_summary_writer = None
        self.test_stats = StatisticsCollector()
        self.training_stats = StatisticsCollector()
        self.session = tf.Session()
        if use_cpu:
            with tf.device("/cpu:0"):
                self.set_up_network()
        else:
            self.set_up_network()
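
    # Network shape (a summary of the graph set_up_network builds below;
    # board sizes come from util):
    #   input : [batch, Y_SIZE, X_SIZE, PIECE_SIZE] feature planes
    #   init  : parallel 5x5 and 1x1 convolutions -> k planes, summed, ReLU
    #   body  : num_int_conv_layers residual blocks (two 3x3 convs + skip)
    #   head  : 1x1 convolution -> 1 plane, flattened to BOARD_SIZE logits
    #           plus bias, softmax over the board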
    def set_up_network(self):
        # A global_step variable allows epoch counts to persist across
        # multiple training sessions.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        RL_global_step = tf.Variable(0, name="RL_global_step", trainable=False)
        x = tf.placeholder(tf.float32, [None, util.Y_SIZE, util.X_SIZE, self.num_input_planes])
        y = tf.placeholder(tf.float32, shape=[None, util.BOARD_SIZE])
        # Whether this example should be positively or negatively reinforced:
        # 1 for positive, -1 for negative.
        reinforce_direction = tf.placeholder(tf.float32, shape=[])

        # Convenience functions for initializing weights and biases.
        def _weight_variable(shape, name):
            # If shape is [5, 5, 20, 32], then each of the 32 output planes
            # has 5 * 5 * 20 inputs.
            number_inputs_added = util.product(shape[:-1])
            stddev = 1 / math.sqrt(number_inputs_added)
            # See http://neuralnetworksanddeeplearning.com/chap3.html#weight_initialization
            return tf.Variable(tf.truncated_normal(shape, stddev=stddev), name=name)

        def _conv2d(x, W):
            return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")

        # The initial conv layer combines a 5x5 and a 1x1 convolution...
        W_conv_init55 = _weight_variable([5, 5, self.num_input_planes, self.k], name="W_conv_init55")
        W_conv_init11 = _weight_variable([1, 1, self.num_input_planes, self.k], name="W_conv_init11")
        h_conv_init = tf.nn.relu(_conv2d(x, W_conv_init55) + _conv2d(x, W_conv_init11), name="h_conv_init")

        # ...followed by a series of resnet-style 3x3 conv layers.
        W_conv_intermediate = []
        h_conv_intermediate = []
        _current_h_conv = h_conv_init
        for i in range(self.num_int_conv_layers):
            with tf.name_scope("layer" + str(i)):
                _resnet_weights1 = _weight_variable([3, 3, self.k, self.k], name="W_conv_resnet1")
                _resnet_weights2 = _weight_variable([3, 3, self.k, self.k], name="W_conv_resnet2")
                _int_conv = tf.nn.relu(_conv2d(_current_h_conv, _resnet_weights1), name="h_conv_intermediate")
                _output_conv = tf.nn.relu(
                    _current_h_conv +
                    _conv2d(_int_conv, _resnet_weights2),
                    name="h_conv")
                W_conv_intermediate.extend([_resnet_weights1, _resnet_weights2])
                h_conv_intermediate.append(_output_conv)
                _current_h_conv = _output_conv

        W_conv_final = _weight_variable([1, 1, self.k, 1], name="W_conv_final")
        b_conv_final = tf.Variable(tf.constant(0, shape=[util.BOARD_SIZE], dtype=tf.float32), name="b_conv_final")
        h_conv_final = _conv2d(h_conv_intermediate[-1], W_conv_final)
        logits = tf.reshape(h_conv_final, [-1, util.BOARD_SIZE]) + b_conv_final
        output = tf.nn.softmax(logits)

        log_likelihood_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))

        # AdamOptimizer is faster at the start but gets really spiky after 2-3 million steps:
        # train_step = tf.train.AdamOptimizer(1e-4).minimize(log_likelihood_cost, global_step=global_step)
        learning_rate = tf.train.exponential_decay(1e-2, global_step, 20000, 0.9)
        train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(log_likelihood_cost, global_step=global_step)
        was_correct = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(was_correct, tf.float32))

        # Scaling the cost by reinforce_direction flips the gradient sign, so
        # direction=-1 pushes probability mass away from the given moves.
        reinforce_step = tf.train.GradientDescentOptimizer(1e-2).minimize(
            log_likelihood_cost * reinforce_direction, global_step=RL_global_step)

        weight_summaries = tf.summary.merge([
            tf.summary.histogram(weight_var.name, weight_var)
            for weight_var in [W_conv_init55, W_conv_init11] + W_conv_intermediate + [W_conv_final, b_conv_final]],
            name="weight_summaries"
        )
        activation_summaries = tf.summary.merge([
            tf.summary.histogram(act_var.name, act_var)
            for act_var in [h_conv_init] + h_conv_intermediate + [h_conv_final]],
            name="activation_summaries"
        )
        saver = tf.train.Saver()

        # Save everything to self: every local name not prefixed with an
        # underscore (placeholders, ops, summaries) becomes an attribute.
        for name, thing in locals().items():
            if not name.startswith('_'):
                setattr(self, name, thing)
    def initialize_logging(self, tensorboard_logdir):
        self.test_summary_writer = tf.summary.FileWriter(os.path.join(tensorboard_logdir, "test"), self.session.graph)
        self.training_summary_writer = tf.summary.FileWriter(os.path.join(tensorboard_logdir, "training"), self.session.graph)
    def initialize_variables(self, restore_file=None):
        self.session.run(tf.global_variables_initializer())
        if restore_file is not None:
            try:
                self.saver.restore(self.session, restore_file)
            except Exception:
                # Partial restore: only restore the variables that actually
                # exist in the save file; initialize the rest normally.
                from tensorflow.python.framework import meta_graph
                meta_graph_def = meta_graph.read_meta_graph_file(restore_file + '.meta')
                stored_var_names = set(n.name
                                       for n in meta_graph_def.graph_def.node
                                       if n.op == 'VariableV2')
                print(stored_var_names)  # debug: which variables the save file holds
                var_list = [v for v in tf.global_variables()
                            if v.op.name in stored_var_names]
                # Initialize all of the variables...
                self.session.run(tf.global_variables_initializer())
                # ...then overwrite the ones present in the save file. By using
                # a throwaway saver, saved models are automatically "upgraded"
                # to the latest graph definition.
                throwaway_saver = tf.train.Saver(var_list=var_list)
                throwaway_saver.restore(self.session, restore_file)
    def get_global_step(self):
        return self.session.run(self.global_step)

    def save_variables(self, save_file):
        if save_file is not None:
            print("Saving checkpoint to %s" % save_file)
            self.saver.save(self.session, save_file, global_step=self.global_step)
    def train(self, training_data, save_file, num_epoch=30, batch_size=32):
        num_minibatches = training_data.data_size // batch_size
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=self.session, coord=coord)
        for epoch in range(num_epoch):
            for i in range(num_minibatches):
                batch_x, batch_y = training_data.get_batch(self.session)
                _, accuracy, cost, lr = self.session.run(
                    [self.train_step, self.accuracy, self.log_likelihood_cost, self.learning_rate],
                    feed_dict={self.x: batch_x, self.y: batch_y, self.reinforce_direction: 1})
                self.training_stats.report(accuracy, cost)
                if i % 1000 == 999:
                    print("epoch: %d, i: %d, global_step: %d, accuracy: %g, cost: %g, lr: %g" % (
                        epoch, i, self.get_global_step(), accuracy, cost, lr))
            self.save_variables(save_file)
        coord.request_stop()
        coord.join(threads)
        avg_accuracy, avg_cost, accuracy_summaries = self.training_stats.collect()
        global_step = self.get_global_step()
        print("Step %d training data accuracy: %g; cost: %g" % (global_step, avg_accuracy, avg_cost))
        if self.training_summary_writer is not None:
            # Reuse the last minibatch to record activation histograms.
            activation_summaries = self.session.run(
                self.activation_summaries,
                feed_dict={self.x: batch_x, self.y: batch_y, self.reinforce_direction: 1})
            self.training_summary_writer.add_summary(activation_summaries, global_step)
            self.training_summary_writer.add_summary(accuracy_summaries, global_step)
    def reinforce(self, dataset, direction=1, batch_size=32):
        num_minibatches = dataset.data_size // batch_size
        for i in range(num_minibatches):
            batch_x, batch_y = dataset.get_batch(batch_size)
            self.session.run(
                self.reinforce_step,
                feed_dict={self.x: batch_x, self.y: batch_y, self.reinforce_direction: direction})
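
    # A hypothetical self-play driver would call reinforce with direction=1 on
    # positions from games the policy won and direction=-1 on positions from
    # games it lost, pushing probability mass toward winning moves.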
    def run(self, board):
        '''Return the policy's move probabilities for one position, as a flat
        array of length util.BOARD_SIZE.'''
        probabilities = self.session.run(self.output, feed_dict={self.x: board})[0]
        # return probabilities.reshape([util.Y_SIZE, util.X_SIZE])
        return probabilities

    def run_many(self, positions):
        # NOTE: `features` is not imported at module level; this assumes a
        # `features` module providing bulk_extract_features is available.
        import features
        processed_positions = features.bulk_extract_features(positions)
        probabilities = self.session.run(self.output, feed_dict={self.x: processed_positions})
        return probabilities.reshape([-1, util.Y_SIZE, util.X_SIZE])
    def check_accuracy(self, test_data, batch_size=128):
        num_minibatches = test_data.data_size // batch_size
        weight_summaries = self.session.run(self.weight_summaries)
        for i in range(num_minibatches):
            batch_x, batch_y = test_data.get_batch(batch_size)
            accuracy, cost = self.session.run(
                [self.accuracy, self.log_likelihood_cost],
                feed_dict={self.x: batch_x, self.y: batch_y, self.reinforce_direction: 1})
            self.test_stats.report(accuracy, cost)
        avg_accuracy, avg_cost, accuracy_summaries = self.test_stats.collect()
        global_step = self.get_global_step()
        print("Step %d test data accuracy: %g; cost: %g" % (global_step, avg_accuracy, avg_cost))
        if self.test_summary_writer is not None:
            self.test_summary_writer.add_summary(weight_summaries, global_step)
            self.test_summary_writer.add_summary(accuracy_summaries, global_step)

    def close(self):
        self.session.close()
class StatisticsCollector(object):
    '''
    Accuracy and cost cannot be calculated with the full test dataset
    in one pass, so they must be computed in batches. Unfortunately,
    the built-in TF summary nodes cannot be told to aggregate multiple
    executions. Therefore, we aggregate the accuracy/cost ourselves at
    the Python level, and then shove it through the accuracy/cost summary
    nodes to generate the appropriate summary protobufs for writing.
    '''
    graph = tf.Graph()
    with tf.device("/cpu:0"), graph.as_default():
        accuracy = tf.placeholder(tf.float32, [])
        cost = tf.placeholder(tf.float32, [])
        accuracy_summary = tf.summary.scalar("accuracy", accuracy)
        cost_summary = tf.summary.scalar("log_likelihood_cost", cost)
        accuracy_summaries = tf.summary.merge([accuracy_summary, cost_summary], name="accuracy_summaries")
    session = tf.Session(graph=graph)

    def __init__(self):
        self.accuracies = []
        self.costs = []

    def report(self, accuracy, cost):
        self.accuracies.append(accuracy)
        self.costs.append(cost)

    def collect(self):
        avg_acc = sum(self.accuracies) / len(self.accuracies)
        avg_cost = sum(self.costs) / len(self.costs)
        self.accuracies = []
        self.costs = []
        summary = self.session.run(self.accuracy_summaries,
                                   feed_dict={self.accuracy: avg_acc, self.cost: avg_cost})
        return avg_acc, avg_cost, summary
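
# Example usage (a sketch; `training_data` and `test_data` are assumed to be
# dataset objects exposing `data_size` and `get_batch`, as the methods above
# expect -- they are not defined in this file):
#
#   network = PolicyNetwork(k=128, num_int_conv_layers=11)
#   network.initialize_variables(restore_file=None)
#   network.initialize_logging("logs")
#   network.train(training_data, "checkpoints/policy", num_epoch=30)
#   network.check_accuracy(test_data)
#   network.close()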