@@ -122,168 +122,15 @@ If you want to build your own agent, it should always inherit from
from `MemoryAgent`, if it uses a batch replay that is emptied after each update,
it should probably inherit from `BatchAgent`.

- Reinforcement learning agents often differ only by their respective
- value function. Extending the MemoryAgent is straightforward:
-
- ```python
- # Full code at tensorforce/examples/simple_q_agent.py
- from tensorforce.agents import MemoryAgent
-
- class SimpleQAgent(MemoryAgent):
-     """
-     Simple agent extending MemoryAgent
-     """
-     name = 'SimpleQAgent'
-
-     model_ref = SimpleQModel
-
-     default_config = {
-         "memory_capacity": 1000,  # hold the last 1000 observations in the replay memory
-         "batch_size": 10,  # train model with batches of 10
-         "update_rate": 0.5,  # update parameters every other step
-         "update_repeat": 1,  # repeat update only one time
-         "min_replay_size": 0  # minimum size of replay memory before updating
-     }
- ```
-
- `model_ref` points to the model class. A model should always inherit
- from `tensorforce.models.Model`.
-
- ```python
- # Full code at tensorforce/examples/simple_q_agent.py
- import numpy as np
- import tensorflow as tf
- from tensorforce.models import Model
- from tensorforce.models.neural_networks import NeuralNetwork
- from tensorforce.config import Configuration
-
- class SimpleQModel(Model):
-     # Default config values
-     default_config = {
-         "alpha": 0.01,
-         "gamma": 0.99,
-         "network_layers": [{
-             "type": "linear",
-             "num_outputs": 16
-         }]
-     }
-
-     def __init__(self, config, scope):
-         """
-         Initialize model, build network and tensorflow ops
-
-         :param config: Config object or dict
-         :param scope: tensorflow scope name
-         """
-         super(SimpleQModel, self).__init__(config, scope)
-         self.action_count = self.config.actions
-
-         self.random = np.random.RandomState()
-
-         with tf.device(self.config.tf_device):
-             # Create state placeholder
-             self.state = tf.placeholder(tf.float32, [None] + list(self.config.state_shape), name="state")
-
-             # Create neural network
-             output_layer = [{"type": "linear", "num_outputs": self.action_count}]
-
-             define_network = NeuralNetwork.layered_network(self.config.network_layers + output_layer)
-             self.network = NeuralNetwork(define_network, [self.state], scope=self.scope + 'network')
-             self.network_out = self.network.output
-
-             # Create optimizer (needed by create_ops below to build the training op)
-             self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.alpha)
-
-             # Create operations
-             self.create_ops()
-             self.init_op = tf.global_variables_initializer()
-
-         self.session.run(self.init_op)
-
-     def get_action(self, state, episode=1):
-         """
-         Get action for a given state
-
-         :param state: ndarray containing the state
-         :param episode: number of episode (for epsilon decay and alike)
-         :return: action
-         """
-
-         # self.exploration is initialized in Model.__init__ and provides an API for different exploration methods,
-         # such as epsilon greedy.
-         epsilon = self.exploration(episode, self.total_states)  # returns a float
-
-         if self.random.random_sample() < epsilon:
-             action = self.random.randint(0, self.action_count)
-         else:
-             action = self.session.run(self.q_action, {
-                 self.state: [state]
-             })[0]
-
-         self.total_states += 1
-         return action
-
-     def update(self, batch):
-         """
-         Update model parameters
-
-         :param batch: replay_memory batch
-         :return:
-         """
-         # Get Q values for next states
-         next_q = self.session.run(self.network_out, {
-             self.state: batch['next_states']
-         })
-
-         # Bellman equation Q = r + y * Q'
-         q_targets = batch['rewards'] + (1. - batch['terminals'].astype(float)) \
-             * self.config.gamma * np.max(next_q, axis=1)
-
-         self.session.run(self.optimize_op, {
-             self.state: batch['states'],
-             self.actions: batch['actions'],
-             self.q_targets: q_targets
-         })
-
-     def initialize(self):
-         """
-         Initialize model variables
-         :return:
-         """
-         self.session.run(self.init_op)
-
-     def create_ops(self):
-         """
-         Create tensorflow ops
-
-         :return:
-         """
-         with tf.name_scope(self.scope):
-             with tf.name_scope("predict"):
-                 self.q_action = tf.argmax(self.network_out, axis=1)
-
-             with tf.name_scope("update"):
-                 # These are the target Q values, i.e. the actual rewards plus the expected values of the next states
-                 # (Bellman equation).
-                 self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets')
-
-                 # Actions that have been taken.
-                 self.actions = tf.placeholder(tf.int32, [None], name='actions')
-
-                 # We need the Q values of the current states to calculate the difference ("loss") between the
-                 # expected values and the new values (q targets). Therefore we do a forward-pass
-                 # and reduce the results to the actions that have been taken.
-
-                 # One-hot tensor of the actions that have been taken.
-                 actions_one_hot = tf.one_hot(self.actions, self.action_count, 1.0, 0.0, name='action_one_hot')
-
-                 # Training output, reduced to the actions that have been taken.
-                 q_values_actions_taken = tf.reduce_sum(self.network_out * actions_one_hot, axis=1,
-                                                        name='q_acted')
-
-                 # The loss is the difference between the q_targets and the expected q values.
-                 self.loss = tf.reduce_sum(tf.square(self.q_targets - q_values_actions_taken))
-                 self.optimize_op = self.optimizer.minimize(self.loss)
- ```
-
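As a quick sanity check of the Bellman target computed in `update()` above, the same expression can be evaluated on a tiny hand-made batch with plain NumPy; the numbers below are made up purely for illustration.

```python
import numpy as np

gamma = 0.99
rewards   = np.array([1.0, 0.0, 2.0])       # r
terminals = np.array([False, False, True])  # did the episode end after this step?
next_q    = np.array([[0.5, 1.5],           # Q(s', a) for each next state
                      [2.0, 0.1],
                      [3.0, 3.0]])

# Same expression as in SimpleQModel.update(): Q_target = r + (1 - terminal) * gamma * max_a Q(s', a)
q_targets = rewards + (1. - terminals.astype(float)) * gamma * np.max(next_q, axis=1)
print(q_targets)  # [2.485 1.98  2.   ]; a terminal transition keeps only its immediate reward
```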
+ We distinguish between agents and models. The `Agent` class handles the
+ interaction with the environment, such as state preprocessing, exploration,
+ and observation of rewards. The `Model` class handles the mathematical
+ operations, such as building the TensorFlow operations, calculating the
+ desired action, and updating (i.e. optimizing) the model weights.
+
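For intuition, here is a minimal, framework-agnostic sketch of that split. The names `ToyModel` and `ToyAgent` are made up for illustration and are not part of the TensorForce API; the real classes are referenced in the next paragraph.

```python
# A minimal sketch of the agent/model split described above.
# ToyModel and ToyAgent are hypothetical names, not TensorForce classes.
import numpy as np


class ToyModel:
    """Owns the parameters and the math: action selection and weight updates."""

    def __init__(self, state_size, num_actions):
        self.num_actions = num_actions
        self.weights = np.zeros((state_size, num_actions))  # e.g. a linear Q-function

    def get_action(self, state):
        # Greedy action with respect to the current value estimates
        return int(np.argmax(state @ self.weights))

    def update(self, batch):
        # Placeholder for an optimization step on the collected batch
        pass


class ToyAgent:
    """Handles interaction: state preprocessing, exploration, reward observation."""

    def __init__(self, model, epsilon=0.1):
        self.model = model
        self.epsilon = epsilon
        self.memory = []

    def act(self, state):
        state = np.asarray(state, dtype=np.float32)  # preprocessing
        if np.random.rand() < self.epsilon:          # exploration
            return int(np.random.randint(self.model.num_actions))
        return self.model.get_action(state)          # delegate the math to the model

    def observe(self, state, action, reward, terminal):
        self.memory.append((state, action, reward, terminal))
        if terminal:
            self.model.update(self.memory)           # the model performs the optimization
            self.memory.clear()


agent = ToyAgent(ToyModel(state_size=4, num_actions=2))
action = agent.act([0.1, 0.0, -0.2, 0.3])
agent.observe([0.1, 0.0, -0.2, 0.3], action, reward=1.0, terminal=True)
```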
+ To start building your own agent, please refer to
+ [this blogpost](https://reinforce.io) to gain a deeper understanding of the
+ internals of the TensorForce library. Afterwards, have a look at a sample
+ implementation, e.g. the [DQN Agent](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/agents/dqn_agent.py)
+ and [DQN Model](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/dqn_model.py).