
Commit 78eb48e

committed
updated preprocessing docs
1 parent 0d07fad commit 78eb48e

File tree

9 files changed: +114 -322 lines changed


docs/agents_models.md

+11 -164
@@ -122,168 +122,15 @@ If you want to build your own agent, it should always inherit from
 from `MemoryAgent`, if it uses a batch replay that is emptied after each update,
 it should probably inherit from `BatchAgent`.

-Reinforcement learning agents often differ only by their respective
-value function. Extending the MemoryAgent is straightforward:
-
-```python
-# Full code at tensorforce/examples/simple_q_agent.py
-from tensorforce.agents import MemoryAgent
-
-class SimpleQAgent(MemoryAgent):
-    """
-    Simple agent extending MemoryAgent
-    """
-    name = 'SimpleQAgent'
-
-    model_ref = SimpleQModel
-
-    default_config = {
-        "memory_capacity": 1000,  # hold the last 1000 observations in the replay memory
-        "batch_size": 10,  # train model with batches of 10
-        "update_rate": 0.5,  # update parameters every other step
-        "update_repeat": 1,  # repeat update only one time
-        "min_replay_size": 0  # minimum size of replay memory before updating
-    }
-```
-
-`model_ref` points to the model class. A model should always inherit
-from `tensorforce.models.Model`.
-
-```python
-# Full code at tensorforce/examples/simple_q_agent.py
-import numpy as np
-import tensorflow as tf
-from tensorforce.models import Model
-from tensorforce.models.neural_networks import NeuralNetwork
-from tensorforce.config import Configuration
-
-class SimpleQModel(Model):
-    # Default config values
-    default_config = {
-        "alpha": 0.01,
-        "gamma": 0.99,
-        "network_layers": [{
-            "type": "linear",
-            "num_outputs": 16
-        }]
-    }
-
-    def __init__(self, config, scope):
-        """
-        Initialize model, build network and tensorflow ops
-
-        :param config: Config object or dict
-        :param scope: tensorflow scope name
-        """
-        super(SimpleQModel, self).__init__(config, scope)
-        self.action_count = self.config.actions
-
-        self.random = np.random.RandomState()
-
-        with tf.device(self.config.tf_device):
-            # Create state placeholder
-            self.state = tf.placeholder(tf.float32, [None] + list(self.config.state_shape), name="state")
-
-            # Create neural network
-            output_layer = [{"type": "linear", "num_outputs": self.action_count}]
-
-            define_network = NeuralNetwork.layered_network(self.config.network_layers + output_layer)
-            self.network = NeuralNetwork(define_network, [self.state], scope=self.scope + 'network')
-            self.network_out = self.network.output
-
-            # Create operations
-            self.create_ops()
-            self.init_op = tf.global_variables_initializer()
-
-            # Create optimizer
-            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.alpha)
-
-        self.session.run(self.init_op)
-
-    def get_action(self, state, episode=1):
-        """
-        Get action for a given state
-
-        :param state: ndarray containing the state
-        :param episode: number of episode (for epsilon decay and alike)
-        :return: action
-        """
-
-        # self.exploration is initialized in Model.__init__ and provides an API for different exploration methods,
-        # such as epsilon greedy.
-        epsilon = self.exploration(episode, self.total_states)  # returns a float
-
-        if self.random.random_sample() < epsilon:
-            action = self.random.randint(0, self.action_count)
-        else:
-            action = self.session.run(self.q_action, {
-                self.state: [state]
-            })[0]
-
-        self.total_states += 1
-        return action
-
-    def update(self, batch):
-        """
-        Update model parameters
-
-        :param batch: replay_memory batch
-        :return:
-        """
-        # Get Q values for next states
-        next_q = self.session.run(self.network_out, {
-            self.state: batch['next_states']
-        })
-
-        # Bellman equation Q = r + y * Q'
-        q_targets = batch['rewards'] + (1. - batch['terminals'].astype(float)) \
-            * self.config.gamma * np.max(next_q, axis=1)
-
-        self.session.run(self.optimize_op, {
-            self.state: batch['states'],
-            self.actions: batch['actions'],
-            self.q_targets: q_targets
-        })
-
-    def initialize(self):
-        """
-        Initialize model variables
-        :return:
-        """
-        self.session.run(self.init_op)
-
-    def create_ops(self):
-        """
-        Create tensorflow ops
-
-        :return:
-        """
-        with tf.name_scope(self.scope):
-            with tf.name_scope("predict"):
-                self.q_action = tf.argmax(self.network_out, axis=1)
-
-            with tf.name_scope("update"):
-                # These are the target Q values, i.e. the actual rewards plus the expected values of the next states
-                # (Bellman equation).
-                self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets')
-
-                # Actions that have been taken.
-                self.actions = tf.placeholder(tf.int32, [None], name='actions')
-
-                # We need the Q values of the current states to calculate the difference ("loss") between the
-                # expected values and the new values (q targets). Therefore we do a forward-pass
-                # and reduce the results to the actions that have been taken.
-
-                # One-hot tensor of the actions that have been taken.
-                actions_one_hot = tf.one_hot(self.actions, self.action_count, 1.0, 0.0, name='action_one_hot')
-
-                # Training output, reduced to the actions that have been taken.
-                q_values_actions_taken = tf.reduce_sum(self.network_out * actions_one_hot, axis=1,
-                                                       name='q_acted')
-
-                # The loss is the difference between the q_targets and the expected q values.
-                self.loss = tf.reduce_sum(tf.square(self.q_targets - q_values_actions_taken))
-                self.optimize_op = self.optimizer.minimize(self.loss)
-```
-
+We distinguish between agents and models. The `Agent` class handles the
+interaction with the environment, such as state preprocessing, exploration
+and observation of rewards. The `Model` class handles the mathematical
+operations, such as building the tensorflow operations, calculating the
+desired action and updating (i.e. optimizing) the model weights.
+
+To start building your own agent, please refer to
+[this blogpost](https://reinforce.io) to gain a deeper understanding of the
+internals of the TensorForce library. Afterwards, have a look at a sample
+implementation, e.g. the [DQN Agent](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/agents/dqn_agent.py)
+and [DQN Model](https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/dqn_model.py).
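For orientation, here is a rough sketch of the agent/model split described in the new paragraph. It is an illustrative outline only, modeled on the `SimpleQAgent`/`SimpleQModel` example removed above: `MemoryAgent`, `Model`, `model_ref`, `get_action` and `update` are taken from that example, while `MyAgent`, `MyModel` and the method bodies are hypothetical placeholders, not TensorForce's actual API.

```python
# Illustrative sketch of the agent/model split; names other than MemoryAgent,
# Model, model_ref, get_action and update are hypothetical placeholders.
from tensorforce.agents import MemoryAgent
from tensorforce.models import Model


class MyModel(Model):
    """Handles the math: builds tensorflow ops, picks actions, updates weights."""

    def get_action(self, state, episode=1):
        # Run a forward pass (plus exploration) and return an action.
        raise NotImplementedError

    def update(self, batch):
        # Compute targets from a replay batch and apply an optimization step.
        raise NotImplementedError


class MyAgent(MemoryAgent):
    """Handles environment interaction: preprocessing, exploration, replay memory."""

    name = 'MyAgent'
    model_ref = MyModel  # the agent delegates all computation to its model

    default_config = {
        "memory_capacity": 1000,  # replay memory size
        "batch_size": 10          # samples per update
    }
```

The point of the split is the division of labor: the agent owns the environment loop and replay memory, while the model owns the tensorflow graph and its updates.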
