#!/usr/bin/env python3
# Import modules
import time
from datetime import datetime
import numpy as np


class Experiment:
"""
Class Experiment will run a single experiment while logging data. An
experiment consists of a single run of agent-environment interaction.
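
    Example
    -------
    A minimal sketch of intended usage (not a prescribed API), assuming
    ``agent``, ``env``, and ``eval_env`` objects that implement the
    interfaces this class expects: ``reset``, ``eval``, ``train``,
    ``sample_action``, and ``update`` on the agent, and Gym-style
    ``reset``/``step`` on the environments::

        exp = Experiment(agent, env, eval_env, eval_episodes=10,
                         total_timesteps=100_000,
                         eval_interval_timesteps=5_000)
        exp.run()
        print(exp.info["eval_episode_rewards"])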
"""

    def __init__(self, agent, env, eval_env, eval_episodes,
                 total_timesteps, eval_interval_timesteps, max_episodes=-1):
"""
Constructor
Parameters
----------
agent : baseAgent.BaseAgent
The agent to run the experiment on
env : environment.Environment
The environment to use for the experiment
eval_episodes : int
The number of evaluation episodes to run when measuring offline
performance
total_timesteps : int
The maximum number of allowable timesteps per experiment
eval_interval_timesteps: int
The interval of timesteps at which an agent's performance will be
evaluated
state_bins : tuple of int
For the sequence of states used in each update, the number of bins
per dimension with which to bin the states.
min_state_values : array_like
The minimum value of states along each dimension, used to encode
states used in updates to count the number of times states are
used in each update.
max_state_values : array_like
The maximum value of states along each dimension, used to encode
states used in updates to count the number of times states are
used in each update.
action_bins : tuple of int
For the sequence of actions used in each update, the number of bins
per dimension with which to bin the actions.
min_action_values : array_like
The minimum value of actions along each dimension, used to encode
actions used in updates to count the number of times actions are
used in each update.
max_state_values : array_like
The maximum value of actions along each dimension, used to encode
actions used in updates to count the number of times actions are
used in each update.
count_interval : int
The interval of timesteps at which we will store the counts of
state or action bins seen during training or used in updates. At
each timestep, we determine which state/action bins were used in
an update or seen at the current timestep. These values are
accumulated so that the total number of times each bin was
seen/used is stored up to the current timestep. This parameter
controls the timestep interval at which these accumulated values
should be checkpointed.
max_episodes : int
The maximum number of episodes to run. If <= 0, then there is no
episode limit.
"""
self.agent = agent
self.env = env
self.eval_env = eval_env
self.eval_env.monitor = False
self.eval_episodes = eval_episodes
self.max_episodes = max_episodes
# Track the number of time steps
self.timesteps_since_last_eval = 0
self.eval_interval_timesteps = eval_interval_timesteps
self.timesteps_elapsed = 0
self.total_timesteps = total_timesteps
# Keep track of number of training episodes
self.train_episodes = 0
# Track the returns seen at each training episode
self.train_ep_return = []
# Track the steps per each training episode
self.train_ep_steps = []
# Track the steps at which evaluation occurs
self.timesteps_at_eval = []
# Track the returns seen at each eval episode
self.eval_ep_return = []
# Track the number of evaluation steps taken in each evaluation episode
self.eval_ep_steps = []
# Anything the experiment tracks
self.info = {}
# Track the total training and evaluation time
self.train_time = 0.0
self.eval_time = 0.0

    def run(self):
        """
        Runs the experiment.

        Rather than returning its results, this method stores them in the
        ``self.info`` dictionary once the run completes: the return and
        number of steps for each training episode, the return and number of
        steps for each offline evaluation episode, the timesteps at which
        each evaluation was run, the total training and evaluation time,
        and the total number of training episodes.
        """
# Count total run time
start_run = time.time()
print(f"Starting experiment at: {datetime.now()}")
# Evaluate once at the beginning
self.eval_time += self.eval()
self.timesteps_at_eval.append(self.timesteps_elapsed)
# Train
i = 0
        while (self.timesteps_elapsed < self.total_timesteps and
               (self.max_episodes <= 0 or
                self.train_episodes < self.max_episodes)):
# Run the training episode and save the relevant info
ep_reward, ep_steps, train_time = self.run_episode_train()
self.train_ep_return.append(ep_reward)
self.train_ep_steps.append(ep_steps)
self.train_time += train_time
print(f"=== Train ep: {i}, r: {ep_reward}, n_steps: {ep_steps}, " +
f"elapsed: {train_time}")
i += 1
# Evaluate once at the end
self.eval_time += self.eval()
self.timesteps_at_eval.append(self.timesteps_elapsed)
end_run = time.time()
print(f"End run at time {datetime.now()}")
print(f"Total time taken: {end_run - start_run}")
print(f"Training time: {self.train_time}")
print(f"Evaluation time: {self.eval_time}")
self.info["eval_episode_rewards"] = np.array(self.eval_ep_return)
self.info["eval_episode_steps"] = np.array(self.eval_ep_steps)
self.info["timesteps_at_eval"] = np.array(self.timesteps_at_eval)
self.info["train_episode_steps"] = np.array(self.train_ep_steps)
self.info["train_episode_rewards"] = np.array(self.train_ep_return)
self.info["train_time"] = self.train_time
self.info["eval_time"] = self.eval_time
self.info["total_train_episodes"] = self.train_episodes

    def run_episode_train(self):
"""
Runs a single training episode, saving the evaluation metrics in
the corresponding instance variables.
Returns
-------
float, int, float
The return for the episode, the number of steps in the episode,
and the total amount of training time for the episode
"""
# Reset the agent
self.agent.reset()
self.train_episodes += 1
        # Track the rewards seen during the training episode
        episode_rewards = []
start = time.time()
episode_return = 0.0
episode_steps = 0
state, _ = self.env.reset()
done = False
action = self.agent.sample_action(state)
while not done:
# Evaluate offline at the appropriate intervals
if self.timesteps_since_last_eval >= \
self.eval_interval_timesteps:
self.eval_time += self.eval()
self.timesteps_at_eval.append(self.timesteps_elapsed)
# Sample the next transition
next_state, reward, done, info = self.env.step(action)
episode_steps += 1
            episode_rewards.append(reward)
episode_return += reward
            # Compute the done mask used for bootstrapping: 0 if the episode
            # ended in a true terminal state, and 1 if the episode was cut
            # off (e.g. by the step limit) or is still in progress, in which
            # case the agent should still bootstrap from next_state
            if self.env.steps_per_episode <= 1:
                done_mask = 0
            elif (episode_steps <= self.env.steps_per_episode and done and
                    not info["steps_exceeded"]):
                done_mask = 0
            else:
                done_mask = 1
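            # For example (illustrative numbers only): with a step limit of
            # 100, reaching a terminal state at step 40 gives done_mask = 0,
            # while being cut off at step 100 with info["steps_exceeded"]
            # set gives done_mask = 1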
# Update agent
self.agent.update(state, action, reward, next_state, done_mask)
# Continue the episode if not done
if not done:
action = self.agent.sample_action(next_state)
state = next_state
# Keep track of the timesteps since we last evaluated so we know
# when to evaluate again
self.timesteps_since_last_eval += 1
            # Keep track of the total elapsed timesteps, since training
            # stops after a fixed number of timesteps
self.timesteps_elapsed += 1
# Stop if we are at the max allowable timesteps
if self.timesteps_elapsed >= self.total_timesteps:
break
end = time.time()
return episode_return, episode_steps, (end-start)

    def eval(self):
"""
Evaluates the agent's performance offline, for the appropriate number
of offline episodes as determined by the self.eval_episodes
instance variable. While evaluating, this function will populate the
appropriate instance variables with the evaluation data.
Returns
-------
float
The total amount of evaluation time
"""
self.timesteps_since_last_eval = 0
# Set the agent to evaluation mode
self.agent.eval()
# Save the episodic return and the number of steps per episode
temp_rewards_per_episode = []
episode_steps = []
eval_session_time = 0.0
# Evaluate offline
for i in range(self.eval_episodes):
eval_start_time = time.time()
episode_reward, num_steps = self.run_episode_eval()
eval_end_time = time.time()
# Save the evaluation data
temp_rewards_per_episode.append(episode_reward)
episode_steps.append(num_steps)
# Calculate time
eval_elapsed_time = eval_end_time - eval_start_time
eval_session_time += eval_elapsed_time
# Display the offline episodic return
print("=== EVAL ep: " + str(i) + ", r: " +
str(episode_reward) + ", n_steps: " + str(num_steps) +
", elapsed: " +
time.strftime("%H:%M:%S", time.gmtime(eval_elapsed_time)))
        # Save evaluation data; the session time is returned to the caller,
        # which adds it to self.eval_time (accumulating it here as well
        # would double-count evaluation time)
        self.eval_ep_return.append(temp_rewards_per_episode)
        self.eval_ep_steps.append(episode_steps)
# Return the agent to training mode
self.agent.train()
return eval_session_time

    def run_episode_eval(self):
        """
        Runs a single evaluation episode.

        Returns
        -------
        float, int
            The episodic return and the number of steps in the episode
        """
state, _ = self.eval_env.reset()
episode_return = 0.0
episode_steps = 0
done = False
action = self.agent.sample_action(state)
        while not done:
            next_state, reward, done, _ = self.eval_env.step(action)
            episode_return += reward
            # Count every step, including the final one, to match the
            # step counting in run_episode_train
            episode_steps += 1
            if not done:
                action = self.agent.sample_action(next_state)
                state = next_state
        return episode_return, episode_steps
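

# The block below is a minimal, illustrative smoke test, not part of the
# original experiment pipeline: RandomAgent and FixedLengthEnv are
# hypothetical stand-ins that implement just the interfaces Experiment
# expects (reset/eval/train/sample_action/update on the agent, and
# Gym-style reset/step plus steps_per_episode and monitor on the
# environment), so the class can be exercised without a real agent or
# environment.
if __name__ == "__main__":
    import random

    class RandomAgent:
        """Picks uniformly random actions and ignores all updates."""
        def __init__(self, n_actions):
            self.n_actions = n_actions

        def sample_action(self, state):
            return random.randrange(self.n_actions)

        def update(self, state, action, reward, next_state, done_mask):
            pass

        def reset(self):
            pass

        def eval(self):
            pass

        def train(self):
            pass

    class FixedLengthEnv:
        """Gives reward 1 per step and ends after steps_per_episode steps."""
        steps_per_episode = 10

        def __init__(self):
            self.monitor = False
            self.t = 0

        def reset(self):
            self.t = 0
            return 0, {}

        def step(self, action):
            self.t += 1
            done = self.t >= self.steps_per_episode
            return self.t, 1.0, done, {"steps_exceeded": done}

    exp = Experiment(RandomAgent(2), FixedLengthEnv(), FixedLengthEnv(),
                     eval_episodes=2, total_timesteps=100,
                     eval_interval_timesteps=50)
    exp.run()
    print(exp.info["train_episode_rewards"])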