# monitor.py
from collections import deque
import sys
import math
import numpy as np


def interact(env, agent, num_episodes=20000, window=100, show_progress=False, endline=""):
    """ Monitor agent's performance.

    Params
    ======
    - env: instance of OpenAI Gym's Taxi-v1 environment
    - agent: instance of class Agent (see Agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards
    - show_progress: how often (in episodes) to print progress; False disables it
    - endline: character to end each progress line with (usually a newline or nothing)

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        while True:
            # agent selects an action
            action = agent.select_action(state)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if i_episode >= window:
            # get average reward over the most recent window of episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        if show_progress and not i_episode % show_progress:
            print("\rEpisode {}/{} || epsilon={:.7f}, Best average reward {}".format(
                i_episode, num_episodes, agent.epsilon, best_avg_reward), end=endline)
            sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
    return avg_rewards, best_avg_reward
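

# Example usage: a minimal sketch of how interact() is typically driven.
# Assumptions (not confirmed by this file): the Agent class lives in Agent.py
# as referenced in the docstring and takes no constructor arguments, and the
# installed gym release provides a Taxi environment (the exact id, e.g.
# 'Taxi-v3', depends on the gym version).
if __name__ == "__main__":
    import gym
    from Agent import Agent  # assumed module/class name

    env = gym.make("Taxi-v3")  # assumed environment id
    agent = Agent()
    avg_rewards, best_avg_reward = interact(
        env, agent,
        num_episodes=20000,
        window=100,
        show_progress=1000,  # print a progress line every 1000 episodes
        endline="",
    )
    print("\nBest average reward: {}".format(best_avg_reward))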