Dev #65

Open
wants to merge 20 commits into base: master

Commits (20)

65d5f18
Moving steps/sec measurement to experiment class
jleni Aug 8, 2015
b966ff8
Moving logging functionality to separate methods
jleni Aug 17, 2015
932d5b4
Measuring the whole episode (including logging, etc.)
jleni Aug 17, 2015
8fc40a4
Making plotting scripts executable
jleni Aug 17, 2015
dec12ec
simplify launcher and generalize constructors through parameters
jleni Aug 17, 2015
79824c2
separate step method functionality (testing/learning)
jleni Aug 17, 2015
4cdc3b6
Keeping line-length below 80 + fixing warnings
jleni Aug 19, 2015
b8908e5
Merge pull request #42 from jleni/dev_refactoring
spragunr Sep 1, 2015
bc85b2d
Simplify and clean up default parameters and command line processing.
spragunr Sep 3, 2015
a810221
Remove unused import in run_nature.py and run_nips.py
spragunr Sep 18, 2015
05207d7
pushing json param dumping to correct branch
omnivert Sep 21, 2015
3540209
dump json params at each run of experiment
omnivert Sep 21, 2015
525bd5b
params now dumped as a dictionary to json
omnivert Sep 21, 2015
3074ef8
removed inline prints
omnivert Sep 21, 2015
3621120
replaced time import with datetime; time import does not support micr…
omnivert Sep 25, 2015
947f6f2
Merge branch 'master' into dev
spragunr Sep 27, 2015
f7d42da
Merge remote-tracking branch 'upstream/dev' into dev
omnivert Sep 28, 2015
44b2c20
Merge branch 'master' into dev
spragunr Sep 29, 2015
121e3a6
network is pickled again
omnivert Oct 5, 2015
9a6b611
Merge pull request #44 from omnivert/dev
spragunr Oct 5, 2015
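
A note on commit 3621120 above: time.strftime() has no directive for sub-second precision, while datetime's strftime() supports %f (microseconds), which is presumably why that commit switches the timestamp used for the export directory name. The short sketch below only illustrates that difference, reusing the two format strings that appear in the diff; it is not code from this pull request.

    import time
    import datetime

    # Old naming scheme: minute resolution only, e.g. "_10-05-14-32_"
    coarse = time.strftime("_%m-%d-%H-%M_", time.gmtime())

    # New naming scheme: seconds and microseconds included,
    # e.g. "_10-05-1432_07_123456"
    fine = datetime.datetime.now().strftime("_%m-%d-%H%M_%S_%f")
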
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,2 +1,4 @@
*.pyc
roms
utils
data
217 changes: 129 additions & 88 deletions deep_q_rl/ale_agent.py
100755 → 100644
@@ -8,60 +8,52 @@

import os
import cPickle
import time
import datetime
import logging
import json

import numpy as np

from ale_agent_base import AgentBase
import ale_data_set

import sys
sys.setrecursionlimit(10000)

class NeuralAgent(object):

def __init__(self, q_network, epsilon_start, epsilon_min,
epsilon_decay, replay_memory_size, exp_pref,
replay_start_size, update_frequency, rng):

self.network = q_network
self.epsilon_start = epsilon_start
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.replay_memory_size = replay_memory_size
self.exp_pref = exp_pref
self.replay_start_size = replay_start_size
self.update_frequency = update_frequency
self.rng = rng
sys.setrecursionlimit(10000)

self.phi_length = self.network.num_frames
self.image_width = self.network.input_width
self.image_height = self.network.input_height

# CREATE A FOLDER TO HOLD RESULTS
time_str = time.strftime("_%m-%d-%H-%M_", time.gmtime())
self.exp_dir = self.exp_pref + time_str + \
"{}".format(self.network.lr).replace(".", "p") + "_" \
+ "{}".format(self.network.discount).replace(".", "p")
class NeuralAgent(AgentBase):
def __init__(self, params):
super(NeuralAgent, self).__init__(params)

try:
os.stat(self.exp_dir)
except OSError:
os.makedirs(self.exp_dir)
self.params = params
self.network = None
self.action_set = None
self.num_actions = -1

self.num_actions = self.network.num_actions
self.epsilon_start = self.params.epsilon_start
self.epsilon_min = self.params.epsilon_min
self.epsilon_decay = self.params.epsilon_decay
self.replay_memory_size = self.params.replay_memory_size
self.exp_pref = self.params.experiment_prefix
self.replay_start_size = self.params.replay_start_size
self.update_frequency = self.params.update_frequency
self.phi_length = self.params.phi_length
self.image_width = self.params.resized_width
self.image_height = self.params.resized_height

self.rng = self.params.rng

self.data_set = ale_data_set.DataSet(width=self.image_width,
height=self.image_height,
rng=rng,
rng=self.rng,
max_steps=self.replay_memory_size,
phi_length=self.phi_length)

# just needs to be big enough to create phi's
self.test_data_set = ale_data_set.DataSet(width=self.image_width,
height=self.image_height,
rng=rng,
rng=self.rng,
max_steps=self.phi_length * 2,
phi_length=self.phi_length)
self.epsilon = self.epsilon_start
@@ -73,12 +65,10 @@ def __init__(self, q_network, epsilon_start, epsilon_min,

self.testing = False

self._open_results_file()
self._open_learning_file()

self.current_epoch = 0
self.episode_counter = 0
self.batch_counter = 0

self.total_reward = 0
self.holdout_data = None

# In order to add an element to the data set we need the
@@ -87,22 +77,68 @@ def __init__(self, q_network, epsilon_start, epsilon_min,
self.last_img = None
self.last_action = None

self.export_dir = self._create_export_dir()
self._open_params_file()
self._open_results_file()
self._open_learning_file()

def initialize(self, action_set):
self.action_set = action_set
self.num_actions = len(self.action_set)

if self.params.qlearner_type is None:
raise Exception("The QLearner/network type has not been specified")

if self.params.nn_file is None:
self.network = self.params.qlearner_type(
num_actions=self.num_actions,
input_width=self.params.resized_width,
input_height=self.params.resized_height,
num_frames=self.params.phi_length,
params=self.params)
else:
handle = open(self.params.nn_file, 'r')
self.network = cPickle.load(handle)

# region Dumping/Logging
def _create_export_dir(self):
# CREATE A FOLDER TO HOLD RESULTS
# this is now just exp_pref + timestamp. params are in params.json
time_str = datetime.datetime.now().strftime("_%m-%d-%H%M_%S_%f")
export_dir = self.exp_pref + time_str
try:
os.stat(export_dir)
except OSError:
os.makedirs(export_dir)

return export_dir

def _open_params_file(self):
self.params_file = open(self.export_dir + '/params.json', 'w')
param_dict = {k:v for k, v in self.params.__dict__.items() \
if "__" not in k \
and isinstance(v, (int, float, str, bool))}
json.dump(param_dict, self.params_file, indent=4)
self.params_file.close()

def _open_results_file(self):
logging.info("OPENING " + self.exp_dir + '/results.csv')
self.results_file = open(self.exp_dir + '/results.csv', 'w', 0)
self.results_file.write(\
logging.info("OPENING " + self.export_dir + '/results.csv')
self.results_file = open(self.export_dir + '/results.csv', 'w', 0)
self.results_file.write(
'epoch,num_episodes,total_reward,reward_per_epoch,mean_q\n')
self.results_file.flush()

def _open_learning_file(self):
self.learning_file = open(self.exp_dir + '/learning.csv', 'w', 0)
self.learning_file = open(self.export_dir + '/learning.csv', 'w', 0)
self.learning_file.write('mean_loss,epsilon\n')
self.learning_file.flush()

def _update_results_file(self, epoch, num_episodes, holdout_sum):
out = "{},{},{},{},{}\n".format(epoch, num_episodes, self.total_reward,
out = "{},{},{},{},{}\n".format(epoch, num_episodes,
self.total_reward,
self.total_reward / float(num_episodes),
holdout_sum)

self.results_file.write(out)
self.results_file.flush()

@@ -112,6 +148,16 @@ def _update_learning_file(self):
self.learning_file.write(out)
self.learning_file.flush()

def _persist_network(self, network_filename):
full_filename = os.path.join(self.export_dir, network_filename)
with open(full_filename, 'w') as net_file:
cPickle.dump(self.network, net_file, -1)

# endregion

def start_epoch(self, epoch):
self.current_epoch = epoch

def start_episode(self, observation):
"""
This method is called once at the beginning of each episode.
@@ -132,7 +178,6 @@ def start_episode(self, observation):
# We report the mean loss for every epoch.
self.loss_averages = []

self.start_time = time.time()
return_action = self.rng.randint(0, self.num_actions)

self.last_action = return_action
@@ -141,19 +186,47 @@ def start_episode(self, observation):

return return_action


def _show_phis(self, phi1, phi2):
import matplotlib.pyplot as plt
for p in range(self.phi_length):
plt.subplot(2, self.phi_length, p+1)
plt.subplot(2, self.phi_length, p + 1)
plt.imshow(phi1[p, :, :], interpolation='none', cmap="gray")
plt.grid(color='r', linestyle='-', linewidth=1)
for p in range(self.phi_length):
plt.subplot(2, self.phi_length, p+5)
plt.subplot(2, self.phi_length, p + 5)
plt.imshow(phi2[p, :, :], interpolation='none', cmap="gray")
plt.grid(color='r', linestyle='-', linewidth=1)
plt.show()

def _step_testing(self, reward, observation):
action = self._choose_action(data_set=self.test_data_set,
epsilon=.05,
cur_img=observation,
reward=np.clip(reward, -1, 1))
return action

def _step_training(self, reward, observation):
if len(self.data_set) > self.replay_start_size:
self.epsilon = max(self.epsilon_min,
self.epsilon - self.epsilon_rate)

action = self._choose_action(data_set=self.data_set,
epsilon=self.epsilon,
cur_img=observation,
reward=np.clip(reward, -1, 1))

if self.step_counter % self.update_frequency == 0:
loss = self._do_training()
self.batch_counter += 1
self.loss_averages.append(loss)

else: # Still gathering initial random data...
action = self._choose_action(data_set=self.data_set,
epsilon=self.epsilon,
cur_img=observation,
reward=np.clip(reward, -1, 1))
return action

def step(self, reward, observation):
"""
This method is called each time step.
@@ -166,37 +239,13 @@ def step(self, reward, observation):
An integer action.

"""

self.step_counter += 1

#TESTING---------------------------
self.episode_reward += reward
if self.testing:
self.episode_reward += reward
action = self._choose_action(self.test_data_set, .05,
observation, np.clip(reward, -1, 1))

#NOT TESTING---------------------------
action = self._step_testing(reward, observation)
else:
action = self._step_training(reward, observation)

if len(self.data_set) > self.replay_start_size:
self.epsilon = max(self.epsilon_min,
self.epsilon - self.epsilon_rate)

action = self._choose_action(self.data_set, self.epsilon,
observation,
np.clip(reward, -1, 1))

if self.step_counter % self.update_frequency == 0:
loss = self._do_training()
self.batch_counter += 1
self.loss_averages.append(loss)

else: # Still gathering initial random data...
action = self._choose_action(self.data_set, self.epsilon,
observation,
np.clip(reward, -1, 1))


self.step_counter += 1
self.last_action = action
self.last_img = observation

@@ -224,12 +273,11 @@ def _do_training(self):
differently.
"""
states, actions, rewards, next_states, terminals = \
self.data_set.random_batch(
self.network.batch_size)
self.data_set.random_batch(
self.network.batch_size)
return self.network.train(states, actions, rewards,
next_states, terminals)


def end_episode(self, reward, terminal=True):
"""
This function is called once at the end of an episode.
@@ -244,7 +292,6 @@ def end_episode(self, reward, terminal=True):

self.episode_reward += reward
self.step_counter += 1
total_time = time.time() - self.start_time

if self.testing:
# If we run out of time, only count the last episode if
@@ -260,22 +307,16 @@ def end_episode(self, reward, terminal=True):
np.clip(reward, -1, 1),
True)

logging.info("steps/second: {:.2f}".format(\
self.step_counter/total_time))

if self.batch_counter > 0:
self._update_learning_file()
logging.info("average loss: {:.4f}".format(\
np.mean(self.loss_averages)))

logging.info(
"average loss: {:.4f}".format(np.mean(self.loss_averages)))

def finish_epoch(self, epoch):
net_file = open(self.exp_dir + '/network_file_' + str(epoch) + \
'.pkl', 'w')
cPickle.dump(self.network, net_file, -1)
net_file.close()
network_filename = 'network_file_' + str(epoch) + '.pkl'
self._persist_network(network_filename)

def start_testing(self):
def start_testing(self, epoch):
self.testing = True
self.total_reward = 0
self.episode_counter = 0
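
For context, a hedged usage sketch of how the artifacts written by the refactored agent could be read back after a run; this is not part of the diff, and the directory and epoch number below are hypothetical. _open_params_file() writes a plain JSON dictionary of the primitive-valued parameters, and _persist_network() pickles the network once per epoch under names of the form network_file_<epoch>.pkl.

    import json
    import cPickle

    # Hypothetical export directory, as produced by _create_export_dir()
    # (experiment prefix plus the datetime-based timestamp)
    export_dir = "my_experiment_10-05-1432_07_123456"

    # params.json holds only the int/float/str/bool entries of the params object
    with open(export_dir + '/params.json') as params_file:
        params = json.load(params_file)

    # One pickled network, dumped with cPickle protocol -1; the epoch number
    # here is illustrative
    with open(export_dir + '/network_file_1.pkl') as net_file:
        network = cPickle.load(net_file)
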
Expand Down
Loading