diff --git a/.gitignore b/.gitignore index 0758f65..31f5d66 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.pyc roms +utils +data diff --git a/deep_q_rl/ale_agent.py b/deep_q_rl/ale_agent.py old mode 100755 new mode 100644 index 5ec16b4..c96b67e --- a/deep_q_rl/ale_agent.py +++ b/deep_q_rl/ale_agent.py @@ -8,60 +8,52 @@ import os import cPickle -import time +import datetime import logging +import json import numpy as np +from ale_agent_base import AgentBase import ale_data_set import sys -sys.setrecursionlimit(10000) - -class NeuralAgent(object): - - def __init__(self, q_network, epsilon_start, epsilon_min, - epsilon_decay, replay_memory_size, exp_pref, - replay_start_size, update_frequency, rng): - self.network = q_network - self.epsilon_start = epsilon_start - self.epsilon_min = epsilon_min - self.epsilon_decay = epsilon_decay - self.replay_memory_size = replay_memory_size - self.exp_pref = exp_pref - self.replay_start_size = replay_start_size - self.update_frequency = update_frequency - self.rng = rng +sys.setrecursionlimit(10000) - self.phi_length = self.network.num_frames - self.image_width = self.network.input_width - self.image_height = self.network.input_height - # CREATE A FOLDER TO HOLD RESULTS - time_str = time.strftime("_%m-%d-%H-%M_", time.gmtime()) - self.exp_dir = self.exp_pref + time_str + \ - "{}".format(self.network.lr).replace(".", "p") + "_" \ - + "{}".format(self.network.discount).replace(".", "p") +class NeuralAgent(AgentBase): + def __init__(self, params): + super(NeuralAgent, self).__init__(params) - try: - os.stat(self.exp_dir) - except OSError: - os.makedirs(self.exp_dir) + self.params = params + self.network = None + self.action_set = None + self.num_actions = -1 - self.num_actions = self.network.num_actions + self.epsilon_start = self.params.epsilon_start + self.epsilon_min = self.params.epsilon_min + self.epsilon_decay = self.params.epsilon_decay + self.replay_memory_size = self.params.replay_memory_size + self.exp_pref = 
self.params.experiment_prefix + self.replay_start_size = self.params.replay_start_size + self.update_frequency = self.params.update_frequency + self.phi_length = self.params.phi_length + self.image_width = self.params.resized_width + self.image_height = self.params.resized_height + self.rng = self.params.rng self.data_set = ale_data_set.DataSet(width=self.image_width, height=self.image_height, - rng=rng, + rng=self.rng, max_steps=self.replay_memory_size, phi_length=self.phi_length) # just needs to be big enough to create phi's self.test_data_set = ale_data_set.DataSet(width=self.image_width, height=self.image_height, - rng=rng, + rng=self.rng, max_steps=self.phi_length * 2, phi_length=self.phi_length) self.epsilon = self.epsilon_start @@ -73,12 +65,10 @@ def __init__(self, q_network, epsilon_start, epsilon_min, self.testing = False - self._open_results_file() - self._open_learning_file() - + self.current_epoch = 0 self.episode_counter = 0 self.batch_counter = 0 - + self.total_reward = 0 self.holdout_data = None # In order to add an element to the data set we need the @@ -87,22 +77,68 @@ def __init__(self, q_network, epsilon_start, epsilon_min, self.last_img = None self.last_action = None + self.export_dir = self._create_export_dir() + self._open_params_file() + self._open_results_file() + self._open_learning_file() + + def initialize(self, action_set): + self.action_set = action_set + self.num_actions = len(self.action_set) + + if self.params.qlearner_type is None: + raise Exception("The QLearner/network type has not been specified") + + if self.params.nn_file is None: + self.network = self.params.qlearner_type( + num_actions=self.num_actions, + input_width=self.params.resized_width, + input_height=self.params.resized_height, + num_frames=self.params.phi_length, + params=self.params) + else: + handle = open(self.params.nn_file, 'r') + self.network = cPickle.load(handle) + + # region Dumping/Logging + def _create_export_dir(self): + # CREATE A FOLDER TO HOLD RESULTS 
+ # this is now just exp_pref + timestamp. params are in params.json + time_str = datetime.datetime.now().strftime("_%m-%d-%H%M_%S_%f") + export_dir = self.exp_pref + time_str + try: + os.stat(export_dir) + except OSError: + os.makedirs(export_dir) + + return export_dir + + def _open_params_file(self): + self.params_file = open(self.export_dir + '/params.json', 'w') + param_dict = {k:v for k, v in self.params.__dict__.items() \ + if "__" not in k \ + and isinstance(v, (int, float, str, bool))} + json.dump(param_dict, self.params_file, indent=4) + self.params_file.close() + def _open_results_file(self): - logging.info("OPENING " + self.exp_dir + '/results.csv') - self.results_file = open(self.exp_dir + '/results.csv', 'w', 0) - self.results_file.write(\ + logging.info("OPENING " + self.export_dir + '/results.csv') + self.results_file = open(self.export_dir + '/results.csv', 'w', 0) + self.results_file.write( 'epoch,num_episodes,total_reward,reward_per_epoch,mean_q\n') self.results_file.flush() def _open_learning_file(self): - self.learning_file = open(self.exp_dir + '/learning.csv', 'w', 0) + self.learning_file = open(self.export_dir + '/learning.csv', 'w', 0) self.learning_file.write('mean_loss,epsilon\n') self.learning_file.flush() def _update_results_file(self, epoch, num_episodes, holdout_sum): - out = "{},{},{},{},{}\n".format(epoch, num_episodes, self.total_reward, + out = "{},{},{},{},{}\n".format(epoch, num_episodes, + self.total_reward, self.total_reward / float(num_episodes), holdout_sum) + self.results_file.write(out) self.results_file.flush() @@ -112,6 +148,16 @@ def _update_learning_file(self): self.learning_file.write(out) self.learning_file.flush() + def _persist_network(self, network_filename): + full_filename = os.path.join(self.export_dir, network_filename) + with open(full_filename, 'w') as net_file: + cPickle.dump(self.network, net_file, -1) + + # endregion + + def start_epoch(self, epoch): + self.current_epoch = epoch + def start_episode(self, 
observation): """ This method is called once at the beginning of each episode. @@ -132,7 +178,6 @@ def start_episode(self, observation): # We report the mean loss for every epoch. self.loss_averages = [] - self.start_time = time.time() return_action = self.rng.randint(0, self.num_actions) self.last_action = return_action @@ -141,19 +186,47 @@ def start_episode(self, observation): return return_action - def _show_phis(self, phi1, phi2): import matplotlib.pyplot as plt for p in range(self.phi_length): - plt.subplot(2, self.phi_length, p+1) + plt.subplot(2, self.phi_length, p + 1) plt.imshow(phi1[p, :, :], interpolation='none', cmap="gray") plt.grid(color='r', linestyle='-', linewidth=1) for p in range(self.phi_length): - plt.subplot(2, self.phi_length, p+5) + plt.subplot(2, self.phi_length, p + 5) plt.imshow(phi2[p, :, :], interpolation='none', cmap="gray") plt.grid(color='r', linestyle='-', linewidth=1) plt.show() + def _step_testing(self, reward, observation): + action = self._choose_action(data_set=self.test_data_set, + epsilon=.05, + cur_img=observation, + reward=np.clip(reward, -1, 1)) + return action + + def _step_training(self, reward, observation): + if len(self.data_set) > self.replay_start_size: + self.epsilon = max(self.epsilon_min, + self.epsilon - self.epsilon_rate) + + action = self._choose_action(data_set=self.data_set, + epsilon=self.epsilon, + cur_img=observation, + reward=np.clip(reward, -1, 1)) + + if self.step_counter % self.update_frequency == 0: + loss = self._do_training() + self.batch_counter += 1 + self.loss_averages.append(loss) + + else: # Still gathering initial random data... + action = self._choose_action(data_set=self.data_set, + epsilon=self.epsilon, + cur_img=observation, + reward=np.clip(reward, -1, 1)) + return action + def step(self, reward, observation): """ This method is called each time step. @@ -166,37 +239,13 @@ def step(self, reward, observation): An integer action. 
""" - - self.step_counter += 1 - - #TESTING--------------------------- + self.episode_reward += reward if self.testing: - self.episode_reward += reward - action = self._choose_action(self.test_data_set, .05, - observation, np.clip(reward, -1, 1)) - - #NOT TESTING--------------------------- + action = self._step_testing(reward, observation) else: + action = self._step_training(reward, observation) - if len(self.data_set) > self.replay_start_size: - self.epsilon = max(self.epsilon_min, - self.epsilon - self.epsilon_rate) - - action = self._choose_action(self.data_set, self.epsilon, - observation, - np.clip(reward, -1, 1)) - - if self.step_counter % self.update_frequency == 0: - loss = self._do_training() - self.batch_counter += 1 - self.loss_averages.append(loss) - - else: # Still gathering initial random data... - action = self._choose_action(self.data_set, self.epsilon, - observation, - np.clip(reward, -1, 1)) - - + self.step_counter += 1 self.last_action = action self.last_img = observation @@ -224,12 +273,11 @@ def _do_training(self): differently. """ states, actions, rewards, next_states, terminals = \ - self.data_set.random_batch( - self.network.batch_size) + self.data_set.random_batch( + self.network.batch_size) return self.network.train(states, actions, rewards, next_states, terminals) - def end_episode(self, reward, terminal=True): """ This function is called once at the end of an episode. 
@@ -244,7 +292,6 @@ def end_episode(self, reward, terminal=True): self.episode_reward += reward self.step_counter += 1 - total_time = time.time() - self.start_time if self.testing: # If we run out of time, only count the last episode if @@ -260,22 +307,16 @@ def end_episode(self, reward, terminal=True): np.clip(reward, -1, 1), True) - logging.info("steps/second: {:.2f}".format(\ - self.step_counter/total_time)) - if self.batch_counter > 0: self._update_learning_file() - logging.info("average loss: {:.4f}".format(\ - np.mean(self.loss_averages))) - + logging.info( + "average loss: {:.4f}".format(np.mean(self.loss_averages))) def finish_epoch(self, epoch): - net_file = open(self.exp_dir + '/network_file_' + str(epoch) + \ - '.pkl', 'w') - cPickle.dump(self.network, net_file, -1) - net_file.close() + network_filename = 'network_file_' + str(epoch) + '.pkl' + self._persist_network(network_filename) - def start_testing(self): + def start_testing(self, epoch): self.testing = True self.total_reward = 0 self.episode_counter = 0 diff --git a/deep_q_rl/ale_agent_base.py b/deep_q_rl/ale_agent_base.py new file mode 100644 index 0000000..a492511 --- /dev/null +++ b/deep_q_rl/ale_agent_base.py @@ -0,0 +1,66 @@ +from abc import ABCMeta, abstractmethod + + +class AgentBase(object): + __metaclass__ = ABCMeta + + def __init__(self, params): + pass + + @abstractmethod + def initialize(self, action_set): + pass + + @abstractmethod + def start_episode(self, observation): + """ + This method is called once at the beginning of each episode. + No reward is provided, because reward is only available after + an action has been taken. + Arguments: + observation - height x width numpy array + Returns: + An integer action + """ + pass + + @abstractmethod + def step(self, reward, observation): + """ + This method is called each time step. + Arguments: + reward - Real valued reward. + observation - A height x width numpy array + Returns: + An integer action. 
+ """ + pass + + @abstractmethod + def end_episode(self, reward, terminal): + """ + This function is called once at the end of an episode. + Arguments: + reward - Real valued reward. + terminal - Whether the episode ended intrinsically + (ie we didn't run out of steps) + Returns: + None + """ + pass + + @abstractmethod + def start_epoch(self, epoch): + pass + + @abstractmethod + def finish_epoch(self, epoch): + pass + + @abstractmethod + def start_testing(self, epoch): + pass + + @abstractmethod + def finish_testing(self, epoch): + pass \ No newline at end of file diff --git a/deep_q_rl/ale_agent_random.py b/deep_q_rl/ale_agent_random.py new file mode 100644 index 0000000..d9677f2 --- /dev/null +++ b/deep_q_rl/ale_agent_random.py @@ -0,0 +1,32 @@ +import random +from ale_agent_base import AgentBase + + +class AgentRandom(AgentBase): + def __init__(self, params): + super(AgentRandom, self).__init__(params) + self.action_set = None + + def initialize(self, action_set): + self.action_set = action_set + + def start_episode(self, observation): + return self.step(None, None) + + def step(self, reward, observation): + return random.randint(0, len(self.action_set) - 1) + + def end_episode(self, reward, terminal): + pass + + def start_epoch(self, epoch): + pass + + def finish_epoch(self, epoch): + pass + + def start_testing(self, epoch): + pass + + def finish_testing(self, epoch): + pass diff --git a/deep_q_rl/ale_data_set.py b/deep_q_rl/ale_data_set.py index ba1332f..f1eda36 100644 --- a/deep_q_rl/ale_data_set.py +++ b/deep_q_rl/ale_data_set.py @@ -8,6 +8,7 @@ floatX = theano.config.floatX + class DataSet(object): """A replay memory consisting of circular buffers for observed images, actions, and rewards. 
diff --git a/deep_q_rl/ale_experiment.py b/deep_q_rl/ale_experiment.py old mode 100755 new mode 100644 index a9a4477..b94d2a2 --- a/deep_q_rl/ale_experiment.py +++ b/deep_q_rl/ale_experiment.py @@ -11,6 +11,8 @@ # Number of rows to crop off the bottom of the (downsampled) screen. # This is appropriate for breakout, but it may need to be modified # for other games. +import time + CROP_OFFSET = 8 @@ -37,7 +39,7 @@ def __init__(self, ale, agent, resized_width, resized_height, self.height, self.width), dtype=np.uint8) - self.terminal_lol = False # Most recent episode ended on a loss of life + self.terminal_lol = False # Most recent episode ended on a loss of life self.max_start_nullops = max_start_nullops self.rng = rng @@ -46,12 +48,16 @@ def run(self): Run the desired number of training epochs, a testing epoch is conducted after each training epoch. """ + + self.agent.initialize(self.ale.getMinimalActionSet()) + for epoch in range(1, self.num_epochs + 1): + self.agent.start_epoch(epoch) self.run_epoch(epoch, self.epoch_length) self.agent.finish_epoch(epoch) if self.test_length > 0: - self.agent.start_testing() + self.agent.start_testing(epoch) self.run_epoch(epoch, self.test_length, True) self.agent.finish_testing(epoch) @@ -66,16 +72,24 @@ def run_epoch(self, epoch, num_steps, testing=False): testing - True if this Epoch is used for testing and not training """ - self.terminal_lol = False # Make sure each epoch starts with a reset. + self.terminal_lol = False # Make sure each epoch starts with a reset. 
steps_left = num_steps while steps_left > 0: prefix = "testing" if testing else "training" - logging.info(prefix + " epoch: " + str(epoch) + " steps_left: " + - str(steps_left)) - _, num_steps = self.run_episode(steps_left, testing) + t0 = time.time() + _, num_steps = self.run_episode(steps_left, testing) steps_left -= num_steps + t1 = time.time() + total_time = t1 - t0 + logging.info("[{:8}] epoch {:3} | num_steps {:7} " \ + "steps_left {:7} steps/second: {:>7.2f}" + .format(prefix, + epoch, + num_steps, + steps_left, + num_steps / total_time)) def _init_episode(self): """ This method resets the game if needed, performs enough null @@ -89,14 +103,13 @@ def _init_episode(self): if self.max_start_nullops > 0: random_actions = self.rng.randint(0, self.max_start_nullops+1) for _ in range(random_actions): - self._act(0) # Null action + self._act(0) # Null action # Make sure the screen buffer is filled at the beginning of # each episode... self._act(0) self._act(0) - def _act(self, action): """Perform the indicated action for a single frame, return the resulting reward and store the resulting screen image in the @@ -138,6 +151,8 @@ def run_episode(self, max_steps, testing): action = self.agent.start_episode(self.get_observation()) num_steps = 0 + terminal = False + while True: reward = self._step(self.min_action_set[action]) self.terminal_lol = (self.death_ends_episode and not testing and @@ -150,8 +165,8 @@ def run_episode(self, max_steps, testing): break action = self.agent.step(reward, self.get_observation()) - return terminal, num_steps + return terminal, num_steps def get_observation(self): """ Resize and merge the previous two screen images """ @@ -177,7 +192,7 @@ def resize_image(self, image): # Crop the part we want crop_y_cutoff = resize_height - CROP_OFFSET - self.resized_height cropped = resized[crop_y_cutoff: - crop_y_cutoff + self.resized_height, :] + crop_y_cutoff + self.resized_height, :] return cropped elif self.resize_method == 'scale': @@ -186,4 +201,3 
@@ def resize_image(self, image): interpolation=cv2.INTER_LINEAR) else: raise ValueError('Unrecognized image resize method.') - diff --git a/deep_q_rl/ale_run_watch.py b/deep_q_rl/ale_run_watch.py index 67a0bd5..73fd9a3 100644 --- a/deep_q_rl/ale_run_watch.py +++ b/deep_q_rl/ale_run_watch.py @@ -8,6 +8,7 @@ import subprocess import sys + def run_watch(): command = ['./run_nature.py', '--steps-per-epoch', '0', '--test-length', '10000', '--nn-file', sys.argv[1], @@ -17,8 +18,9 @@ def run_watch(): command.extend(['--rom', sys.argv[2]]) p1 = subprocess.Popen(command) - + p1.wait() + if __name__ == "__main__": run_watch() diff --git a/deep_q_rl/launcher.py b/deep_q_rl/launcher.py index c136f01..b6fc239 100755 --- a/deep_q_rl/launcher.py +++ b/deep_q_rl/launcher.py @@ -4,17 +4,28 @@ run_nips.py or run_nature.py. """ +from inspect import ismethod import os import argparse import logging import ale_python_interface -import cPickle import numpy as np import theano - import ale_experiment -import ale_agent -import q_network + + +def convert_bool_arg(params, param_name): + """Unfortunately, argparse doesn't handle converting strings to + booleans. + """ + param_val = getattr(params, param_name) + if param_val.lower() == 'true': + setattr(params, param_name, True) + elif param_val.lower() == 'false': + setattr(params, param_name, False) + else: + raise ValueError("--" + param_name + " must be true or false") + def process_args(args, defaults, description): """ @@ -26,16 +37,16 @@ def process_args(args, defaults, description): description - a string to display at the top of the help message. 
""" parser = argparse.ArgumentParser(description=description) - parser.add_argument('-r', '--rom', dest="rom", default=defaults.ROM, + parser.add_argument('-r', '--rom', dest="rom", default=defaults.rom, help='ROM to run (default: %(default)s)') parser.add_argument('-e', '--epochs', dest="epochs", type=int, - default=defaults.EPOCHS, + default=defaults.epochs, help='Number of training epochs (default: %(default)s)') parser.add_argument('-s', '--steps-per-epoch', dest="steps_per_epoch", - type=int, default=defaults.STEPS_PER_EPOCH, + type=int, default=defaults.steps_per_epoch, help='Number of steps per epoch (default: %(default)s)') parser.add_argument('-t', '--test-length', dest="steps_per_test", - type=int, default=defaults.STEPS_PER_TEST, + type=int, default=defaults.steps_per_test, help='Number of steps per test (default: %(default)s)') parser.add_argument('--display-screen', dest="display_screen", action='store_true', default=False, @@ -43,125 +54,123 @@ def process_args(args, defaults, description): parser.add_argument('--experiment-prefix', dest="experiment_prefix", default=None, help='Experiment name prefix ' - '(default is the name of the game)') + '(default is the name of the game)') parser.add_argument('--frame-skip', dest="frame_skip", - default=defaults.FRAME_SKIP, type=int, + default=defaults.frame_skip, type=int, help='Every how many frames to process ' - '(default: %(default)s)') + '(default: %(default)s)') parser.add_argument('--repeat-action-probability', dest="repeat_action_probability", - default=defaults.REPEAT_ACTION_PROBABILITY, type=float, + default=defaults.repeat_action_probability, type=float, help=('Probability that action choice will be ' + 'ignored (default: %(default)s)')) parser.add_argument('--update-rule', dest="update_rule", - type=str, default=defaults.UPDATE_RULE, + type=str, default=defaults.update_rule, help=('deepmind_rmsprop|rmsprop|sgd ' + '(default: %(default)s)')) parser.add_argument('--batch-accumulator', 
dest="batch_accumulator", - type=str, default=defaults.BATCH_ACCUMULATOR, - help=('sum|mean (default: %(default)s)')) + type=str, default=defaults.batch_accumulator, + help='sum|mean (default: %(default)s)') parser.add_argument('--learning-rate', dest="learning_rate", - type=float, default=defaults.LEARNING_RATE, + type=float, default=defaults.learning_rate, help='Learning rate (default: %(default)s)') parser.add_argument('--rms-decay', dest="rms_decay", - type=float, default=defaults.RMS_DECAY, + type=float, default=defaults.rms_decay, help='Decay rate for rms_prop (default: %(default)s)') parser.add_argument('--rms-epsilon', dest="rms_epsilon", - type=float, default=defaults.RMS_EPSILON, + type=float, default=defaults.rms_epsilon, help='Denominator epsilson for rms_prop ' + - '(default: %(default)s)') - parser.add_argument('--momentum', type=float, default=defaults.MOMENTUM, - help=('Momentum term for Nesterov momentum. '+ + '(default: %(default)s)') + parser.add_argument('--momentum', type=float, default=defaults.momentum, + help=('Momentum term for Nesterov momentum. ' + '(default: %(default)s)')) parser.add_argument('--clip-delta', dest="clip_delta", type=float, - default=defaults.CLIP_DELTA, + default=defaults.clip_delta, help=('Max absolute value for Q-update delta value. ' + '(default: %(default)s)')) - parser.add_argument('--discount', type=float, default=defaults.DISCOUNT, + parser.add_argument('--discount', type=float, default=defaults.discount, help='Discount rate') parser.add_argument('--epsilon-start', dest="epsilon_start", - type=float, default=defaults.EPSILON_START, + type=float, default=defaults.epsilon_start, help=('Starting value for epsilon. ' + '(default: %(default)s)')) parser.add_argument('--epsilon-min', dest="epsilon_min", - type=float, default=defaults.EPSILON_MIN, + type=float, default=defaults.epsilon_min, help='Minimum epsilon. 
(default: %(default)s)') parser.add_argument('--epsilon-decay', dest="epsilon_decay", - type=float, default=defaults.EPSILON_DECAY, + type=float, default=defaults.epsilon_decay, help=('Number of steps to minimum epsilon. ' + '(default: %(default)s)')) parser.add_argument('--phi-length', dest="phi_length", - type=int, default=defaults.PHI_LENGTH, + type=int, default=defaults.phi_length, help=('Number of recent frames used to represent ' + 'state. (default: %(default)s)')) parser.add_argument('--max-history', dest="replay_memory_size", - type=int, default=defaults.REPLAY_MEMORY_SIZE, + type=int, default=defaults.replay_memory_size, help=('Maximum number of steps stored in replay ' + 'memory. (default: %(default)s)')) parser.add_argument('--batch-size', dest="batch_size", - type=int, default=defaults.BATCH_SIZE, + type=int, default=defaults.batch_size, help='Batch size. (default: %(default)s)') parser.add_argument('--network-type', dest="network_type", - type=str, default=defaults.NETWORK_TYPE, + type=str, default=defaults.network_type, help=('nips_cuda|nips_dnn|nature_cuda|nature_dnn' + '|linear (default: %(default)s)')) parser.add_argument('--freeze-interval', dest="freeze_interval", - type=int, default=defaults.FREEZE_INTERVAL, + type=int, default=defaults.freeze_interval, help=('Interval between target freezes. ' + '(default: %(default)s)')) parser.add_argument('--update-frequency', dest="update_frequency", - type=int, default=defaults.UPDATE_FREQUENCY, - help=('Number of actions before each SGD update. '+ + type=int, default=defaults.update_frequency, + help=('Number of actions before each SGD update. ' + '(default: %(default)s)')) parser.add_argument('--replay-start-size', dest="replay_start_size", - type=int, default=defaults.REPLAY_START_SIZE, + type=int, default=defaults.replay_start_size, help=('Number of random steps before training. 
' + '(default: %(default)s)')) parser.add_argument('--resize-method', dest="resize_method", - type=str, default=defaults.RESIZE_METHOD, + type=str, default=defaults.resize_method, help=('crop|scale (default: %(default)s)')) parser.add_argument('--nn-file', dest="nn_file", type=str, default=None, help='Pickle file containing trained net.') parser.add_argument('--death-ends-episode', dest="death_ends_episode", - type=str, default=defaults.DEATH_ENDS_EPISODE, + type=str, default=defaults.death_ends_episode, help=('true|false (default: %(default)s)')) parser.add_argument('--max-start-nullops', dest="max_start_nullops", - type=int, default=defaults.MAX_START_NULLOPS, + type=int, default=defaults.max_start_nullops, help=('Maximum number of null-ops at the start ' + 'of games. (default: %(default)s)')) parser.add_argument('--deterministic', dest="deterministic", - type=bool, default=defaults.DETERMINISTIC, + type=str, default=defaults.deterministic, help=('Whether to use deterministic parameters ' + 'for learning. (default: %(default)s)')) parser.add_argument('--cudnn_deterministic', dest="cudnn_deterministic", - type=bool, default=defaults.CUDNN_DETERMINISTIC, + type=str, default=defaults.cudnn_deterministic, help=('Whether to use deterministic backprop. 
' + '(default: %(default)s)')) - parameters = parser.parse_args(args) - if parameters.experiment_prefix is None: - name = os.path.splitext(os.path.basename(parameters.rom))[0] - parameters.experiment_prefix = name + params = parser.parse_args(args, defaults) + if params.experiment_prefix is None: + name = os.path.splitext(os.path.basename(params.rom))[0] + params.experiment_prefix = name - if parameters.death_ends_episode == 'true': - parameters.death_ends_episode = True - elif parameters.death_ends_episode == 'false': - parameters.death_ends_episode = False - else: - raise ValueError("--death-ends-episode must be true or false") - if parameters.freeze_interval > 0: + convert_bool_arg(params, 'death_ends_episode') + convert_bool_arg(params, 'deterministic') + convert_bool_arg(params, 'cudnn_deterministic') + + + if params.freeze_interval > 0: # This addresses an inconsistency between the Nature paper and # the Deepmind code. The paper states that the target network # update frequency is "measured in the number of parameter # updates". In the code it is actually measured in the number # of action choices. 
- parameters.freeze_interval = (parameters.freeze_interval // - parameters.update_frequency) - - return parameters + params.freeze_interval = (params.freeze_interval // + params.update_frequency) + return params def launch(args, defaults, description): @@ -170,87 +179,59 @@ def launch(args, defaults, description): """ logging.basicConfig(level=logging.INFO) - parameters = process_args(args, defaults, description) + params = process_args(args, defaults, description) - if parameters.rom.endswith('.bin'): - rom = parameters.rom + if params.rom.endswith('.bin'): + rom = params.rom else: - rom = "%s.bin" % parameters.rom - full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom) + rom = "%s.bin" % params.rom + full_rom_path = os.path.join(defaults.base_rom_path, rom) - if parameters.deterministic: - rng = np.random.RandomState(123456) + if params.deterministic: + params.rng = np.random.RandomState(123456) else: - rng = np.random.RandomState() + params.rng = np.random.RandomState() - if parameters.cudnn_deterministic: + if params.cudnn_deterministic: theano.config.dnn.conv.algo_bwd = 'deterministic' ale = ale_python_interface.ALEInterface() - ale.setInt('random_seed', rng.randint(1000)) + ale.setInt('random_seed', params.rng.randint(1000)) - if parameters.display_screen: + if params.display_screen: import sys if sys.platform == 'darwin': import pygame pygame.init() ale.setBool('sound', False) # Sound doesn't work on OSX - ale.setBool('display_screen', parameters.display_screen) + ale.setBool('display_screen', params.display_screen) ale.setFloat('repeat_action_probability', - parameters.repeat_action_probability) + params.repeat_action_probability) ale.loadROM(full_rom_path) - num_actions = len(ale.getMinimalActionSet()) - - if parameters.nn_file is None: - network = q_network.DeepQLearner(defaults.RESIZED_WIDTH, - defaults.RESIZED_HEIGHT, - num_actions, - parameters.phi_length, - parameters.discount, - parameters.learning_rate, - parameters.rms_decay, - 
parameters.rms_epsilon, - parameters.momentum, - parameters.clip_delta, - parameters.freeze_interval, - parameters.batch_size, - parameters.network_type, - parameters.update_rule, - parameters.batch_accumulator, - rng) - else: - handle = open(parameters.nn_file, 'r') - network = cPickle.load(handle) - - agent = ale_agent.NeuralAgent(network, - parameters.epsilon_start, - parameters.epsilon_min, - parameters.epsilon_decay, - parameters.replay_memory_size, - parameters.experiment_prefix, - parameters.replay_start_size, - parameters.update_frequency, - rng) - - experiment = ale_experiment.ALEExperiment(ale, agent, - defaults.RESIZED_WIDTH, - defaults.RESIZED_HEIGHT, - parameters.resize_method, - parameters.epochs, - parameters.steps_per_epoch, - parameters.steps_per_test, - parameters.frame_skip, - parameters.death_ends_episode, - parameters.max_start_nullops, - rng) - + if params.agent_type is None: + raise Exception("The agent type has not been specified") + + agent = params.agent_type(params) + + experiment = ale_experiment.ALEExperiment( + ale=ale, + agent=agent, + resized_width=params.resized_width, + resized_height=params.resized_height, + resize_method=params.resize_method, + num_epochs=params.epochs, + epoch_length=params.steps_per_epoch, + test_length=params.steps_per_test, + frame_skip=params.frame_skip, + death_ends_episode=params.death_ends_episode, + max_start_nullops=params.max_start_nullops, + rng=params.rng) experiment.run() - if __name__ == '__main__': pass diff --git a/deep_q_rl/plot_filters.py b/deep_q_rl/plot_filters.py old mode 100644 new mode 100755 index 52a3da6..b8b963c --- a/deep_q_rl/plot_filters.py +++ b/deep_q_rl/plot_filters.py @@ -1,3 +1,4 @@ +#! /usr/bin/env python """ Utility to plot the first layer of convolutions learned by the Deep q-network. 
@@ -19,8 +20,8 @@ q_layers = lasagne.layers.get_all_layers(network.l_out) w = q_layers[1].W.get_value() count = 1 -for f in range(w.shape[0]): # filters - for c in range(w.shape[1]): # channels/time-steps +for f in range(w.shape[0]): # filters + for c in range(w.shape[1]): # channels/time-steps plt.subplot(w.shape[0], w.shape[1], count) img = w[f, c, :, :] plt.imshow(img, vmin=img.min(), vmax=img.max(), diff --git a/deep_q_rl/plot_results.py b/deep_q_rl/plot_results.py old mode 100644 new mode 100755 index 8588cf4..9743e44 --- a/deep_q_rl/plot_results.py +++ b/deep_q_rl/plot_results.py @@ -1,3 +1,4 @@ +#! /usr/bin/env python """Plots data corresponding to Figure 2 in Playing Atari with Deep Reinforcement Learning diff --git a/deep_q_rl/q_learner.py b/deep_q_rl/q_learner.py new file mode 100644 index 0000000..7d404d8 --- /dev/null +++ b/deep_q_rl/q_learner.py @@ -0,0 +1,30 @@ +from abc import ABCMeta, abstractmethod + + +class QLearner: + __metaclass__ = ABCMeta + + def __init__(self, + num_actions, + input_width, input_height, num_frames, + params): + pass + + @abstractmethod + def train(self, states, actions, rewards, next_states, terminals): + """ + Train one batch. + Arguments: + states - b x f x h x w numpy array, where b is batch size, + f is num frames, h is height and w is width. 
+ actions - b x 1 numpy array of integers + rewards - b x 1 numpy array + next_states - b x f x h x w numpy array + terminals - b x 1 numpy boolean array (currently ignored) + Returns: average loss + """ + pass + + @abstractmethod + def q_vals(self, state): + pass diff --git a/deep_q_rl/q_network.py b/deep_q_rl/q_network.py index 0fa360b..5828172 100644 --- a/deep_q_rl/q_network.py +++ b/deep_q_rl/q_network.py @@ -13,49 +13,61 @@ Author of Lasagne port: Nissan Pow Modifications: Nathan Sprague """ + import lasagne import numpy as np import theano import theano.tensor as T + +from q_learner import QLearner from updates import deepmind_rmsprop -class DeepQLearner: +class DeepQLearner(QLearner): """ Deep Q-learning network using Lasagne. """ - def __init__(self, input_width, input_height, num_actions, - num_frames, discount, learning_rate, rho, - rms_epsilon, momentum, clip_delta, freeze_interval, - batch_size, network_type, update_rule, - batch_accumulator, rng, input_scale=255.0): + def __init__(self, + num_actions, + input_width, input_height, num_frames, + params): + + super(DeepQLearner, self).__init__(num_actions, + input_width, + input_height, + num_frames, + params) + + self.num_actions = num_actions + self.params = params self.input_width = input_width self.input_height = input_height - self.num_actions = num_actions self.num_frames = num_frames - self.batch_size = batch_size - self.discount = discount - self.rho = rho - self.lr = learning_rate - self.rms_epsilon = rms_epsilon - self.momentum = momentum - self.clip_delta = clip_delta - self.freeze_interval = freeze_interval - self.rng = rng + self.discount = self.params.discount + self.rho = self.params.rms_decay + self.lr = self.params.learning_rate + self.rms_epsilon = self.params.rms_epsilon + self.momentum = self.params.momentum + self.clip_delta = self.params.clip_delta + self.freeze_interval = self.params.freeze_interval + self.batch_size = self.params.batch_size + self.update_rule = 
self.params.update_rule + self.batch_accumulator = self.params.batch_accumulator + + self.rng = params.rng lasagne.random.set_rng(self.rng) + self.network_type = self.params.network_type + self.update_counter = 0 - self.l_out = self.build_network(network_type, input_width, input_height, - num_actions, num_frames, batch_size) + self.l_out = self._build_network() if self.freeze_interval > 0: - self.next_l_out = self.build_network(network_type, input_width, - input_height, num_actions, - num_frames, batch_size) - self.reset_q_hat() + self.next_l_out = self._build_network() + self._reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') @@ -64,39 +76,46 @@ def __init__(self, input_width, input_height, num_actions, terminals = T.icol('terminals') self.states_shared = theano.shared( - np.zeros((batch_size, num_frames, input_height, input_width), + np.zeros((self.batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( - np.zeros((batch_size, num_frames, input_height, input_width), + np.zeros((self.batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( - np.zeros((batch_size, 1), dtype=theano.config.floatX), + np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( - np.zeros((batch_size, 1), dtype='int32'), + np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( - np.zeros((batch_size, 1), dtype='int32'), + np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) - q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) - + q_vals = lasagne.layers.get_output( + self.l_out, + states / self.params.input_scale) + if self.freeze_interval > 0: - next_q_vals = lasagne.layers.get_output(self.next_l_out, - next_states / input_scale) + next_q_vals = 
lasagne.layers.get_output( + self.next_l_out, + next_states / self.params.input_scale) + else: - next_q_vals = lasagne.layers.get_output(self.l_out, - next_states / input_scale) + next_q_vals = lasagne.layers.get_output( + self.l_out, + next_states / self.params.input_scale) + next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) - diff = target - q_vals[T.arange(batch_size), + + diff = target - q_vals[T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: @@ -115,12 +134,13 @@ def __init__(self, input_width, input_height, num_actions, else: loss = 0.5 * diff ** 2 - if batch_accumulator == 'sum': + if self.batch_accumulator == 'sum': loss = T.sum(loss) - elif batch_accumulator == 'mean': + elif self.batch_accumulator == 'mean': loss = T.mean(loss) else: - raise ValueError("Bad accumulator: {}".format(batch_accumulator)) + raise ValueError("Bad accumulator: {}" + .format(self.batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { @@ -130,16 +150,17 @@ def __init__(self, input_width, input_height, num_actions, actions: self.actions_shared, terminals: self.terminals_shared } - if update_rule == 'deepmind_rmsprop': + + if self.update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) - elif update_rule == 'rmsprop': + elif self.update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) - elif update_rule == 'sgd': + elif self.update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: - raise ValueError("Unrecognized update: {}".format(update_rule)) + raise ValueError("Unrecognized update: {}".format(self.update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, @@ -150,29 +171,44 @@ def __init__(self, input_width, 
input_height, num_actions, self._q_vals = theano.function([], q_vals, givens={states: self.states_shared}) - def build_network(self, network_type, input_width, input_height, - output_dim, num_frames, batch_size): - if network_type == "nature_cuda": - return self.build_nature_network(input_width, input_height, - output_dim, num_frames, batch_size) - if network_type == "nature_dnn": - return self.build_nature_network_dnn(input_width, input_height, - output_dim, num_frames, - batch_size) - elif network_type == "nips_cuda": - return self.build_nips_network(input_width, input_height, - output_dim, num_frames, batch_size) - elif network_type == "nips_dnn": - return self.build_nips_network_dnn(input_width, input_height, - output_dim, num_frames, - batch_size) - elif network_type == "linear": - return self.build_linear_network(input_width, input_height, - output_dim, num_frames, batch_size) + def _build_network(self): + if self.network_type == "nature_cuda": + return self._build_nature_network(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + + if self.network_type == "nature_dnn": + return self._build_nature_network_dnn(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + + elif self.network_type == "nips_cuda": + return self._build_nips_network(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + + elif self.network_type == "nips_dnn": + return self._build_nips_network_dnn(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) + + elif self.network_type == "linear": + return self._build_linear_network(self.input_width, + self.input_height, + self.num_actions, + self.num_frames, + self.batch_size) else: - raise ValueError("Unrecognized network: {}".format(network_type)) - - + raise ValueError("Unrecognized network: {}" + .format(self.network_type)) def train(self, states, actions, rewards, 
next_states, terminals): """ @@ -196,8 +232,8 @@ def train(self, states, actions, rewards, next_states, terminals): self.rewards_shared.set_value(rewards) self.terminals_shared.set_value(terminals) if (self.freeze_interval > 0 and - self.update_counter % self.freeze_interval == 0): - self.reset_q_hat() + self.update_counter % self.freeze_interval == 0): + self._reset_q_hat() loss, _ = self._train() self.update_counter += 1 return np.sqrt(loss) @@ -215,12 +251,12 @@ def choose_action(self, state, epsilon): q_vals = self.q_vals(state) return np.argmax(q_vals) - def reset_q_hat(self): + def _reset_q_hat(self): all_params = lasagne.layers.helper.get_all_param_values(self.l_out) lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) - def build_nature_network(self, input_width, input_height, output_dim, - num_frames, batch_size): + def _build_nature_network(self, input_width, input_height, output_dim, + num_frames, batch_size): """ Build a large network consistent with the DeepMind Nature paper. """ @@ -236,7 +272,7 @@ def build_nature_network(self, input_width, input_height, output_dim, filter_size=(8, 8), stride=(4, 4), nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), # Defaults to Glorot + W=lasagne.init.HeUniform(), # Defaults to Glorot b=lasagne.init.Constant(.1), dimshuffle=True ) @@ -281,9 +317,8 @@ def build_nature_network(self, input_width, input_height, output_dim, return l_out - - def build_nature_network_dnn(self, input_width, input_height, output_dim, - num_frames, batch_size): + def _build_nature_network_dnn(self, input_width, input_height, output_dim, + num_frames, batch_size): """ Build a large network consistent with the DeepMind Nature paper. 
""" @@ -341,10 +376,8 @@ def build_nature_network_dnn(self, input_width, input_height, output_dim, return l_out - - - def build_nips_network(self, input_width, input_height, output_dim, - num_frames, batch_size): + def _build_nips_network(self, input_width, input_height, output_dim, + num_frames, batch_size): """ Build a network consistent with the 2013 NIPS paper. """ @@ -359,7 +392,7 @@ def build_nips_network(self, input_width, input_height, output_dim, filter_size=(8, 8), stride=(4, 4), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(c01b=True), + # W=lasagne.init.HeUniform(c01b=True), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1), dimshuffle=True @@ -371,7 +404,7 @@ def build_nips_network(self, input_width, input_height, output_dim, filter_size=(4, 4), stride=(2, 2), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(c01b=True), + # W=lasagne.init.HeUniform(c01b=True), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1), dimshuffle=True @@ -381,7 +414,7 @@ def build_nips_network(self, input_width, input_height, output_dim, l_conv2, num_units=256, nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -390,16 +423,15 @@ def build_nips_network(self, input_width, input_height, output_dim, l_hidden1, num_units=output_dim, nonlinearity=None, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) return l_out - - def build_nips_network_dnn(self, input_width, input_height, output_dim, - num_frames, batch_size): + def _build_nips_network_dnn(self, input_width, input_height, output_dim, + num_frames, batch_size): """ Build a network consistent with the 2013 NIPS paper. 
""" @@ -410,14 +442,13 @@ def build_nips_network_dnn(self, input_width, input_height, output_dim, shape=(batch_size, num_frames, input_width, input_height) ) - l_conv1 = dnn.Conv2DDNNLayer( l_in, num_filters=16, filter_size=(8, 8), stride=(4, 4), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -428,7 +459,7 @@ def build_nips_network_dnn(self, input_width, input_height, output_dim, filter_size=(4, 4), stride=(2, 2), nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -437,7 +468,7 @@ def build_nips_network_dnn(self, input_width, input_height, output_dim, l_conv2, num_units=256, nonlinearity=lasagne.nonlinearities.rectify, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) @@ -446,16 +477,15 @@ def build_nips_network_dnn(self, input_width, input_height, output_dim, l_hidden1, num_units=output_dim, nonlinearity=None, - #W=lasagne.init.HeUniform(), + # W=lasagne.init.HeUniform(), W=lasagne.init.Normal(.01), b=lasagne.init.Constant(.1) ) return l_out - - def build_linear_network(self, input_width, input_height, output_dim, - num_frames, batch_size): + def _build_linear_network(self, input_width, input_height, output_dim, + num_frames, batch_size): """ Build a simple linear learner. Useful for creating tests that sanity-check the weight update code. 
@@ -474,11 +504,3 @@ def build_linear_network(self, input_width, input_height, output_dim, ) return l_out - -def main(): - net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000, - 32, 'nature_cuda') - - -if __name__ == '__main__': - main() diff --git a/deep_q_rl/run_nature.py b/deep_q_rl/run_nature.py index 2da46bc..c60b86e 100755 --- a/deep_q_rl/run_nature.py +++ b/deep_q_rl/run_nature.py @@ -7,59 +7,66 @@ Nature, 518(7540):529-533, February 2015 """ +import sys +from ale_agent import NeuralAgent +from q_network import DeepQLearner import launcher -import sys -class Defaults: + +class Parameters: # ---------------------- # Experiment Parameters # ---------------------- - STEPS_PER_EPOCH = 250000 - EPOCHS = 200 - STEPS_PER_TEST = 125000 + steps_per_epoch = 250000 + epochs = 200 + steps_per_test = 125000 # ---------------------- # ALE Parameters # ---------------------- - BASE_ROM_PATH = "../roms/" - ROM = 'breakout.bin' - FRAME_SKIP = 4 - REPEAT_ACTION_PROBABILITY = 0 + base_rom_path = "../roms/" + rom = 'breakout.bin' + frame_skip = 4 + repeat_action_probability = 0 # ---------------------- # Agent/Network parameters: # ---------------------- - UPDATE_RULE = 'deepmind_rmsprop' - BATCH_ACCUMULATOR = 'sum' - LEARNING_RATE = .00025 - DISCOUNT = .99 - RMS_DECAY = .95 # (Rho) - RMS_EPSILON = .01 - MOMENTUM = 0 # Note that the "momentum" value mentioned in the Nature + update_rule = 'deepmind_rmsprop' + batch_accumulator = 'sum' + learning_rate = .00025 + discount = .99 + rms_decay = .95 # (Rho) + rms_epsilon = .01 + momentum = 0 # Note that the "momentum" value mentioned in the Nature # paper is not used in the same way as a traditional momentum # term. It is used to track gradient for the purpose of # estimating the standard deviation. This package uses # rho/RMS_DECAY to track both the history of the gradient # and the squared gradient. 
- CLIP_DELTA = 1.0 - EPSILON_START = 1.0 - EPSILON_MIN = .1 - EPSILON_DECAY = 1000000 - PHI_LENGTH = 4 - UPDATE_FREQUENCY = 4 - REPLAY_MEMORY_SIZE = 1000000 - BATCH_SIZE = 32 - NETWORK_TYPE = "nature_dnn" - FREEZE_INTERVAL = 10000 - REPLAY_START_SIZE = 50000 - RESIZE_METHOD = 'scale' - RESIZED_WIDTH = 84 - RESIZED_HEIGHT = 84 - DEATH_ENDS_EPISODE = 'true' - MAX_START_NULLOPS = 30 - DETERMINISTIC = True - CUDNN_DETERMINISTIC = False + clip_delta = 1.0 + epsilon_start = 1.0 + epsilon_min = .1 + epsilon_decay = 1000000 + phi_length = 4 + update_frequency = 4 + replay_memory_size = 1000000 + batch_size = 32 + network_type = "nature_dnn" + freeze_interval = 10000 + input_scale = 255. + replay_start_size = 50000 + resize_method = 'scale' + resized_width = 84 + resized_height = 84 + death_ends_episode = 'true' + max_start_nullops = 30 + deterministic = 'true' + cudnn_deterministic = 'false' + + agent_type = NeuralAgent + qlearner_type = DeepQLearner if __name__ == "__main__": - launcher.launch(sys.argv[1:], Defaults, __doc__) + launcher.launch(sys.argv[1:], Parameters, __doc__) diff --git a/deep_q_rl/run_nips.py b/deep_q_rl/run_nips.py index 8a6ddfc..7e67a77 100755 --- a/deep_q_rl/run_nips.py +++ b/deep_q_rl/run_nips.py @@ -7,54 +7,61 @@ NIPS Deep Learning Workshop 2013 """ +from ale_agent import NeuralAgent +from q_network import DeepQLearner import launcher import sys -class Defaults: + +class Parameters: # ---------------------- # Experiment Parameters # ---------------------- - STEPS_PER_EPOCH = 50000 - EPOCHS = 100 - STEPS_PER_TEST = 10000 + steps_per_epoch = 50000 + epochs = 100 + steps_per_test = 10000 # ---------------------- # ALE Parameters # ---------------------- - BASE_ROM_PATH = "../roms/" - ROM = 'breakout.bin' - FRAME_SKIP = 4 - REPEAT_ACTION_PROBABILITY = 0 + base_rom_path = "../roms/" + rom = 'breakout.bin' + frame_skip = 4 + repeat_action_probability = 0 # ---------------------- # Agent/Network parameters: # ---------------------- - UPDATE_RULE = 
'rmsprop' - BATCH_ACCUMULATOR = 'mean' - LEARNING_RATE = .0002 - DISCOUNT = .95 - RMS_DECAY = .99 # (Rho) - RMS_EPSILON = 1e-6 - MOMENTUM = 0 - CLIP_DELTA = 0 - EPSILON_START = 1.0 - EPSILON_MIN = .1 - EPSILON_DECAY = 1000000 - PHI_LENGTH = 4 - UPDATE_FREQUENCY = 1 - REPLAY_MEMORY_SIZE = 1000000 - BATCH_SIZE = 32 - NETWORK_TYPE = "nips_dnn" - FREEZE_INTERVAL = -1 - REPLAY_START_SIZE = 100 - RESIZE_METHOD = 'crop' - RESIZED_WIDTH = 84 - RESIZED_HEIGHT = 84 - DEATH_ENDS_EPISODE = 'false' - MAX_START_NULLOPS = 0 - DETERMINISTIC = True - CUDNN_DETERMINISTIC = False + update_rule = 'rmsprop' + batch_accumulator = 'mean' + learning_rate = .0002 + discount = .95 + rms_decay = .99 # (Rho) + rms_epsilon = 1e-6 + momentum = 0 + clip_delta = 0 + epsilon_start = 1.0 + epsilon_min = .1 + epsilon_decay = 1000000 + phi_length = 4 + update_frequency = 1 + replay_memory_size = 1000000 + batch_size = 32 + network_type = "nips_dnn" + freeze_interval = -1 + input_scale = 255. + replay_start_size = 100 + resize_method = 'crop' + resized_width = 84 + resized_height = 84 + death_ends_episode = 'false' + max_start_nullops = 0 + deterministic = 'true' + cudnn_deterministic = 'false' + + agent_type = NeuralAgent + qlearner_type = DeepQLearner if __name__ == "__main__": - launcher.launch(sys.argv[1:], Defaults, __doc__) + launcher.launch(sys.argv[1:], Parameters, __doc__) diff --git a/deep_q_rl/run_random.py b/deep_q_rl/run_random.py new file mode 100755 index 0000000..a234830 --- /dev/null +++ b/deep_q_rl/run_random.py @@ -0,0 +1,66 @@ +#! 
/usr/bin/env python +""" +Execute a training run using an agent that plays random moves + +""" +import sys +from ale_agent_random import AgentRandom + +from ale_parameters_default import ParametersDefault +import launcher + + +class Parameters(ParametersDefault): + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 250000 + EPOCHS = 200 + STEPS_PER_TEST = 125000 + + # ---------------------- + # ALE Parameters + # ---------------------- + BASE_ROM_PATH = "../roms/" + ROM = 'breakout.bin' + FRAME_SKIP = 4 + REPEAT_ACTION_PROBABILITY = 0 + + # ---------------------- + # Agent/Network parameters: + # ---------------------- + UPDATE_RULE = 'deepmind_rmsprop' + BATCH_ACCUMULATOR = 'sum' + LEARNING_RATE = .00025 + DISCOUNT = .99 + RMS_DECAY = .95 # (Rho) + RMS_EPSILON = .01 + MOMENTUM = 0 # Note that the "momentum" value mentioned in the Nature + # paper is not used in the same way as a traditional momentum + # term. It is used to track gradient for the purpose of + # estimating the standard deviation. This package uses + # rho/RMS_DECAY to track both the history of the gradient + # and the squared gradient. + CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = .1 + EPSILON_DECAY = 1000000 + PHI_LENGTH = 4 + UPDATE_FREQUENCY = 4 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + NETWORK_TYPE = "nature_dnn" + FREEZE_INTERVAL = 10000 + INPUT_SCALE = 255.
+ REPLAY_START_SIZE = 50000 + RESIZE_METHOD = 'scale' + RESIZED_WIDTH = 84 + RESIZED_HEIGHT = 84 + DEATH_ENDS_EPISODE = 'true' + MAX_START_NULLOPS = 30 + + AGENT_TYPE = AgentRandom + QLEARNER_TYPE = None + +if __name__ == "__main__": + launcher.launch(sys.argv[1:], Parameters, __doc__) diff --git a/deep_q_rl/test/test_q_network.py b/deep_q_rl/test/test_q_network.py index 82cd142..e050449 100644 --- a/deep_q_rl/test/test_q_network.py +++ b/deep_q_rl/test/test_q_network.py @@ -7,9 +7,11 @@ import unittest import numpy.testing import lasagne +from deep_q_rl.run_nature import Parameters import deep_q_rl.q_network as q_network + class ChainMDP(object): """Simple markov chain style MDP. Three "rooms" and one absorbing state. States are encoded for the q_network as arrays with @@ -52,7 +54,7 @@ def act(self, state, action_index): """ action 0 is left, 1 is right. """ - state_index = np.nonzero(state[0, 0, 0, :])[0][0] + state_index = np.nonzero(state[0, 0, 0, :])[0][0] next_index = state_index if np.random.random() < self.success_prob: @@ -80,15 +82,28 @@ class LinearTests(unittest.TestCase): Q-learning code operates as good-ol-fashioned Q-learning. These tests check that the basic updates code is working correctly. """ + def setUp(self): # Divide the desired learning rate by two, because loss is # defined as L^2, not 1/2 L^2. 
- self.learning_rate = .1 / 2.0 + self.params = Parameters() + self.params.discount = .5 + self.params.learning_rate = .1 / 2.0 + self.params.rms_decay = 0 + self.params.rms_epsilon = 0 + self.params.momentum = 0 + self.params.clip_delta = 0 + self.params.freeze_interval = -1 + self.params.batch_size = 1 + self.params.network_type = 'linear' + self.params.update_rule = 'sgd' + self.params.batch_accumulator = 'sum' + self.params.input_scale = 1.0 + self.params.rng = np.random.RandomState(123456) - self.discount = .5 - self.mdp = ChainMDP() + self.mdp = ChainMDP() def all_q_vals(self, net): """ Helper method to get the entire Q-table """ @@ -101,7 +116,7 @@ def all_q_vals(self, net): def train(self, net, steps): mdp = self.mdp for _ in range(steps): - state = mdp.states[np.random.randint(0, mdp.num_states-1)] + state = mdp.states[np.random.randint(0, mdp.num_states - 1)] action_index = np.random.randint(0, mdp.num_actions) reward, next_state, terminal = mdp.act(state, action_index) @@ -109,13 +124,13 @@ def train(self, net, steps): terminal) def test_updates_sgd_no_freeze(self): - freeze_interval = -1 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.params.freeze_interval = -1 + + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) mdp = self.mdp @@ -150,16 +165,13 @@ def test_updates_sgd_no_freeze(self): [[.07, 0], [0.0035, 0], [0, .1], [0, 0]]) - def test_convergence_sgd_no_freeze(self): - freeze_interval = -1 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) - + self.params.freeze_interval = -1 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + 
num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) self.train(net, 1000) @@ -167,44 +179,38 @@ def test_convergence_sgd_no_freeze(self): [[.7, .25], [.35, .5], [.25, 1.0], [0., 0.]], 3) - def test_convergence_random_initialization(self): """ This test will only pass if terminal states are handled correctly. Otherwise the random initialization of the value of the terminal state will propagate back. """ - freeze_interval = -1 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.params.freeze_interval = -1 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) # Randomize initial q-values: params = lasagne.layers.helper.get_all_param_values(net.l_out) rand = np.random.random(params[0].shape) - rand = numpy.array(rand, dtype=theano.config.floatX) + rand = numpy.array(rand, dtype=theano.config.floatX) lasagne.layers.helper.set_all_param_values(net.l_out, [rand]) self.train(net, 1000) - numpy.testing.assert_almost_equal(self.all_q_vals(net)[0:3,:], + numpy.testing.assert_almost_equal(self.all_q_vals(net)[0:3, :], [[.7, .25], [.35, .5], [.25, 1.0]], 3) - - - def test_convergence_sgd_permanent_freeze(self): - freeze_interval = 1000000 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.params.freeze_interval = 1000000 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) self.train(net, 1000) @@ -213,13 +219,12 @@ def test_convergence_sgd_permanent_freeze(self): [0, 1.0], [0., 0.]], 3) def test_convergence_sgd_frequent_freeze(self): - freeze_interval = 2 - net = 
q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.params.freeze_interval = 2 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) self.train(net, 1000) @@ -228,19 +233,19 @@ def test_convergence_sgd_frequent_freeze(self): [.25, 1.0], [0., 0.]], 3) def test_convergence_sgd_one_freeze(self): - freeze_interval = 500 - net = q_network.DeepQLearner(self.mdp.num_states, 1, - self.mdp.num_actions, 1, - self.discount, - self.learning_rate, 0, 0, 0, 0, - freeze_interval, 1, 'linear', - 'sgd', 'sum', 1.0) + self.params.freeze_interval = 500 + net = q_network.DeepQLearner(input_width=self.mdp.num_states, + input_height=1, + num_actions=self.mdp.num_actions, + num_frames=1, + params=self.params) - self.train(net, freeze_interval * 2) + self.train(net, self.params.freeze_interval * 2) numpy.testing.assert_almost_equal(self.all_q_vals(net), [[.7, 0], [.35, .5], [0, 1.0], [0., 0.]], 3) + if __name__ == "__main__": unittest.main()