diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py
index 945f2c9..7c3ae86 100644
--- a/rl/agent/actor_critic.py
+++ b/rl/agent/actor_critic.py
@@ -114,7 +114,11 @@ def train_critic(self, minibatch):
         actor_delta = Q_next_vals - Q_vals
         loss = self.critic.train_on_batch(minibatch['states'], Q_targets)
 
+        # update memory, needed for PER
         errors = abs(np.sum(Q_vals - Q_targets, axis=1))
+        # Q size is only 1, from critic
+        assert Q_targets.shape == (self.batch_size, 1)
+        assert errors.shape == (self.batch_size, )
         self.memory.update(errors)
         return loss, actor_delta
diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py
index cb9efb8..eece44e 100644
--- a/rl/agent/ddpg.py
+++ b/rl/agent/ddpg.py
@@ -242,6 +242,7 @@ def train_an_epoch(self):
         # train critic
         mu_prime = self.actor.target_predict(minibatch['next_states'])
+        q_val = self.critic.target_predict(minibatch['states'], mu_prime)
         q_prime = self.critic.target_predict(
             minibatch['next_states'], mu_prime)
         # reshape for element-wise multiplication
@@ -250,6 +251,13 @@
             (1 - minibatch['terminals']) * np.reshape(q_prime, (-1))
         y = np.reshape(y, (-1, 1))
 
+        # update memory, needed for PER
+        errors = abs(np.sum(q_val - y, axis=1))
+        # Q size is only 1, from critic
+        assert y.shape == (self.batch_size, 1)
+        assert errors.shape == (self.batch_size, )
+        self.memory.update(errors)
+
         _, _, critic_loss = self.critic.train_tf(
             minibatch['states'], minibatch['actions'], y)
diff --git a/rl/agent/deep_sarsa.py b/rl/agent/deep_sarsa.py
index a535050..f6ddd09 100644
--- a/rl/agent/deep_sarsa.py
+++ b/rl/agent/deep_sarsa.py
@@ -1,3 +1,4 @@
+import numpy as np
 from rl.agent.dqn import DQN
 
 
@@ -30,4 +31,10 @@ def train_an_epoch(self):
         Q_targets = self.compute_Q_targets(
             minibatch, Q_states, Q_next_states_selected)
         loss = self.model.train_on_batch(minibatch['states'], Q_targets)
+
+        errors = abs(np.sum(Q_states - Q_targets, axis=1))
+        assert Q_targets.shape == (
+            self.batch_size, self.env_spec['action_dim'])
+        assert errors.shape == (self.batch_size, )
+        self.memory.update(errors)
         return loss
diff --git a/rl/agent/dqn.py b/rl/agent/dqn.py
index 26e3ae4..4810e9c 100644
--- a/rl/agent/dqn.py
+++ b/rl/agent/dqn.py
@@ -190,10 +190,12 @@ def train_an_epoch(self):
             minibatch)
         Q_targets = self.compute_Q_targets(
             minibatch, Q_states, Q_next_states_max)
-
         loss = self.model.train_on_batch(minibatch['states'], Q_targets)
 
         errors = abs(np.sum(Q_states - Q_targets, axis=1))
+        assert Q_targets.shape == (
+            self.batch_size, self.env_spec['action_dim'])
+        assert errors.shape == (self.batch_size, )
         self.memory.update(errors)
         return loss
diff --git a/rl/analytics.py b/rl/analytics.py
index 57d654a..1be9044 100644
--- a/rl/analytics.py
+++ b/rl/analytics.py
@@ -317,9 +317,10 @@ def compose_data(trial):
     }
 
     # param variables for independent vars of trials
+    default_param = trial.experiment_spec['param']
     param_variables = {
-        pv: trial.experiment_spec['param'][pv] for
-        pv in trial.param_variables}
+        pv: default_param[pv] for
+        pv in trial.param_variables if pv in default_param}
     trial.data['metrics'].update(metrics)
     trial.data['param_variables'] = param_variables
 
@@ -459,7 +460,7 @@ def analyze_data(experiment_data_or_experiment_id):
     data_df.sort_values(
         ['fitness_score'], ascending=False, inplace=True)
-    data_df.reset_index(inplace=True)
+    data_df.reset_index(drop=True, inplace=True)
 
     trial_id = experiment_data[0]['trial_id']
     save_experiment_data(data_df, trial_id)
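Note: the agent hunks above all share one pattern: right after the batch update, the absolute per-transition TD error is recomputed and pushed back into memory so PrioritizedExperienceReplay can re-weight the transitions it just served, and the new asserts pin the expected shapes (one error per sample). A minimal sketch of that pattern follows; it assumes a Keras-style model interface and integer action indices, and the inline target computation stands in for the repo's own compute_Q_targets, which this diff does not show.

import numpy as np

def train_and_update_priorities(model, memory, minibatch, gamma=0.99):
    # model.predict / model.train_on_batch follow the Keras-style interface
    # used by the agents above; memory.update is the PER hook from this diff
    Q_states = model.predict(minibatch['states'])        # (batch, action_dim)
    Q_next = model.predict(minibatch['next_states'])     # (batch, action_dim)
    batch_size, action_dim = Q_states.shape

    # DQN-style targets: only the taken action's entry changes, so the row
    # sum of (Q_states - Q_targets) below reduces to one TD error per sample
    Q_targets = Q_states.copy()
    q_max = np.amax(Q_next, axis=1)
    target = minibatch['rewards'] + gamma * (1 - minibatch['terminals']) * q_max
    Q_targets[np.arange(batch_size), minibatch['actions'].astype(int)] = target

    loss = model.train_on_batch(minibatch['states'], Q_targets)

    errors = np.abs(np.sum(Q_states - Q_targets, axis=1))
    assert Q_targets.shape == (batch_size, action_dim)
    assert errors.shape == (batch_size, )
    memory.update(errors)  # re-prioritize exactly the transitions just sampled
    return loss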
diff --git a/rl/memory/prioritized_exp_replay.py b/rl/memory/prioritized_exp_replay.py
index d6e8320..476bbe0 100644
--- a/rl/memory/prioritized_exp_replay.py
+++ b/rl/memory/prioritized_exp_replay.py
@@ -12,8 +12,13 @@ class PrioritizedExperienceReplay(LinearMemoryWithForgetting):
     memory unit
     '''
 
-    def __init__(self, env_spec, max_mem_len=10000, e=0.01, alpha=0.6,
+    def __init__(self, env_spec, max_mem_len=None, e=0.01, alpha=0.6,
                  **kwargs):
+        if max_mem_len is None:  # auto calculate mem len
+            max_timestep = env_spec['timestep_limit']
+            max_epis = env_spec['problem']['MAX_EPISODES']
+            memory_epi = np.ceil(max_epis / 3.).astype(int)
+            max_mem_len = max(10**6, max_timestep * memory_epi)
         super(PrioritizedExperienceReplay, self).__init__(
             env_spec, max_mem_len)
         self.exp_keys.append('error')
@@ -27,21 +32,18 @@ def __init__(self, env_spec, max_mem_len=10000, e=0.01, alpha=0.6,
         self.prio_tree = SumTree(self.max_mem_len)
         self.head = 0
 
-        # bump to account for negative terms in reward get_priority
-        # and we cannot abs(reward) cuz it's sign sensitive
-        SOLVED_MEAN_REWARD = self.env_spec['problem']['SOLVED_MEAN_REWARD'] or 10000
-        self.min_priority = abs(10 * SOLVED_MEAN_REWARD)
-
     def get_priority(self, error):
         # add min_priority to prevent root of negative = complex
-        p = (self.min_priority + error + self.e) ** self.alpha
-        assert not np.isnan(p)
+        p = (error + self.e) ** self.alpha
+        assert np.isfinite(p)
         return p
 
     def add_exp(self, action, reward, next_state, terminal):
         '''Round robin memory updating'''
-        # roughly the error between estimated Q and true q is the reward
-        error = reward
+        # init error to reward first, update later
+        error = abs(reward)
+        p = self.get_priority(error)
+
         if self.size() < self.max_mem_len:  # add as usual
             super(PrioritizedExperienceReplay, self).add_exp(
                 action, reward, next_state, terminal)
@@ -59,7 +61,6 @@ def add_exp(self, action, reward, next_state, terminal):
         if self.head >= self.max_mem_len:
             self.head = 0  # reset for round robin
 
-        p = self.get_priority(error)
         self.prio_tree.add(p)
 
         assert self.head == self.prio_tree.head, 'prio_tree head is wrong'
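Note: with the min_priority bump removed, get_priority now only sees non-negative errors (the callers take abs() first), so (error + e) ** alpha stays real and the assert tightens from not-NaN to isfinite. The sketch below shows how such priorities drive sampling; the repo's SumTree does this in O(log n), and the cumulative-sum search here is only a short-demo substitute. Note that alpha=0 degenerates to uniform sampling, which is why the new spec grids sweep alpha.

import numpy as np

def get_priority(error, e=0.01, alpha=0.6):
    # mirrors the updated get_priority: error is expected to be >= 0 already
    p = (error + e) ** alpha
    assert np.isfinite(p)
    return p

def sample_indices(priorities, batch_size, rng=np.random):
    # proportional sampling; equivalent in distribution to a SumTree lookup
    cum = np.cumsum(priorities)
    picks = rng.uniform(0.0, cum[-1], size=batch_size)
    return np.searchsorted(cum, picks)

priorities = np.array([get_priority(abs(err)) for err in (-0.5, 0.1, 2.0, 0.0)])
print(sample_indices(priorities, batch_size=3))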
diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py
index 900ac7d..89f15be 100644
--- a/rl/policy/actor_critic.py
+++ b/rl/policy/actor_critic.py
@@ -39,7 +39,7 @@ class SoftmaxPolicy(Policy):
     def __init__(self, env_spec, **kwargs):  # absorb generic param without breaking
         super(SoftmaxPolicy, self).__init__(env_spec)
-        self.clip_val = 500
+        self.clip_val = 500.
         log_self(self)
 
     def select_action(self, state):
@@ -47,11 +47,10 @@ def select_action(self, state):
         state = np.expand_dims(state, axis=0)
         A_score = agent.actor.predict(state)[0]  # extract from batch predict
         assert A_score.ndim == 1
-        A_score = A_score.astype('float32')  # fix precision nan issue
-        A_score = A_score - np.amax(A_score)  # prevent overflow
+        A_score = A_score.astype('float64')  # fix precision overflow
         exp_values = np.exp(
             np.clip(A_score, -self.clip_val, self.clip_val))
-        assert not np.isnan(exp_values).any()
+        assert np.isfinite(exp_values).all()
         probs = np.array(exp_values / np.sum(exp_values))
         probs /= probs.sum()  # renormalize to prevent floating pt error
         action = np.random.choice(agent.env_spec['actions'], p=probs)
@@ -83,6 +82,9 @@ def select_action(self, state):
         a_mean = agent.actor.predict(state)[0]  # extract from batch predict
         action = a_mean + np.random.normal(
             loc=0.0, scale=self.variance, size=a_mean.shape)
+        action = np.clip(action,
+                         self.env_spec['action_bound_low'],
+                         self.env_spec['action_bound_high'])
         return action
 
     def update(self, sys_vars):
diff --git a/rl/policy/boltzmann.py b/rl/policy/boltzmann.py
index ef9069b..0fdd53f 100644
--- a/rl/policy/boltzmann.py
+++ b/rl/policy/boltzmann.py
@@ -18,7 +18,7 @@ def __init__(self, env_spec,
         self.final_tau = final_tau
         self.tau = self.init_tau
         self.exploration_anneal_episodes = exploration_anneal_episodes
-        self.clip_val = 500
+        self.clip_val = 500.
         log_self(self)
 
     def select_action(self, state):
@@ -26,11 +26,10 @@ def select_action(self, state):
         state = np.expand_dims(state, axis=0)
         Q_state = agent.model.predict(state)[0]  # extract from batch predict
         assert Q_state.ndim == 1
-        Q_state = Q_state.astype('float32')  # fix precision nan issue
-        Q_state = Q_state - np.amax(Q_state)  # prevent overflow
+        Q_state = Q_state.astype('float64')  # fix precision overflow
         exp_values = np.exp(
             np.clip(Q_state / self.tau, -self.clip_val, self.clip_val))
-        assert not np.isnan(exp_values).any()
+        assert np.isfinite(exp_values).all()
         probs = np.array(exp_values / np.sum(exp_values))
         probs /= probs.sum()  # renormalize to prevent floating pt error
         action = np.random.choice(agent.env_spec['actions'], p=probs)
@@ -66,11 +65,10 @@ def select_action(self, state):
         Q_state2 = agent.model_2.predict(state)[0]
         Q_state = Q_state1 + Q_state2
         assert Q_state.ndim == 1
-        Q_state = Q_state.astype('float32')  # fix precision nan issue
-        Q_state = Q_state - np.amax(Q_state)  # prevent overflow
+        Q_state = Q_state.astype('float64')  # fix precision overflow
         exp_values = np.exp(
             np.clip(Q_state / self.tau, -self.clip_val, self.clip_val))
-        assert not np.isnan(exp_values).any()
+        assert np.isfinite(exp_values).all()
         probs = np.array(exp_values / np.sum(exp_values))
         probs /= probs.sum()  # renormalize to prevent floating pt error
         action = np.random.choice(agent.env_spec['actions'], p=probs)
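Note: the softmax/Boltzmann hunks drop the float32 cast and the max-subtraction in favour of a float64 cast plus clipping at ±500. float32 overflows around exp(88) while float64 is safe up to roughly exp(709), so the clipped scores stay finite; subtracting np.amax remains the more conventional guard, and the clip means any two scores above clip_val collapse to equal probability. A small self-contained check (the function name is illustrative, not the library's API):

import numpy as np

def softmax_probs(scores, tau=1.0, clip_val=500.):
    scores = np.asarray(scores, dtype='float64')
    exp_values = np.exp(np.clip(scores / tau, -clip_val, clip_val))
    assert np.isfinite(exp_values).all()
    probs = exp_values / np.sum(exp_values)
    probs /= probs.sum()  # renormalize to absorb floating point error
    return probs

print(softmax_probs([600., 10., -5.]))   # finite thanks to float64 + clipping
print(np.exp(np.float32(88.8)))          # inf: the float32 overflow being avoided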
diff --git a/rl/policy/noise.py b/rl/policy/noise.py
index fec9507..e1b695b 100644
--- a/rl/policy/noise.py
+++ b/rl/policy/noise.py
@@ -1,6 +1,7 @@
 import numpy as np
 from rl.util import log_self
 from rl.policy.base_policy import Policy
+from rl.policy.epsilon_greedy import EpsilonGreedyPolicy
 
 
 class NoNoisePolicy(Policy):
@@ -25,6 +26,9 @@ def select_action(self, state):
         state = np.expand_dims(state, axis=0)
         if self.env_spec['actions'] == 'continuous':
             action = agent.actor.predict(state)[0] + self.sample()
+            action = np.clip(action,
+                             self.env_spec['action_bound_low'],
+                             self.env_spec['action_bound_high'])
         else:
             Q_state = agent.actor.predict(state)[0]
             assert Q_state.ndim == 1
@@ -60,6 +64,26 @@ def update(self, sys_vars):
         self.n_step = sys_vars['epi']
 
 
+class EpsilonGreedyNoisePolicy(EpsilonGreedyPolicy, NoNoisePolicy):
+
+    '''
+    akin to epsilon greedy decay,
+    but return random sample instead
+    '''
+
+    def sample(self):
+        if self.e > np.random.rand():
+            noise = np.random.uniform(
+                0.5 * self.env_spec['action_bound_low'],
+                0.5 * self.env_spec['action_bound_high'])
+        else:
+            noise = 0
+        return noise
+
+    def select_action(self, state):
+        return NoNoisePolicy.select_action(self, state)
+
+
 class AnnealedGaussianPolicy(LinearNoisePolicy):
 
     '''
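Note: EpsilonGreedyNoisePolicy combines EpsilonGreedyPolicy (which supplies self.e and its per-episode decay) with NoNoisePolicy (whose select_action adds self.sample() and now clips to the action bounds). A hedged sketch of the intended behaviour for continuous actions; the helper name and the standalone bound handling are illustrative rather than the repo's API:

import numpy as np

def epsilon_greedy_noisy_action(a_mean, e, low, high, rng=np.random):
    # a_mean: deterministic action from the actor; e: current epsilon,
    # decayed per episode by the EpsilonGreedyPolicy side of the MRO
    if e > rng.rand():
        noise = rng.uniform(0.5 * low, 0.5 * high)
    else:
        noise = 0.0
    return np.clip(a_mean + noise, low, high)

print(epsilon_greedy_noisy_action(np.array([0.3]), e=0.5, low=-2.0, high=2.0))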
"hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10, - "e": 0.01, - "alpha": 0.5, - "max_mem_len": 20000 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [32, 16] - ] - } - }, "sarsa": { "problem": "CartPole-v0", "Agent": "DeepSarsa", @@ -222,68 +190,6 @@ ] } }, - "sarsa_epsilon": { - "problem": "CartPole-v0", - "Agent": "DeepSarsa", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "EpsilonGreedyPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 50, - "max_mem_len": 50000 - }, - "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [16, 8], - [200, 100] - ] - } - }, - "rand_sarsa": { - "problem": "CartPole-v0", - "Agent": "DeepSarsa", - "HyperOptimizer": "RandomSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "BoltzmannPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "max_evals": 50, - "lr": 0.01, - "gamma": 0.99, - "hidden_layers": [32], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 - }, - "param_range": { - "lr": { - "min": 0.0005, - "max": 0.05 - }, - "gamma": { - "min": 0.95, - "max": 0.999 - }, - "hidden_layers": [ - [16], - [32], - [64], - [16, 8], - [32, 16] - ] - } - }, "exp_sarsa": { "problem": "CartPole-v0", "Agent": "DeepExpectedSarsa", @@ -410,6 +316,31 @@ ] } }, + "dqn_per_v1": { + "problem": "CartPole-v1", + "Agent": "DQN", + "HyperOptimizer": "GridSearch", + "Memory": "PrioritizedExperienceReplay", + "Optimizer": "AdamOptimizer", + "Policy": "BoltzmannPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "exploration_anneal_episodes": 10, + "gamma": 0.999, + "hidden_layers": [128], + "hidden_layers_activation": "sigmoid", + "lr": 0.005 + }, + "param_range": { + "max_mem_len": [5000, 10000, 20000], + "alpha": [0.0, 0.6, 0.8, 1.0], + "gamma": [0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, "rand_dqn_v1": { "problem": "CartPole-v1", "Agent": "DQN", @@ -472,32 +403,6 @@ ] } }, - "double_dqn_per_v1": { - "problem": "CartPole-v1", - "Agent": "DoubleDQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "DoubleDQNBoltzmannPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.999, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [32, 16] - ] - } - }, "offpol_sarsa_v1": { "problem": "CartPole-v1", "Agent": "OffPolicySarsa", @@ -602,32 +507,6 @@ ] } }, - "acrobot_double_dqn_per": { - "problem": "Acrobot-v1", - "Agent": "DoubleDQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "DoubleDQNBoltzmannPolicy", - "PreProcessor": "StackStates", - "param": { - "train_per_n_new_exp": 1, - "lr": 0.01, - "gamma": 0.99, - "hidden_layers": [32], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 200 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [200], 
diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json
index e007b88..65e012f 100644
--- a/rl/spec/classic_experiment_specs.json
+++ b/rl/spec/classic_experiment_specs.json
@@ -92,14 +92,11 @@
       "exploration_anneal_episodes": 10
     },
     "param_range": {
-      "lr": [0.001, 0.005, 0.01, 0.02],
-      "gamma": [0.95, 0.97, 0.99, 0.999],
+      "max_mem_len": [2000, 5000, 10000],
+      "alpha": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
       "hidden_layers": [
-        [16],
-        [32],
         [64],
-        [16, 8],
-        [32, 16]
+        [128]
       ]
     }
   },
@@ -165,35 +162,6 @@
       ]
     }
   },
-  "double_dqn_per": {
-    "problem": "CartPole-v0",
-    "Agent": "DoubleDQN",
-    "HyperOptimizer": "GridSearch",
-    "Memory": "PrioritizedExperienceReplay",
-    "Optimizer": "AdamOptimizer",
-    "Policy": "DoubleDQNBoltzmannPolicy",
-    "PreProcessor": "NoPreProcessor",
-    "param": {
-      "lr": 0.02,
-      "gamma": 0.99,
-      "hidden_layers": [64],
-      "hidden_layers_activation": "sigmoid",
-      "exploration_anneal_episodes": 10,
-      "e": 0.01,
-      "alpha": 0.5,
-      "max_mem_len": 20000
-    },
-    "param_range": {
-      "lr": [0.001, 0.005, 0.01, 0.02],
-      "gamma": [0.97, 0.99, 0.999],
-      "hidden_layers": [
-        [16],
-        [32],
-        [64],
-        [32, 16]
-      ]
-    }
-  },
   "sarsa": {
     "problem": "CartPole-v0",
     "Agent": "DeepSarsa",
@@ -222,68 +190,6 @@
       ]
     }
   },
-  "sarsa_epsilon": {
-    "problem": "CartPole-v0",
-    "Agent": "DeepSarsa",
-    "HyperOptimizer": "GridSearch",
-    "Memory": "LinearMemoryWithForgetting",
-    "Optimizer": "AdamOptimizer",
-    "Policy": "EpsilonGreedyPolicy",
-    "PreProcessor": "NoPreProcessor",
-    "param": {
-      "lr": 0.02,
-      "gamma": 0.99,
-      "hidden_layers": [64],
-      "hidden_layers_activation": "sigmoid",
-      "exploration_anneal_episodes": 50,
-      "max_mem_len": 50000
-    },
-    "param_range": {
-      "lr": [0.005, 0.01, 0.02, 0.05],
-      "gamma": [0.97, 0.99, 0.999],
-      "hidden_layers": [
-        [16],
-        [32],
-        [64],
-        [16, 8],
-        [200, 100]
-      ]
-    }
-  },
-  "rand_sarsa": {
-    "problem": "CartPole-v0",
-    "Agent": "DeepSarsa",
-    "HyperOptimizer": "RandomSearch",
-    "Memory": "LinearMemoryWithForgetting",
-    "Optimizer": "AdamOptimizer",
-    "Policy": "BoltzmannPolicy",
-    "PreProcessor": "NoPreProcessor",
-    "param": {
-      "max_evals": 50,
-      "lr": 0.01,
-      "gamma": 0.99,
-      "hidden_layers": [32],
-      "hidden_layers_activation": "sigmoid",
-      "exploration_anneal_episodes": 10
-    },
-    "param_range": {
-      "lr": {
-        "min": 0.0005,
-        "max": 0.05
-      },
-      "gamma": {
-        "min": 0.95,
-        "max": 0.999
-      },
-      "hidden_layers": [
-        [16],
-        [32],
-        [64],
-        [16, 8],
-        [32, 16]
-      ]
-    }
-  },
   "exp_sarsa": {
     "problem": "CartPole-v0",
     "Agent": "DeepExpectedSarsa",
@@ -410,6 +316,31 @@
       ]
     }
   },
+  "dqn_per_v1": {
+    "problem": "CartPole-v1",
+    "Agent": "DQN",
+    "HyperOptimizer": "GridSearch",
+    "Memory": "PrioritizedExperienceReplay",
+    "Optimizer": "AdamOptimizer",
+    "Policy": "BoltzmannPolicy",
+    "PreProcessor": "NoPreProcessor",
+    "param": {
+      "exploration_anneal_episodes": 10,
+      "gamma": 0.999,
+      "hidden_layers": [128],
+      "hidden_layers_activation": "sigmoid",
+      "lr": 0.005
+    },
+    "param_range": {
+      "max_mem_len": [5000, 10000, 20000],
+      "alpha": [0.0, 0.6, 0.8, 1.0],
+      "gamma": [0.99, 0.999],
+      "hidden_layers": [
+        [64],
+        [128]
+      ]
+    }
+  },
   "rand_dqn_v1": {
     "problem": "CartPole-v1",
     "Agent": "DQN",
@@ -472,32 +403,6 @@
       ]
     }
   },
-  "double_dqn_per_v1": {
-    "problem": "CartPole-v1",
-    "Agent": "DoubleDQN",
-    "HyperOptimizer": "GridSearch",
-    "Memory": "PrioritizedExperienceReplay",
-    "Optimizer": "AdamOptimizer",
-    "Policy": "DoubleDQNBoltzmannPolicy",
-    "PreProcessor": "NoPreProcessor",
-    "param": {
-      "lr": 0.02,
-      "gamma": 0.999,
-      "hidden_layers": [64],
-      "hidden_layers_activation": "sigmoid",
-      "exploration_anneal_episodes": 10
-    },
-    "param_range": {
-      "lr": [0.001, 0.005, 0.01, 0.02],
-      "gamma": [0.97, 0.99, 0.999],
-      "hidden_layers": [
-        [16],
-        [32],
-        [64],
-        [32, 16]
-      ]
-    }
-  },
   "offpol_sarsa_v1": {
     "problem": "CartPole-v1",
     "Agent": "OffPolicySarsa",
@@ -602,32 +507,6 @@
       ]
     }
   },
-  "acrobot_double_dqn_per": {
-    "problem": "Acrobot-v1",
-    "Agent": "DoubleDQN",
-    "HyperOptimizer": "GridSearch",
-    "Memory": "PrioritizedExperienceReplay",
-    "Optimizer": "AdamOptimizer",
-    "Policy": "DoubleDQNBoltzmannPolicy",
-    "PreProcessor": "StackStates",
-    "param": {
-      "train_per_n_new_exp": 1,
-      "lr": 0.01,
-      "gamma": 0.99,
-      "hidden_layers": [32],
-      "hidden_layers_activation": "sigmoid",
-      "exploration_anneal_episodes": 200
-    },
-    "param_range": {
-      "lr": [0.001, 0.005, 0.01],
-      "gamma": [0.97, 0.99, 0.999],
-      "hidden_layers": [
-        [200],
-        [200, 100],
-        [400, 200, 100]
-      ]
-    }
-  },
   "acrobot_offpol_sarsa": {
     "problem": "Acrobot-v1",
     "Agent": "OffPolicySarsa",
@@ -857,26 +736,25 @@
       ]
     }
   },
-  "pendulum_ddpg_per": {
+  "pendulum_ddpg_epsilonnoise": {
     "problem": "Pendulum-v0",
     "Agent": "DDPG",
     "HyperOptimizer": "GridSearch",
-    "Memory": "PrioritizedExperienceReplay",
+    "Memory": "LinearMemoryWithForgetting",
     "Optimizer": "AdamOptimizer",
-    "Policy": "NoNoisePolicy",
+    "Policy": "EpsilonGreedyNoisePolicy",
     "PreProcessor": "NoPreProcessor",
     "param": {
       "batch_size": 64,
       "n_epoch": 1,
-      "tau": 0.001,
+      "tau": 0.005,
       "lr": 0.001,
       "critic_lr": 0.001,
       "exploration_anneal_episodes": 50,
       "gamma": 0.97,
       "hidden_layers": [400, 300],
       "hidden_layers_activation": "relu",
-      "output_layer_activation": "tanh",
-      "max_mem_len": 30000
+      "output_layer_activation": "tanh"
     },
     "param_range": {
       "lr": [0.0001, 0.0005, 0.001],
@@ -889,11 +767,11 @@
       ]
     }
   },
-  "pendulum_ddpg_per_linearnoise": {
+  "pendulum_ddpg_linearnoise": {
     "problem": "Pendulum-v0",
     "Agent": "DDPG",
     "HyperOptimizer": "GridSearch",
-    "Memory": "PrioritizedExperienceReplay",
+    "Memory": "LinearMemoryWithForgetting",
     "Optimizer": "AdamOptimizer",
     "Policy": "LinearNoisePolicy",
     "PreProcessor": "NoPreProcessor",
@@ -901,41 +779,39 @@
       "batch_size": 64,
       "n_epoch": 1,
       "tau": 0.005,
-      "lr": 0.0005,
-      "critic_lr": 0.001,
-      "exploration_anneal_episodes": 100,
+      "lr": 0.0001,
+      "critic_lr": 0.005,
+      "exploration_anneal_episodes": 50,
       "gamma": 0.97,
-      "hidden_layers": [400, 200],
+      "hidden_layers": [400, 300],
       "hidden_layers_activation": "relu",
-      "output_layer_activation": "tanh",
-      "max_mem_len": 30000
+      "output_layer_activation": "tanh"
     },
     "param_range": {
-      "lr": [0.0001, 0.0005],
+      "lr": [0.0001, 0.0005, 0.001],
       "critic_lr": [0.001, 0.005],
       "gamma": [0.95, 0.97, 0.99],
       "hidden_layers": [
-        [200, 100],
-        [400, 200],
         [400, 300],
-        [800, 400]
+        [800, 400, 200],
+        [800, 600, 400, 200]
       ]
     }
   },
-  "pendulum_ddpg_linearnoise": {
+  "pendulum_ddpg_ounoise": {
     "problem": "Pendulum-v0",
     "Agent": "DDPG",
     "HyperOptimizer": "GridSearch",
     "Memory": "LinearMemoryWithForgetting",
     "Optimizer": "AdamOptimizer",
-    "Policy": "LinearNoisePolicy",
+    "Policy": "NoNoisePolicy",
     "PreProcessor": "NoPreProcessor",
     "param": {
       "batch_size": 64,
       "n_epoch": 1,
       "tau": 0.005,
-      "lr": 0.0001,
-      "critic_lr": 0.005,
+      "lr": 0.001,
+      "critic_lr": 0.001,
       "exploration_anneal_episodes": 50,
       "gamma": 0.97,
       "hidden_layers": [400, 300],
@@ -953,13 +829,13 @@
       ]
     }
   },
-  "pendulum_ddpg_ounoise": {
+  "pendulum_ddpg_gaussiannoise": {
     "problem": "Pendulum-v0",
     "Agent": "DDPG",
     "HyperOptimizer": "GridSearch",
     "Memory": "LinearMemoryWithForgetting",
     "Optimizer": "AdamOptimizer",
-    "Policy": "NoNoisePolicy",
+    "Policy": "GaussianWhiteNoisePolicy",
     "PreProcessor": "NoPreProcessor",
     "param": {
       "batch_size": 64,
@@ -984,25 +860,26 @@
       ]
     }
   },
-  "pendulum_ddpg_gaussiannoise": {
+  "pendulum_ddpg_per": {
     "problem": "Pendulum-v0",
     "Agent": "DDPG",
     "HyperOptimizer": "GridSearch",
-    "Memory": "LinearMemoryWithForgetting",
+    "Memory": "PrioritizedExperienceReplay",
    "Optimizer": "AdamOptimizer",
-    "Policy": "GaussianWhiteNoisePolicy",
+    "Policy": "NoNoisePolicy",
     "PreProcessor": "NoPreProcessor",
     "param": {
       "batch_size": 64,
       "n_epoch": 1,
-      "tau": 0.005,
+      "tau": 0.001,
       "lr": 0.001,
       "critic_lr": 0.001,
       "exploration_anneal_episodes": 50,
       "gamma": 0.97,
       "hidden_layers": [400, 300],
       "hidden_layers_activation": "relu",
-      "output_layer_activation": "tanh"
+      "output_layer_activation": "tanh",
+      "max_mem_len": 30000
     },
     "param_range": {
       "lr": [0.0001, 0.0005, 0.001],
@@ -1015,6 +892,39 @@
       ]
     }
   },
+  "pendulum_ddpg_per_linearnoise": {
+    "problem": "Pendulum-v0",
+    "Agent": "DDPG",
+    "HyperOptimizer": "GridSearch",
+    "Memory": "PrioritizedExperienceReplay",
+    "Optimizer": "AdamOptimizer",
+    "Policy": "LinearNoisePolicy",
+    "PreProcessor": "NoPreProcessor",
+    "param": {
+      "batch_size": 64,
+      "n_epoch": 1,
+      "tau": 0.005,
+      "lr": 0.0005,
+      "critic_lr": 0.001,
+      "exploration_anneal_episodes": 100,
+      "gamma": 0.97,
+      "hidden_layers": [400, 200],
+      "hidden_layers_activation": "relu",
+      "output_layer_activation": "tanh",
+      "max_mem_len": 30000
+    },
+    "param_range": {
+      "lr": [0.0001, 0.0005],
+      "critic_lr": [0.001, 0.005],
+      "gamma": [0.95, 0.97, 0.99],
+      "hidden_layers": [
+        [200, 100],
+        [400, 200],
+        [400, 300],
+        [800, 400]
+      ]
+    }
+  },
   "mountain_dqn": {
     "problem": "MountainCar-v0",
     "Agent": "DQN",
@@ -1078,22 +988,22 @@
     "Policy": "BoltzmannPolicy",
     "PreProcessor": "NoPreProcessor",
     "param": {
-      "batch_size": 32,
+      "batch_size": 64,
       "lr": 0.001,
       "gamma": 0.99,
       "hidden_layers": [128, 64],
       "hidden_layers_activation": "sigmoid",
       "output_layer_activation": "linear",
       "exploration_anneal_episodes": 50,
-      "epi_change_lr": 150,
-      "max_mem_len": 30000
+      "epi_change_lr": 150
     },
     "param_range": {
-      "lr": [0.005, 0.01, 0.02, 0.05],
+      "lr": [0.001, 0.005, 0.01],
       "gamma": [0.99, 0.999],
       "hidden_layers": [
         [400],
-        [800]
+        [800],
+        [1200]
       ]
     }
   },
@@ -1128,7 +1038,7 @@
     "problem": "MountainCar-v0",
     "Agent": "ActorCritic",
     "HyperOptimizer": "GridSearch",
-    "Memory": "LinearMemoryWithForgetting",
+    "Memory": "PrioritizedExperienceReplay",
     "Optimizer": "AdamOptimizer",
     "Policy": "SoftmaxPolicy",
     "PreProcessor": "NoPreProcessor",
@@ -1136,7 +1046,8 @@
       "lr": 0.02,
       "gamma": 0.99,
       "hidden_layers": [64],
-      "hidden_layers_activation": "sigmoid"
+      "hidden_layers_activation": "relu",
+      "max_mem_len": 50000
     },
     "param_range": {
       "lr": [0.001, 0.005, 0.01],
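Note: the CartPole/Pendulum/MountainCar spec changes move the sweeps onto memory hyperparameters (max_mem_len, alpha) rather than only lr/gamma. That only works if the spec's "param" dict reaches the memory constructor; the sketch below assumes the experiment factory forwards it as keyword arguments, which matches the **kwargs-absorbing signature of PrioritizedExperienceReplay above but is otherwise an assumption about wiring not shown in this diff.

spec_param = {
    "lr": 0.005,
    "gamma": 0.999,
    "hidden_layers": [128],
    "hidden_layers_activation": "sigmoid",
    "max_mem_len": 10000,
    "alpha": 0.6,
}

class DemoPER:
    # same keyword surface as PrioritizedExperienceReplay above; generic
    # keys like lr or hidden_layers simply land in **kwargs and are ignored
    def __init__(self, env_spec, max_mem_len=None, e=0.01, alpha=0.6, **kwargs):
        self.max_mem_len = max_mem_len
        self.e = e
        self.alpha = alpha

memory = DemoPER(env_spec={}, **spec_param)
print(memory.max_mem_len, memory.alpha)  # 10000 0.6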
diff --git a/test/test_atari.py b/test/test_atari.py
new file mode 100644
index 0000000..0888757
--- /dev/null
+++ b/test/test_atari.py
@@ -0,0 +1,21 @@
+import unittest
+import pytest
+from os import environ
+from rl.experiment import run
+from . import conftest
+import pandas as pd
+
+
+class AtariTest(unittest.TestCase):
+
+    @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable")
+    @classmethod
+    def test_breakout_dqn(cls):
+        data_df = run('breakout_dqn')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable")
+    @classmethod
+    def test_breakout_double_dqn(cls):
+        data_df = run('breakout_double_dqn')
+        assert isinstance(data_df, pd.DataFrame)
diff --git a/test/test_box2d.py b/test/test_box2d.py
new file mode 100644
index 0000000..db0ea41
--- /dev/null
+++ b/test/test_box2d.py
@@ -0,0 +1,29 @@
+import unittest
+import pytest
+from os import environ
+from rl.experiment import run
+from . import conftest
+import pandas as pd
+
+
+class Box2DTest(unittest.TestCase):
+
+    @classmethod
+    def test_lunar_dqn(cls):
+        data_df = run('lunar_dqn')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @classmethod
+    def test_lunar_double_dqn(cls):
+        data_df = run('lunar_double_dqn')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @classmethod
+    def test_lunar_freeze(cls):
+        data_df = run('lunar_freeze')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @classmethod
+    def test_walker_ddpg_linearnoise(cls):
+        data_df = run('walker_ddpg_linearnoise')
+        assert isinstance(data_df, pd.DataFrame)
diff --git a/test/test_advanced.py b/test/test_classic.py
similarity index 61%
rename from test/test_advanced.py
rename to test/test_classic.py
index 7407e56..b395ac9 100644
--- a/test/test_advanced.py
+++ b/test/test_classic.py
@@ -6,50 +6,46 @@
 import pandas as pd
 
 
-class AdvancedTest(unittest.TestCase):
+class ClassicTest(unittest.TestCase):
 
     @classmethod
-    def test_sarsa(cls):
-        data_df = run('rand_sarsa')
+    def test_quickstart_dqn(cls):
+        data_df = run('quickstart_dqn')
         assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
-    def test_exp_sarsa(cls):
-        data_df = run('exp_sarsa')
+    def test_dqn_epsilon(cls):
+        data_df = run('dqn_epsilon')
         assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
-    def test_offpol_sarsa(cls):
-        data_df = run('offpol_sarsa')
+    def test_dqn(cls):
+        data_df = run('dqn')
         assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
-    def test_acrobot(cls):
-        data_df = run('acrobot')
+    def test_dqn_per(cls):
+        data_df = run('dqn_per')
        assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
-    def test_mountain_dqn(cls):
-        data_df = run('mountain_dqn')
+    def test_double_dqn(cls):
+        data_df = run('double_dqn')
         assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
-    def test_lunar_dqn(cls):
-        data_df = run('lunar_dqn')
+    def test_sarsa(cls):
+        data_df = run('sarsa')
         assert isinstance(data_df, pd.DataFrame)
 
-    @unittest.skipIf(environ.get('CI'),
-                     "Delay CI test until dev stable")
     @classmethod
-    def test_breakout_dqn(cls):
-        data_df = run('breakout_dqn')
+    def test_exp_sarsa(cls):
+        data_df = run('exp_sarsa')
         assert isinstance(data_df, pd.DataFrame)
 
-    @unittest.skipIf(environ.get('CI'),
-                     "Delay CI test until dev stable")
     @classmethod
-    def test_breakout_double_dqn(cls):
-        data_df = run('breakout_double_dqn')
+    def test_offpol_sarsa(cls):
+        data_df = run('offpol_sarsa')
         assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
@@ -58,6 +54,21 @@ def test_cartpole_ac_argmax(cls):
         assert isinstance(data_df, pd.DataFrame)
 
     @classmethod
-    def test_pendulum_ddpg(cls):
-        data_df = run('pendulum_ddpg')
+    def test_dqn_v1(cls):
+        data_df = run('dqn_v1')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @classmethod
+    def test_acrobot(cls):
+        data_df = run('acrobot')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @classmethod
+    def test_pendulum_ddpg_linearnoise(cls):
+        data_df = run('pendulum_ddpg_linearnoise')
+        assert isinstance(data_df, pd.DataFrame)
+
+    @classmethod
+    def test_mountain_dqn(cls):
+        data_df = run('mountain_dqn')
         assert isinstance(data_df, pd.DataFrame)
diff --git a/test/test_basic.py b/test/test_dev.py
similarity index 78%
rename from test/test_basic.py
rename to test/test_dev.py
index 4cac079..0d577d2 100644
--- a/test/test_basic.py
+++ b/test/test_dev.py
@@ -6,7 +6,7 @@
 import pandas as pd
 
 
-class BasicTest(unittest.TestCase):
+class DevTest(unittest.TestCase):
 
     @classmethod
     def test_clean_import(cls):
@@ -46,18 +46,3 @@ def test_dqn_pass(cls):
     # def test_dqn_random_search(cls):
     #     data_df = run('test_dqn_random_search', param_selection=True)
     #     assert isinstance(data_df, pd.DataFrame)
-
-    @classmethod
-    def test_dqn(cls):
-        data_df = run('dqn')
-        assert isinstance(data_df, pd.DataFrame)
-
-    @classmethod
-    def test_dqn(cls):
-        data_df = run('double_dqn_per')
-        assert isinstance(data_df, pd.DataFrame)
-
-    @classmethod
-    def test_double_dqn(cls):
-        data_df = run('double_dqn')
-        assert isinstance(data_df, pd.DataFrame)
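Note: taken together, the memory and agent hunks define one PER lifecycle: add_exp seeds a transition's priority from abs(reward), rand_minibatch samples proportionally to priority, and the agent's train step calls memory.update(errors) to overwrite those priorities with fresh TD errors. A toy stand-in for reference (plain Python list instead of the SumTree; method names mirror the diff but the internals are simplified assumptions):

import numpy as np

class ToyPER:
    '''Toy stand-in for PrioritizedExperienceReplay: a list instead of a
    SumTree, but the same add -> sample -> update lifecycle as the diff.'''

    def __init__(self, e=0.01, alpha=0.6):
        self.e, self.alpha = e, alpha
        self.exps, self.priorities = [], []
        self.last_indices = None

    def get_priority(self, error):
        return (abs(error) + self.e) ** self.alpha

    def add_exp(self, exp, reward):
        # new experiences start with a priority seeded from |reward|,
        # refined later once the agent has a real TD error for them
        self.exps.append(exp)
        self.priorities.append(self.get_priority(abs(reward)))

    def rand_minibatch(self, batch_size):
        p = np.array(self.priorities)
        probs = p / p.sum()
        self.last_indices = np.random.choice(len(self.exps), batch_size, p=probs)
        return [self.exps[i] for i in self.last_indices]

    def update(self, errors):
        # called by the agents right after train_on_batch / train_tf
        for i, err in zip(self.last_indices, errors):
            self.priorities[i] = self.get_priority(err)

On this toy, a loop of add_exp, rand_minibatch and update reproduces the ordering the agents rely on: priorities exist before the first sample and are refreshed immediately after each training batch.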