From 32e06348b6e2f0ac0290c11eaab1e065889c11b1 Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 24 Apr 2017 08:51:12 -0400 Subject: [PATCH 01/33] use per for mountain ac --- rl/spec/classic_experiment_specs.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index e007b88..cb07cc2 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -1128,7 +1128,7 @@ "problem": "MountainCar-v0", "Agent": "ActorCritic", "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", + "Memory": "PrioritizedExperienceReplay", "Optimizer": "AdamOptimizer", "Policy": "SoftmaxPolicy", "PreProcessor": "NoPreProcessor", @@ -1136,7 +1136,8 @@ "lr": 0.02, "gamma": 0.99, "hidden_layers": [64], - "hidden_layers_activation": "sigmoid" + "hidden_layers_activation": "relu", + "max_mem_len": 50000 }, "param_range": { "lr": [0.001, 0.005, 0.01], From aea3a3f24fb7b4f59a3cdb430eaa762a5066aaac Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 24 Apr 2017 08:51:54 -0400 Subject: [PATCH 02/33] mountain per --- rl/spec/classic_experiment_specs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index cb07cc2..4bb988f 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -1086,7 +1086,7 @@ "output_layer_activation": "linear", "exploration_anneal_episodes": 50, "epi_change_lr": 150, - "max_mem_len": 30000 + "max_mem_len": 50000 }, "param_range": { "lr": [0.005, 0.01, 0.02, 0.05], From 763f78e03692e8595bea58cc660175fefb76936f Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 17:45:14 -0400 Subject: [PATCH 03/33] fix per, add missing memory update to ddpg --- rl/agent/actor_critic.py | 3 +++ rl/agent/ddpg.py | 7 +++++++ rl/agent/dqn.py | 2 ++ rl/memory/prioritized_exp_replay.py | 14 +++++--------- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py index 945f2c9..c009f03 100644 --- a/rl/agent/actor_critic.py +++ b/rl/agent/actor_critic.py @@ -114,7 +114,10 @@ def train_critic(self, minibatch): actor_delta = Q_next_vals - Q_vals loss = self.critic.train_on_batch(minibatch['states'], Q_targets) + # update memory, needed for PER errors = abs(np.sum(Q_vals - Q_targets, axis=1)) + assert Q_targets.shape == (self.batch_size, 1) + assert errors.shape == (self.batch_size, ) self.memory.update(errors) return loss, actor_delta diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py index cb9efb8..f779d1d 100644 --- a/rl/agent/ddpg.py +++ b/rl/agent/ddpg.py @@ -242,6 +242,7 @@ def train_an_epoch(self): # train critic mu_prime = self.actor.target_predict(minibatch['next_states']) + q_val = self.critic.target_predict(minibatch['states'], mu_prime) q_prime = self.critic.target_predict( minibatch['next_states'], mu_prime) # reshape for element-wise multiplication @@ -250,6 +251,12 @@ def train_an_epoch(self): (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) y = np.reshape(y, (-1, 1)) + # update memory, needed for PER + errors = abs(np.sum(q_val - y, axis=1)) + assert y.shape == (self.batch_size, 1) + assert errors.shape == (self.batch_size, ) + self.memory.update(errors) + _, _, critic_loss = self.critic.train_tf( minibatch['states'], minibatch['actions'], y) diff --git a/rl/agent/dqn.py b/rl/agent/dqn.py index 26e3ae4..4ef05f9 100644 --- a/rl/agent/dqn.py +++ b/rl/agent/dqn.py @@ -194,6 +194,8 @@ def train_an_epoch(self): loss = self.model.train_on_batch(minibatch['states'], Q_targets) errors = abs(np.sum(Q_states - Q_targets, axis=1)) + assert Q_targets.shape == (self.batch_size, 1) + assert errors.shape == (self.batch_size, ) self.memory.update(errors) return loss diff --git a/rl/memory/prioritized_exp_replay.py b/rl/memory/prioritized_exp_replay.py index d6e8320..41fcde6 100644 --- a/rl/memory/prioritized_exp_replay.py +++ b/rl/memory/prioritized_exp_replay.py @@ -27,21 +27,18 @@ def __init__(self, env_spec, max_mem_len=10000, e=0.01, alpha=0.6, self.prio_tree = SumTree(self.max_mem_len) self.head = 0 - # bump to account for negative terms in reward get_priority - # and we cannot abs(reward) cuz it's sign sensitive - SOLVED_MEAN_REWARD = self.env_spec['problem']['SOLVED_MEAN_REWARD'] or 10000 - self.min_priority = abs(10 * SOLVED_MEAN_REWARD) - def get_priority(self, error): # add min_priority to prevent root of negative = complex - p = (self.min_priority + error + self.e) ** self.alpha + p = (error + self.e) ** self.alpha assert not np.isnan(p) return p def add_exp(self, action, reward, next_state, terminal): '''Round robin memory updating''' - # roughly the error between estimated Q and true q is the reward - error = reward + # init error to reward first, update later + error = abs(reward) + p = self.get_priority(error) + if self.size() < self.max_mem_len: # add as usual super(PrioritizedExperienceReplay, self).add_exp( action, reward, next_state, terminal) @@ -59,7 +56,6 @@ def add_exp(self, action, reward, next_state, terminal): if self.head >= self.max_mem_len: self.head = 0 # reset for round robin - p = self.get_priority(error) self.prio_tree.add(p) assert self.head == self.prio_tree.head, 'prio_tree head is wrong' From 22c1a2d718526874ed1ed1c2dedc34bf99f8e26a Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 17:46:35 -0400 Subject: [PATCH 04/33] add walker ddpg per --- rl/spec/box2d_experiment_specs.json | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index 2ed58f0..638a689 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -327,5 +327,37 @@ [800, 400, 200] ] } + }, + "walker_ddpg_linearnoise_per": { + "problem": "BipedalWalker-v2", + "Agent": "DDPG", + "HyperOptimizer": "GridSearch", + "Memory": "PrioritizedExperienceReplay", + "Optimizer": "AdamOptimizer", + "Policy": "LinearNoisePolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "batch_size": 64, + "n_epoch": 1, + "tau": 0.005, + "lr": 0.001, + "critic_lr": 0.001, + "exploration_anneal_episodes": 100, + "gamma": 0.97, + "hidden_layers": [400, 300], + "hidden_layers_activation": "relu", + "output_layer_activation": "tanh", + "max_mem_len": 100000 + }, + "param_range": { + "lr": [0.0001, 0.0005, 0.001], + "critic_lr": [0.001, 0.005, 0.01], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [400, 300], + [600, 300], + [800, 400, 200] + ] + } } } From fc40e8833b99ef387cd7d6316a5fbe66d0efadb5 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 22:26:05 -0400 Subject: [PATCH 05/33] size down per --- rl/spec/box2d_experiment_specs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index 638a689..c8e0d3e 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -347,7 +347,7 @@ "hidden_layers": [400, 300], "hidden_layers_activation": "relu", "output_layer_activation": "tanh", - "max_mem_len": 100000 + "max_mem_len": 10000 }, "param_range": { "lr": [0.0001, 0.0005, 0.001], From a352c2e53a11bd8ac8d4e443c3bff9635005a50d Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 22:28:04 -0400 Subject: [PATCH 06/33] narrow down params --- rl/spec/box2d_experiment_specs.json | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index c8e0d3e..bbcc3ca 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -340,23 +340,22 @@ "batch_size": 64, "n_epoch": 1, "tau": 0.005, - "lr": 0.001, + "lr": 0.0005, "critic_lr": 0.001, - "exploration_anneal_episodes": 100, "gamma": 0.97, - "hidden_layers": [400, 300], + "hidden_layers": [400, 200], "hidden_layers_activation": "relu", "output_layer_activation": "tanh", "max_mem_len": 10000 }, "param_range": { - "lr": [0.0001, 0.0005, 0.001], - "critic_lr": [0.001, 0.005, 0.01], - "gamma": [0.97, 0.99, 0.999], + "lr": [0.0001, 0.0005], + "critic_lr": [0.001, 0.005], + "gamma": [0.95, 0.97, 0.99], "hidden_layers": [ + [200, 100], [400, 300], - [600, 300], - [800, 400, 200] + [800, 400] ] } } From 9cd6f61e778151b1f0481409435ed79a8563ab85 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:04:47 -0400 Subject: [PATCH 07/33] per for dqn v1 --- rl/spec/classic_experiment_specs.json | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 4bb988f..a35d22e 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -410,6 +410,31 @@ ] } }, + "dqn_per_v1": { + "problem": "CartPole-v1", + "Agent": "DQN", + "HyperOptimizer": "GridSearch", + "Memory": "PrioritizedExperienceReplay", + "Optimizer": "AdamOptimizer", + "Policy": "BoltzmannPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "exploration_anneal_episodes": 10, + "gamma": 0.999, + "hidden_layers": [128], + "hidden_layers_activation": "sigmoid", + "lr": 0.005 + }, + "param_range": { + "max_mem_len": [5000, 10000, 20000], + "alpha": [0.6, 0.8, 1.0], + "gamma": [0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, "rand_dqn_v1": { "problem": "CartPole-v1", "Agent": "DQN", From eeba6df3249b617bcfc5c269026fd8c5a963cbb5 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:11:58 -0400 Subject: [PATCH 08/33] fix and generalize shape assert --- rl/agent/actor_critic.py | 3 ++- rl/agent/ddpg.py | 3 ++- rl/agent/dqn.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py index c009f03..aff460e 100644 --- a/rl/agent/actor_critic.py +++ b/rl/agent/actor_critic.py @@ -116,7 +116,8 @@ def train_critic(self, minibatch): # update memory, needed for PER errors = abs(np.sum(Q_vals - Q_targets, axis=1)) - assert Q_targets.shape == (self.batch_size, 1) + assert Q_targets.shape == ( + self.batch_size, self.env_spec['action_dim']) assert errors.shape == (self.batch_size, ) self.memory.update(errors) return loss, actor_delta diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py index f779d1d..5a6db4f 100644 --- a/rl/agent/ddpg.py +++ b/rl/agent/ddpg.py @@ -253,7 +253,8 @@ def train_an_epoch(self): # update memory, needed for PER errors = abs(np.sum(q_val - y, axis=1)) - assert y.shape == (self.batch_size, 1) + assert Q_targets.shape == ( + self.batch_size, self.env_spec['action_dim']) assert errors.shape == (self.batch_size, ) self.memory.update(errors) diff --git a/rl/agent/dqn.py b/rl/agent/dqn.py index 4ef05f9..d407f75 100644 --- a/rl/agent/dqn.py +++ b/rl/agent/dqn.py @@ -194,7 +194,8 @@ def train_an_epoch(self): loss = self.model.train_on_batch(minibatch['states'], Q_targets) errors = abs(np.sum(Q_states - Q_targets, axis=1)) - assert Q_targets.shape == (self.batch_size, 1) + assert Q_targets.shape == ( + self.batch_size, self.env_spec['action_dim']) assert errors.shape == (self.batch_size, ) self.memory.update(errors) return loss From 4156956fe5a37fa440b7b61089fd8d8d5557f0bd Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:18:41 -0400 Subject: [PATCH 09/33] fix assert in shape --- rl/agent/ddpg.py | 3 +-- rl/agent/deep_sarsa.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py index 5a6db4f..401a7e7 100644 --- a/rl/agent/ddpg.py +++ b/rl/agent/ddpg.py @@ -253,8 +253,7 @@ def train_an_epoch(self): # update memory, needed for PER errors = abs(np.sum(q_val - y, axis=1)) - assert Q_targets.shape == ( - self.batch_size, self.env_spec['action_dim']) + assert y.shape == (self.batch_size, self.env_spec['action_dim']) assert errors.shape == (self.batch_size, ) self.memory.update(errors) diff --git a/rl/agent/deep_sarsa.py b/rl/agent/deep_sarsa.py index a535050..9048bbe 100644 --- a/rl/agent/deep_sarsa.py +++ b/rl/agent/deep_sarsa.py @@ -30,4 +30,10 @@ def train_an_epoch(self): Q_targets = self.compute_Q_targets( minibatch, Q_states, Q_next_states_selected) loss = self.model.train_on_batch(minibatch['states'], Q_targets) + + errors = abs(np.sum(Q_states - Q_targets, axis=1)) + assert Q_targets.shape == ( + self.batch_size, self.env_spec['action_dim']) + assert errors.shape == (self.batch_size, ) + self.memory.update(errors) return loss From 186cc0856aba0d292540ba9f7077cbabb6005b90 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:35:27 -0400 Subject: [PATCH 10/33] remove offset in botlzman qstate --- rl/policy/boltzmann.py | 2 -- rl/spec/classic_experiment_specs.json | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/rl/policy/boltzmann.py b/rl/policy/boltzmann.py index ef9069b..f84bdcb 100644 --- a/rl/policy/boltzmann.py +++ b/rl/policy/boltzmann.py @@ -27,7 +27,6 @@ def select_action(self, state): Q_state = agent.model.predict(state)[0] # extract from batch predict assert Q_state.ndim == 1 Q_state = Q_state.astype('float32') # fix precision nan issue - Q_state = Q_state - np.amax(Q_state) # prevent overflow exp_values = np.exp( np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() @@ -67,7 +66,6 @@ def select_action(self, state): Q_state = Q_state1 + Q_state2 assert Q_state.ndim == 1 Q_state = Q_state.astype('float32') # fix precision nan issue - Q_state = Q_state - np.amax(Q_state) # prevent overflow exp_values = np.exp( np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index a35d22e..bdffffb 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -427,7 +427,7 @@ }, "param_range": { "max_mem_len": [5000, 10000, 20000], - "alpha": [0.6, 0.8, 1.0], + "alpha": [0.0, 0.6, 0.8, 1.0], "gamma": [0.99, 0.999], "hidden_layers": [ [64], From 13f6f6eda209aace291a76f1b07790755403fc70 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:36:23 -0400 Subject: [PATCH 11/33] import np in sarsa --- rl/agent/deep_sarsa.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rl/agent/deep_sarsa.py b/rl/agent/deep_sarsa.py index 9048bbe..f6ddd09 100644 --- a/rl/agent/deep_sarsa.py +++ b/rl/agent/deep_sarsa.py @@ -1,3 +1,4 @@ +import numpy as np from rl.agent.dqn import DQN From d2d2a9c7dde22b651a60c94ecd9038c6988bbb11 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:49:06 -0400 Subject: [PATCH 12/33] fix critic assert dim --- rl/agent/actor_critic.py | 4 ++-- rl/policy/actor_critic.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py index aff460e..7c3ae86 100644 --- a/rl/agent/actor_critic.py +++ b/rl/agent/actor_critic.py @@ -116,8 +116,8 @@ def train_critic(self, minibatch): # update memory, needed for PER errors = abs(np.sum(Q_vals - Q_targets, axis=1)) - assert Q_targets.shape == ( - self.batch_size, self.env_spec['action_dim']) + # Q size is only 1, from critic + assert Q_targets.shape == (self.batch_size, 1) assert errors.shape == (self.batch_size, ) self.memory.update(errors) return loss, actor_delta diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index 900ac7d..dc4c489 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -48,7 +48,6 @@ def select_action(self, state): A_score = agent.actor.predict(state)[0] # extract from batch predict assert A_score.ndim == 1 A_score = A_score.astype('float32') # fix precision nan issue - A_score = A_score - np.amax(A_score) # prevent overflow exp_values = np.exp( np.clip(A_score, -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() From e67cced87e07a3bc56a05be206d35c9e32c274e2 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:52:34 -0400 Subject: [PATCH 13/33] clipval for boltzmann at 200 --- rl/policy/boltzmann.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl/policy/boltzmann.py b/rl/policy/boltzmann.py index f84bdcb..ebfb305 100644 --- a/rl/policy/boltzmann.py +++ b/rl/policy/boltzmann.py @@ -18,7 +18,7 @@ def __init__(self, env_spec, self.final_tau = final_tau self.tau = self.init_tau self.exploration_anneal_episodes = exploration_anneal_episodes - self.clip_val = 500 + self.clip_val = 200 log_self(self) def select_action(self, state): From 77311ef7874c6bd8a1e24a6ac3e04718dacecd6b Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 25 Apr 2017 23:55:11 -0400 Subject: [PATCH 14/33] guard overflow again --- rl/policy/boltzmann.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl/policy/boltzmann.py b/rl/policy/boltzmann.py index ebfb305..e0236dc 100644 --- a/rl/policy/boltzmann.py +++ b/rl/policy/boltzmann.py @@ -67,7 +67,7 @@ def select_action(self, state): assert Q_state.ndim == 1 Q_state = Q_state.astype('float32') # fix precision nan issue exp_values = np.exp( - np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) + np.clip(Q_state / float(self.tau), -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() probs = np.array(exp_values / np.sum(exp_values)) probs /= probs.sum() # renormalize to prevent floating pt error From ef40317451baa0603a6a379e481433e73a16a98a Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 00:00:06 -0400 Subject: [PATCH 15/33] restore underflow fix --- rl/policy/actor_critic.py | 1 + rl/policy/boltzmann.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index dc4c489..900ac7d 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -48,6 +48,7 @@ def select_action(self, state): A_score = agent.actor.predict(state)[0] # extract from batch predict assert A_score.ndim == 1 A_score = A_score.astype('float32') # fix precision nan issue + A_score = A_score - np.amax(A_score) # prevent overflow exp_values = np.exp( np.clip(A_score, -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() diff --git a/rl/policy/boltzmann.py b/rl/policy/boltzmann.py index e0236dc..04a763c 100644 --- a/rl/policy/boltzmann.py +++ b/rl/policy/boltzmann.py @@ -27,8 +27,9 @@ def select_action(self, state): Q_state = agent.model.predict(state)[0] # extract from batch predict assert Q_state.ndim == 1 Q_state = Q_state.astype('float32') # fix precision nan issue + Q_state = Q_state - np.amax(Q_state) # prevent overflow exp_values = np.exp( - np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) + np.clip(Q_state / float(self.tau), -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() probs = np.array(exp_values / np.sum(exp_values)) probs /= probs.sum() # renormalize to prevent floating pt error @@ -66,6 +67,7 @@ def select_action(self, state): Q_state = Q_state1 + Q_state2 assert Q_state.ndim == 1 Q_state = Q_state.astype('float32') # fix precision nan issue + Q_state = Q_state - np.amax(Q_state) # prevent overflow exp_values = np.exp( np.clip(Q_state / float(self.tau), -self.clip_val, self.clip_val)) assert not np.isnan(exp_values).any() From 41f30a9728c581efef54eab1689280361e7dd3f4 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 00:51:03 -0400 Subject: [PATCH 16/33] minor refactor --- rl/agent/dqn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rl/agent/dqn.py b/rl/agent/dqn.py index d407f75..4810e9c 100644 --- a/rl/agent/dqn.py +++ b/rl/agent/dqn.py @@ -190,7 +190,6 @@ def train_an_epoch(self): minibatch) Q_targets = self.compute_Q_targets( minibatch, Q_states, Q_next_states_max) - loss = self.model.train_on_batch(minibatch['states'], Q_targets) errors = abs(np.sum(Q_states - Q_targets, axis=1)) From f4cc42847f546b12451db25fea29a7d37c86ca6e Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 00:59:31 -0400 Subject: [PATCH 17/33] clear out unused specs --- rl/spec/box2d_experiment_specs.json | 31 +--- rl/spec/classic_experiment_specs.json | 246 ++++---------------------- 2 files changed, 39 insertions(+), 238 deletions(-) diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index bbcc3ca..aca9cbb 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -97,35 +97,6 @@ ] } }, - "lunar_double_dqn_per": { - "problem": "LunarLander-v2", - "Agent": "DoubleDQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "DoubleDQNBoltzmannPolicy", - "PreProcessor": "StackStates", - "param": { - "train_per_n_new_exp": 2, - "lr": 0.005, - "gamma": 0.99, - "hidden_layers": [800, 400], - "hidden_layers_activation": "sigmoid", - "output_layer_activation": "linear", - "exploration_anneal_episodes": 150, - "epi_change_lr": 200, - "max_mem_len": 30000 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [400, 200], - [800, 400], - [400, 200, 100] - ] - } - }, "lunar_double_dqn_nopreprocess": { "problem": "LunarLander-v2", "Agent": "DoubleDQN", @@ -328,7 +299,7 @@ ] } }, - "walker_ddpg_linearnoise_per": { + "walker_ddpg_per_linearnoise": { "problem": "BipedalWalker-v2", "Agent": "DDPG", "HyperOptimizer": "GridSearch", diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index bdffffb..2d0fcad 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -76,33 +76,6 @@ ] } }, - "dqn_per": { - "problem": "CartPole-v0", - "Agent": "DQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "BoltzmannPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.95, 0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [16, 8], - [32, 16] - ] - } - }, "rand_dqn": { "problem": "CartPole-v0", "Agent": "DQN", @@ -165,35 +138,6 @@ ] } }, - "double_dqn_per": { - "problem": "CartPole-v0", - "Agent": "DoubleDQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "DoubleDQNBoltzmannPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10, - "e": 0.01, - "alpha": 0.5, - "max_mem_len": 20000 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [32, 16] - ] - } - }, "sarsa": { "problem": "CartPole-v0", "Agent": "DeepSarsa", @@ -222,68 +166,6 @@ ] } }, - "sarsa_epsilon": { - "problem": "CartPole-v0", - "Agent": "DeepSarsa", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "EpsilonGreedyPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 50, - "max_mem_len": 50000 - }, - "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [16, 8], - [200, 100] - ] - } - }, - "rand_sarsa": { - "problem": "CartPole-v0", - "Agent": "DeepSarsa", - "HyperOptimizer": "RandomSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "BoltzmannPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "max_evals": 50, - "lr": 0.01, - "gamma": 0.99, - "hidden_layers": [32], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 - }, - "param_range": { - "lr": { - "min": 0.0005, - "max": 0.05 - }, - "gamma": { - "min": 0.95, - "max": 0.999 - }, - "hidden_layers": [ - [16], - [32], - [64], - [16, 8], - [32, 16] - ] - } - }, "exp_sarsa": { "problem": "CartPole-v0", "Agent": "DeepExpectedSarsa", @@ -497,32 +379,6 @@ ] } }, - "double_dqn_per_v1": { - "problem": "CartPole-v1", - "Agent": "DoubleDQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "DoubleDQNBoltzmannPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.999, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [16], - [32], - [64], - [32, 16] - ] - } - }, "offpol_sarsa_v1": { "problem": "CartPole-v1", "Agent": "OffPolicySarsa", @@ -627,32 +483,6 @@ ] } }, - "acrobot_double_dqn_per": { - "problem": "Acrobot-v1", - "Agent": "DoubleDQN", - "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", - "Optimizer": "AdamOptimizer", - "Policy": "DoubleDQNBoltzmannPolicy", - "PreProcessor": "StackStates", - "param": { - "train_per_n_new_exp": 1, - "lr": 0.01, - "gamma": 0.99, - "hidden_layers": [32], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 200 - }, - "param_range": { - "lr": [0.001, 0.005, 0.01], - "gamma": [0.97, 0.99, 0.999], - "hidden_layers": [ - [200], - [200, 100], - [400, 200, 100] - ] - } - }, "acrobot_offpol_sarsa": { "problem": "Acrobot-v1", "Agent": "OffPolicySarsa", @@ -882,26 +712,25 @@ ] } }, - "pendulum_ddpg_per": { + "pendulum_ddpg_linearnoise": { "problem": "Pendulum-v0", "Agent": "DDPG", "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", + "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "NoNoisePolicy", + "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, - "tau": 0.001, - "lr": 0.001, - "critic_lr": 0.001, + "tau": 0.005, + "lr": 0.0001, + "critic_lr": 0.005, "exploration_anneal_episodes": 50, "gamma": 0.97, "hidden_layers": [400, 300], "hidden_layers_activation": "relu", - "output_layer_activation": "tanh", - "max_mem_len": 30000 + "output_layer_activation": "tanh" }, "param_range": { "lr": [0.0001, 0.0005, 0.001], @@ -914,53 +743,51 @@ ] } }, - "pendulum_ddpg_per_linearnoise": { + "pendulum_ddpg_ounoise": { "problem": "Pendulum-v0", "Agent": "DDPG", "HyperOptimizer": "GridSearch", - "Memory": "PrioritizedExperienceReplay", + "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "LinearNoisePolicy", + "Policy": "NoNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, "tau": 0.005, - "lr": 0.0005, + "lr": 0.001, "critic_lr": 0.001, - "exploration_anneal_episodes": 100, + "exploration_anneal_episodes": 50, "gamma": 0.97, - "hidden_layers": [400, 200], + "hidden_layers": [400, 300], "hidden_layers_activation": "relu", - "output_layer_activation": "tanh", - "max_mem_len": 30000 + "output_layer_activation": "tanh" }, "param_range": { - "lr": [0.0001, 0.0005], + "lr": [0.0001, 0.0005, 0.001], "critic_lr": [0.001, 0.005], "gamma": [0.95, 0.97, 0.99], "hidden_layers": [ - [200, 100], - [400, 200], [400, 300], - [800, 400] + [800, 400, 200], + [800, 600, 400, 200] ] } }, - "pendulum_ddpg_linearnoise": { + "pendulum_ddpg_gaussiannoise": { "problem": "Pendulum-v0", "Agent": "DDPG", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "LinearNoisePolicy", + "Policy": "GaussianWhiteNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, "tau": 0.005, - "lr": 0.0001, - "critic_lr": 0.005, + "lr": 0.001, + "critic_lr": 0.001, "exploration_anneal_episodes": 50, "gamma": 0.97, "hidden_layers": [400, 300], @@ -978,25 +805,26 @@ ] } }, - "pendulum_ddpg_ounoise": { + "pendulum_ddpg_per": { "problem": "Pendulum-v0", "Agent": "DDPG", "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", + "Memory": "PrioritizedExperienceReplay", "Optimizer": "AdamOptimizer", "Policy": "NoNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, - "tau": 0.005, + "tau": 0.001, "lr": 0.001, "critic_lr": 0.001, "exploration_anneal_episodes": 50, "gamma": 0.97, "hidden_layers": [400, 300], "hidden_layers_activation": "relu", - "output_layer_activation": "tanh" + "output_layer_activation": "tanh", + "max_mem_len": 30000 }, "param_range": { "lr": [0.0001, 0.0005, 0.001], @@ -1009,34 +837,36 @@ ] } }, - "pendulum_ddpg_gaussiannoise": { + "pendulum_ddpg_per_linearnoise": { "problem": "Pendulum-v0", "Agent": "DDPG", "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", + "Memory": "PrioritizedExperienceReplay", "Optimizer": "AdamOptimizer", - "Policy": "GaussianWhiteNoisePolicy", + "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, "tau": 0.005, - "lr": 0.001, + "lr": 0.0005, "critic_lr": 0.001, - "exploration_anneal_episodes": 50, + "exploration_anneal_episodes": 100, "gamma": 0.97, - "hidden_layers": [400, 300], + "hidden_layers": [400, 200], "hidden_layers_activation": "relu", - "output_layer_activation": "tanh" + "output_layer_activation": "tanh", + "max_mem_len": 30000 }, "param_range": { - "lr": [0.0001, 0.0005, 0.001], + "lr": [0.0001, 0.0005], "critic_lr": [0.001, 0.005], "gamma": [0.95, 0.97, 0.99], "hidden_layers": [ + [200, 100], + [400, 200], [400, 300], - [800, 400, 200], - [800, 600, 400, 200] + [800, 400] ] } }, From 43d89edec48fb47f0ed3b7a2a0a3c58bdd144126 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 01:01:58 -0400 Subject: [PATCH 18/33] drop index col from csv --- rl/analytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl/analytics.py b/rl/analytics.py index 57d654a..78a4754 100644 --- a/rl/analytics.py +++ b/rl/analytics.py @@ -459,7 +459,7 @@ def analyze_data(experiment_data_or_experiment_id): data_df.sort_values( ['fitness_score'], ascending=False, inplace=True) - data_df.reset_index(inplace=True) + data_df.reset_index(drop=True, inplace=True) trial_id = experiment_data[0]['trial_id'] save_experiment_data(data_df, trial_id) From 7d5e6921317f699f6a448e1e50a29285a1ebfba0 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 01:30:20 -0400 Subject: [PATCH 19/33] mute per test --- test/test_basic.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/test_basic.py b/test/test_basic.py index 4cac079..c26d7d9 100644 --- a/test/test_basic.py +++ b/test/test_basic.py @@ -52,11 +52,6 @@ def test_dqn(cls): data_df = run('dqn') assert isinstance(data_df, pd.DataFrame) - @classmethod - def test_dqn(cls): - data_df = run('double_dqn_per') - assert isinstance(data_df, pd.DataFrame) - @classmethod def test_double_dqn(cls): data_df = run('double_dqn') From 668729a621fdec179a18b022975f09bb7e4d0cc8 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 07:41:58 -0400 Subject: [PATCH 20/33] fix sarsa test --- test/test_advanced.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_advanced.py b/test/test_advanced.py index 7407e56..7bfe161 100644 --- a/test/test_advanced.py +++ b/test/test_advanced.py @@ -10,7 +10,7 @@ class AdvancedTest(unittest.TestCase): @classmethod def test_sarsa(cls): - data_df = run('rand_sarsa') + data_df = run('sarsa') assert isinstance(data_df, pd.DataFrame) @classmethod From d7f5cecde45e43b5f3f3d66fbe98b9b815ef1e46 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 08:06:37 -0400 Subject: [PATCH 21/33] boltzman fix overflow by np float64; remove offset minus --- rl/memory/prioritized_exp_replay.py | 2 +- rl/policy/actor_critic.py | 7 +++---- rl/policy/boltzmann.py | 16 +++++++--------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/rl/memory/prioritized_exp_replay.py b/rl/memory/prioritized_exp_replay.py index 41fcde6..343ef95 100644 --- a/rl/memory/prioritized_exp_replay.py +++ b/rl/memory/prioritized_exp_replay.py @@ -30,7 +30,7 @@ def __init__(self, env_spec, max_mem_len=10000, e=0.01, alpha=0.6, def get_priority(self, error): # add min_priority to prevent root of negative = complex p = (error + self.e) ** self.alpha - assert not np.isnan(p) + assert np.isfinite(p) return p def add_exp(self, action, reward, next_state, terminal): diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index 900ac7d..7ba2d4a 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -39,7 +39,7 @@ class SoftmaxPolicy(Policy): def __init__(self, env_spec, **kwargs): # absorb generic param without breaking super(SoftmaxPolicy, self).__init__(env_spec) - self.clip_val = 500 + self.clip_val = 500. log_self(self) def select_action(self, state): @@ -47,11 +47,10 @@ def select_action(self, state): state = np.expand_dims(state, axis=0) A_score = agent.actor.predict(state)[0] # extract from batch predict assert A_score.ndim == 1 - A_score = A_score.astype('float32') # fix precision nan issue - A_score = A_score - np.amax(A_score) # prevent overflow + A_score = A_score.astype('float64') # fix precision overflow exp_values = np.exp( np.clip(A_score, -self.clip_val, self.clip_val)) - assert not np.isnan(exp_values).any() + assert np.isfinite(exp_values).all() probs = np.array(exp_values / np.sum(exp_values)) probs /= probs.sum() # renormalize to prevent floating pt error action = np.random.choice(agent.env_spec['actions'], p=probs) diff --git a/rl/policy/boltzmann.py b/rl/policy/boltzmann.py index 04a763c..0fdd53f 100644 --- a/rl/policy/boltzmann.py +++ b/rl/policy/boltzmann.py @@ -18,7 +18,7 @@ def __init__(self, env_spec, self.final_tau = final_tau self.tau = self.init_tau self.exploration_anneal_episodes = exploration_anneal_episodes - self.clip_val = 200 + self.clip_val = 500. log_self(self) def select_action(self, state): @@ -26,11 +26,10 @@ def select_action(self, state): state = np.expand_dims(state, axis=0) Q_state = agent.model.predict(state)[0] # extract from batch predict assert Q_state.ndim == 1 - Q_state = Q_state.astype('float32') # fix precision nan issue - Q_state = Q_state - np.amax(Q_state) # prevent overflow + Q_state = Q_state.astype('float64') # fix precision overflow exp_values = np.exp( - np.clip(Q_state / float(self.tau), -self.clip_val, self.clip_val)) - assert not np.isnan(exp_values).any() + np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) + assert np.isfinite(exp_values).all() probs = np.array(exp_values / np.sum(exp_values)) probs /= probs.sum() # renormalize to prevent floating pt error action = np.random.choice(agent.env_spec['actions'], p=probs) @@ -66,11 +65,10 @@ def select_action(self, state): Q_state2 = agent.model_2.predict(state)[0] Q_state = Q_state1 + Q_state2 assert Q_state.ndim == 1 - Q_state = Q_state.astype('float32') # fix precision nan issue - Q_state = Q_state - np.amax(Q_state) # prevent overflow + Q_state = Q_state.astype('float64') # fix precision overflow exp_values = np.exp( - np.clip(Q_state / float(self.tau), -self.clip_val, self.clip_val)) - assert not np.isnan(exp_values).any() + np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) + assert np.isfinite(exp_values).all() probs = np.array(exp_values / np.sum(exp_values)) probs /= probs.sum() # renormalize to prevent floating pt error action = np.random.choice(agent.env_spec['actions'], p=probs) From efa048e92eee25c3feef609d392079c2a017758d Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 08:14:54 -0400 Subject: [PATCH 22/33] schedule mountain dqn per --- rl/spec/classic_experiment_specs.json | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 2d0fcad..edfee38 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -933,7 +933,7 @@ "Policy": "BoltzmannPolicy", "PreProcessor": "NoPreProcessor", "param": { - "batch_size": 32, + "batch_size": 64, "lr": 0.001, "gamma": 0.99, "hidden_layers": [128, 64], @@ -941,14 +941,15 @@ "output_layer_activation": "linear", "exploration_anneal_episodes": 50, "epi_change_lr": 150, - "max_mem_len": 50000 + "max_mem_len": 30000 }, "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05], + "lr": [0.001, 0.005, 0.01], "gamma": [0.99, 0.999], "hidden_layers": [ [400], - [800] + [800], + [1200] ] } }, From 87812323e410ca5f8e823f2dd91cc69af3343c70 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 08:31:15 -0400 Subject: [PATCH 23/33] auto memlen for PER as 1/3 epi * timestep --- rl/memory/prioritized_exp_replay.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rl/memory/prioritized_exp_replay.py b/rl/memory/prioritized_exp_replay.py index 343ef95..476bbe0 100644 --- a/rl/memory/prioritized_exp_replay.py +++ b/rl/memory/prioritized_exp_replay.py @@ -12,8 +12,13 @@ class PrioritizedExperienceReplay(LinearMemoryWithForgetting): memory unit ''' - def __init__(self, env_spec, max_mem_len=10000, e=0.01, alpha=0.6, + def __init__(self, env_spec, max_mem_len=None, e=0.01, alpha=0.6, **kwargs): + if max_mem_len is None: # auto calculate mem len + max_timestep = env_spec['timestep_limit'] + max_epis = env_spec['problem']['MAX_EPISODES'] + memory_epi = np.ceil(max_epis / 3.).astype(int) + max_mem_len = max(10**6, max_timestep * memory_epi) super(PrioritizedExperienceReplay, self).__init__( env_spec, max_mem_len) self.exp_keys.append('error') From 071e13f007d6b80585692c85a8b422b05409e98d Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 08:49:20 -0400 Subject: [PATCH 24/33] auto mem len for walker, use PER for lunar --- rl/spec/box2d_experiment_specs.json | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index aca9cbb..6b8ce94 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -237,11 +237,11 @@ ] } }, - "lunar_ddpg_linearnoise": { + "lunar_cont_ddpg_per_linearnoise": { "problem": "LunarLanderContinuous-v2", "Agent": "DDPG", "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", + "Memory": "PrioritizedExperienceReplay", "Optimizer": "AdamOptimizer", "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", @@ -316,8 +316,7 @@ "gamma": 0.97, "hidden_layers": [400, 200], "hidden_layers_activation": "relu", - "output_layer_activation": "tanh", - "max_mem_len": 10000 + "output_layer_activation": "tanh" }, "param_range": { "lr": [0.0001, 0.0005], From 04578458b64eed6e2305647a9f4e66657d2e57bb Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 18:19:28 -0400 Subject: [PATCH 25/33] fix assert size for ddpg --- rl/agent/ddpg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py index 401a7e7..eece44e 100644 --- a/rl/agent/ddpg.py +++ b/rl/agent/ddpg.py @@ -253,7 +253,8 @@ def train_an_epoch(self): # update memory, needed for PER errors = abs(np.sum(q_val - y, axis=1)) - assert y.shape == (self.batch_size, self.env_spec['action_dim']) + # Q size is only 1, from critic + assert y.shape == (self.batch_size, 1) assert errors.shape == (self.batch_size, ) self.memory.update(errors) From 2ea4b34fc0012dfceef0eadab0d17231af5c2246 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 18:29:22 -0400 Subject: [PATCH 26/33] reorganize tests --- test/test_atari.py | 23 +++++++++ test/test_box2d.py | 24 ++++++++++ test/{test_advanced.py => test_classic.py} | 56 ++++++++++++---------- test/{test_basic.py => test_dev.py} | 12 +---- 4 files changed, 79 insertions(+), 36 deletions(-) create mode 100644 test/test_atari.py create mode 100644 test/test_box2d.py rename test/{test_advanced.py => test_classic.py} (67%) rename test/{test_basic.py => test_dev.py} (84%) diff --git a/test/test_atari.py b/test/test_atari.py new file mode 100644 index 0000000..6b90674 --- /dev/null +++ b/test/test_atari.py @@ -0,0 +1,23 @@ +import unittest +import pytest +from os import environ +from rl.experiment import run +from . import conftest +import pandas as pd + + +class AtariTest(unittest.TestCase): + + @unittest.skipIf(environ.get('CI'), + "Delay CI test until dev stable") + @classmethod + def test_breakout_dqn(cls): + data_df = run('breakout_dqn') + assert isinstance(data_df, pd.DataFrame) + + @unittest.skipIf(environ.get('CI'), + "Delay CI test until dev stable") + @classmethod + def test_breakout_double_dqn(cls): + data_df = run('breakout_double_dqn') + assert isinstance(data_df, pd.DataFrame) diff --git a/test/test_box2d.py b/test/test_box2d.py new file mode 100644 index 0000000..f3b33cf --- /dev/null +++ b/test/test_box2d.py @@ -0,0 +1,24 @@ +import unittest +import pytest +from os import environ +from rl.experiment import run +from . import conftest +import pandas as pd + + +class Box2DTest(unittest.TestCase): + + @classmethod + def test_lunar_dqn(cls): + data_df = run('lunar_dqn') + assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_lunar_double_dqn(cls): + data_df = run('lunar_double_dqn') + assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_walker_ddpg_linearnoise(cls): + data_df = run('walker_ddpg_linearnoise') + assert isinstance(data_df, pd.DataFrame) diff --git a/test/test_advanced.py b/test/test_classic.py similarity index 67% rename from test/test_advanced.py rename to test/test_classic.py index 7bfe161..6f25e86 100644 --- a/test/test_advanced.py +++ b/test/test_classic.py @@ -6,58 +6,64 @@ import pandas as pd -class AdvancedTest(unittest.TestCase): +class ClassicTest(unittest.TestCase): @classmethod - def test_sarsa(cls): - data_df = run('sarsa') + def test_quickstart_dqn(cls): + data_df = run('quickstart_dqn') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_exp_sarsa(cls): - data_df = run('exp_sarsa') + def test_dqn_epsilon(cls): + data_df = run('dqn_epsilon') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_offpol_sarsa(cls): - data_df = run('offpol_sarsa') + def test_dqn(cls): + data_df = run('dqn') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_acrobot(cls): - data_df = run('acrobot') + def test_double_dqn(cls): + data_df = run('double_dqn') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_mountain_dqn(cls): - data_df = run('mountain_dqn') + def test_sarsa(cls): + data_df = run('sarsa') + assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_exp_sarsa(cls): + data_df = run('exp_sarsa') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_lunar_dqn(cls): - data_df = run('lunar_dqn') + def test_offpol_sarsa(cls): + data_df = run('offpol_sarsa') assert isinstance(data_df, pd.DataFrame) - @unittest.skipIf(environ.get('CI'), - "Delay CI test until dev stable") @classmethod - def test_breakout_dqn(cls): - data_df = run('breakout_dqn') + def test_cartpole_ac_argmax(cls): + data_df = run('cartpole_ac_argmax') assert isinstance(data_df, pd.DataFrame) - @unittest.skipIf(environ.get('CI'), - "Delay CI test until dev stable") @classmethod - def test_breakout_double_dqn(cls): - data_df = run('breakout_double_dqn') + def test_dqn_v1(cls): + data_df = run('dqn_v1') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_cartpole_ac_argmax(cls): - data_df = run('cartpole_ac_argmax') + def test_acrobot(cls): + data_df = run('acrobot') assert isinstance(data_df, pd.DataFrame) @classmethod - def test_pendulum_ddpg(cls): - data_df = run('pendulum_ddpg') + def test_pendulum_ddpg_linearnoise(cls): + data_df = run('pendulum_ddpg_linearnoise') + assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_mountain_dqn(cls): + data_df = run('mountain_dqn') assert isinstance(data_df, pd.DataFrame) diff --git a/test/test_basic.py b/test/test_dev.py similarity index 84% rename from test/test_basic.py rename to test/test_dev.py index c26d7d9..0d577d2 100644 --- a/test/test_basic.py +++ b/test/test_dev.py @@ -6,7 +6,7 @@ import pandas as pd -class BasicTest(unittest.TestCase): +class DevTest(unittest.TestCase): @classmethod def test_clean_import(cls): @@ -46,13 +46,3 @@ def test_dqn_pass(cls): # def test_dqn_random_search(cls): # data_df = run('test_dqn_random_search', param_selection=True) # assert isinstance(data_df, pd.DataFrame) - - @classmethod - def test_dqn(cls): - data_df = run('dqn') - assert isinstance(data_df, pd.DataFrame) - - @classmethod - def test_double_dqn(cls): - data_df = run('double_dqn') - assert isinstance(data_df, pd.DataFrame) From 21d85788b9cd23e2434ebd37ab12cd331690c857 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 18:47:45 -0400 Subject: [PATCH 27/33] add more tests --- rl/spec/classic_experiment_specs.json | 27 +++++++++++++++++++++++++++ test/test_atari.py | 4 ---- test/test_box2d.py | 5 +++++ test/test_classic.py | 5 +++++ 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index edfee38..20eb9d6 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -76,6 +76,33 @@ ] } }, + "dqn_per": { + "problem": "CartPole-v0", + "Agent": "DQN", + "HyperOptimizer": "GridSearch", + "Memory": "PrioritizedExperienceReplay", + "Optimizer": "AdamOptimizer", + "Policy": "BoltzmannPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid", + "exploration_anneal_episodes": 10 + }, + "param_range": { + "lr": [0.001, 0.005, 0.01, 0.02], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [16], + [32], + [64], + [16, 8], + [32, 16] + ] + } + }, "rand_dqn": { "problem": "CartPole-v0", "Agent": "DQN", diff --git a/test/test_atari.py b/test/test_atari.py index 6b90674..875bec9 100644 --- a/test/test_atari.py +++ b/test/test_atari.py @@ -8,15 +8,11 @@ class AtariTest(unittest.TestCase): - @unittest.skipIf(environ.get('CI'), - "Delay CI test until dev stable") @classmethod def test_breakout_dqn(cls): data_df = run('breakout_dqn') assert isinstance(data_df, pd.DataFrame) - @unittest.skipIf(environ.get('CI'), - "Delay CI test until dev stable") @classmethod def test_breakout_double_dqn(cls): data_df = run('breakout_double_dqn') diff --git a/test/test_box2d.py b/test/test_box2d.py index f3b33cf..db0ea41 100644 --- a/test/test_box2d.py +++ b/test/test_box2d.py @@ -18,6 +18,11 @@ def test_lunar_double_dqn(cls): data_df = run('lunar_double_dqn') assert isinstance(data_df, pd.DataFrame) + @classmethod + def test_lunar_freeze(cls): + data_df = run('lunar_freeze') + assert isinstance(data_df, pd.DataFrame) + @classmethod def test_walker_ddpg_linearnoise(cls): data_df = run('walker_ddpg_linearnoise') diff --git a/test/test_classic.py b/test/test_classic.py index 6f25e86..b395ac9 100644 --- a/test/test_classic.py +++ b/test/test_classic.py @@ -23,6 +23,11 @@ def test_dqn(cls): data_df = run('dqn') assert isinstance(data_df, pd.DataFrame) + @classmethod + def test_dqn_per(cls): + data_df = run('dqn_per') + assert isinstance(data_df, pd.DataFrame) + @classmethod def test_double_dqn(cls): data_df = run('double_dqn') From 96cfcbf0fcfa9df03eaf45a6f39f6046a05ce5a8 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 18:55:09 -0400 Subject: [PATCH 28/33] mute atari to speed up test --- test/test_atari.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_atari.py b/test/test_atari.py index 875bec9..0888757 100644 --- a/test/test_atari.py +++ b/test/test_atari.py @@ -8,11 +8,13 @@ class AtariTest(unittest.TestCase): + @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable") @classmethod def test_breakout_dqn(cls): data_df = run('breakout_dqn') assert isinstance(data_df, pd.DataFrame) + @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable") @classmethod def test_breakout_double_dqn(cls): data_df = run('breakout_double_dqn') From 193ab599c252dffaa0dd970e5b1deefe4c2803f4 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 21:51:16 -0400 Subject: [PATCH 29/33] guard continuous action range in policy --- rl/policy/actor_critic.py | 3 +++ rl/policy/noise.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index 7ba2d4a..89f15be 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -82,6 +82,9 @@ def select_action(self, state): a_mean = agent.actor.predict(state)[0] # extract from batch predict action = a_mean + np.random.normal( loc=0.0, scale=self.variance, size=a_mean.shape) + action = np.clip(action, + self.env_spec['action_bound_low'], + self.env_spec['action_bound_high']) return action def update(self, sys_vars): diff --git a/rl/policy/noise.py b/rl/policy/noise.py index fec9507..ea637d3 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -25,6 +25,9 @@ def select_action(self, state): state = np.expand_dims(state, axis=0) if self.env_spec['actions'] == 'continuous': action = agent.actor.predict(state)[0] + self.sample() + action = np.clip(action, + self.env_spec['action_bound_low'], + self.env_spec['action_bound_high']) else: Q_state = agent.actor.predict(state)[0] assert Q_state.ndim == 1 From 93dcb2d45eb8c2441311be0956ea7ca17981f8e5 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 21:59:41 -0400 Subject: [PATCH 30/33] add dqn_per to start per testing --- rl/spec/classic_experiment_specs.json | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 20eb9d6..8c45f46 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -92,14 +92,11 @@ "exploration_anneal_episodes": 10 }, "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.95, 0.97, 0.99, 0.999], + "max_mem_len": [2000, 5000, 10000], + "alpha": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], "hidden_layers": [ - [16], - [32], [64], - [16, 8], - [32, 16] + [128] ] } }, From 9d4ccd3d7e2945f56d01f0095471c3f034669248 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 22:16:11 -0400 Subject: [PATCH 31/33] add epsilonnoise policy --- rl/policy/noise.py | 21 ++++++++++++++++++ rl/spec/classic_experiment_specs.json | 31 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/rl/policy/noise.py b/rl/policy/noise.py index ea637d3..e1b695b 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -1,6 +1,7 @@ import numpy as np from rl.util import log_self from rl.policy.base_policy import Policy +from rl.policy.epsilon_greedy import EpsilonGreedyPolicy class NoNoisePolicy(Policy): @@ -63,6 +64,26 @@ def update(self, sys_vars): self.n_step = sys_vars['epi'] +class EpsilonGreedyNoisePolicy(EpsilonGreedyPolicy, NoNoisePolicy): + + ''' + akin to epsilon greedy decay, + but return random sample instead + ''' + + def sample(self): + if self.e > np.random.rand(): + noise = np.random.uniform( + 0.5 * self.env_spec['action_bound_low'], + 0.5 * self.env_spec['action_bound_high']) + else: + noise = 0 + return noise + + def select_action(self, state): + return NoNoisePolicy.select_action(self, state) + + class AnnealedGaussianPolicy(LinearNoisePolicy): ''' diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 8c45f46..0d423f2 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -736,6 +736,37 @@ ] } }, + "pendulum_ddpg_epsilonnoise": { + "problem": "Pendulum-v0", + "Agent": "DDPG", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "EpsilonGreedyNoisePolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "batch_size": 64, + "n_epoch": 1, + "tau": 0.005, + "lr": 0.001, + "critic_lr": 0.001, + "exploration_anneal_episodes": 50, + "gamma": 0.97, + "hidden_layers": [400, 300], + "hidden_layers_activation": "relu", + "output_layer_activation": "tanh" + }, + "param_range": { + "lr": [0.0001, 0.0005, 0.001], + "critic_lr": [0.001, 0.005], + "gamma": [0.95, 0.97, 0.99], + "hidden_layers": [ + [400, 300], + [800, 400, 200], + [800, 600, 400, 200] + ] + } + }, "pendulum_ddpg_linearnoise": { "problem": "Pendulum-v0", "Agent": "DDPG", From e9de40ec76fa7dc0f19c650bf322ca666f7455eb Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 22:34:47 -0400 Subject: [PATCH 32/33] fix analytics param sourcing --- rl/analytics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rl/analytics.py b/rl/analytics.py index 78a4754..1be9044 100644 --- a/rl/analytics.py +++ b/rl/analytics.py @@ -317,9 +317,10 @@ def compose_data(trial): } # param variables for independent vars of trials + default_param = trial.experiment_spec['param'] param_variables = { - pv: trial.experiment_spec['param'][pv] for - pv in trial.param_variables} + pv: default_param[pv] for + pv in trial.param_variables if pv in default_param} trial.data['metrics'].update(metrics) trial.data['param_variables'] = param_variables From d54676efc6f7dd2c9164850c2243b3818db6c2b2 Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 26 Apr 2017 22:39:40 -0400 Subject: [PATCH 33/33] use default mem_len for mountain per --- rl/spec/classic_experiment_specs.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 0d423f2..65e012f 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -995,8 +995,7 @@ "hidden_layers_activation": "sigmoid", "output_layer_activation": "linear", "exploration_anneal_episodes": 50, - "epi_change_lr": 150, - "max_mem_len": 30000 + "epi_change_lr": 150 }, "param_range": { "lr": [0.001, 0.005, 0.01],