From a665bf35d6ca79f8fcc836859d9801c98cb603a4 Mon Sep 17 00:00:00 2001 From: lgraesser Date: Sat, 8 Apr 2017 17:17:45 -0400 Subject: [PATCH 01/43] Working discrete actor critic model --- rl/agent/actor_critic.py | 138 ++++++++++++++++++++++++++ rl/experiment.py | 3 +- rl/policy/actor_critic.py | 113 +++++++++++++++++++++ rl/spec/classic_experiment_specs.json | 32 +++++- 4 files changed, 282 insertions(+), 4 deletions(-) create mode 100644 rl/agent/actor_critic.py create mode 100644 rl/policy/actor_critic.py diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py new file mode 100644 index 0000000..13eb3f1 --- /dev/null +++ b/rl/agent/actor_critic.py @@ -0,0 +1,138 @@ +import numpy as np +from rl.agent.dqn import DQN +from rl.util import logger, log_self + + +class ActorCritic(DQN): + + ''' + Actor Critic algorithm. The actor's policy + is adjusted in the direction that will lead to + better actions, guided by the critic + Implementation adapted from + http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html + + Assumes one of the policies in actor_critic.py are being used + ''' + + def __init__(self, env_spec, + train_per_n_new_exp=1, + gamma=0.95, lr=0.1, + epi_change_lr=None, + batch_size=16, n_epoch=5, hidden_layers=None, + hidden_layers_activation='sigmoid', + output_layer_activation='linear', + auto_architecture=False, + num_hidden_layers=3, + first_hidden_layer_size=256, + num_initial_channels=16, + **kwargs): # absorb generic param without breaking + # import only when needed to contain side-effects + from keras.layers.core import Dense + from keras.models import Sequential, load_model + self.Dense = Dense + self.Sequential = Sequential + self.load_model = load_model + + super(ActorCritic, self).__init__(env_spec, + train_per_n_new_exp, + gamma, lr, + epi_change_lr, + batch_size, n_epoch, hidden_layers, + hidden_layers_activation, + output_layer_activation, + auto_architecture, + num_hidden_layers, + first_hidden_layer_size, + num_initial_channels, + **kwargs) + + def build_model(self): + self.build_actor() + self.build_critic() + logger.info("Actor and critic models built") + + def build_actor(self): + actor = self.Sequential() + super(ActorCritic, self).build_hidden_layers(actor) + actor.add(self.Dense(self.env_spec['action_dim'], + init='lecun_uniform', + activation=self.output_layer_activation)) + logger.info("Actor summary") + actor.summary() + self.actor = actor + + def build_critic(self): + critic = self.Sequential() + super(ActorCritic, self).build_hidden_layers(critic) + critic.add(self.Dense(1, + init='lecun_uniform', + activation=self.output_layer_activation)) + logger.info("Critic summary") + critic.summary() + self.critic = critic + + def compile_model(self): + self.actor.compile( + loss='mse', + optimizer=self.optimizer.keras_optimizer) + self.critic.compile( + loss='mse', + optimizer=self.optimizer.keras_optimizer) + logger.info("Actor and critic compiled") + + def recompile_model(self, sys_vars): + ''' + Option to change model optimizer settings + Currently only used for changing the learning rate + Compiling does not affect the model weights + ''' + if self.epi_change_lr is not None: + if (sys_vars['epi'] == self.epi_change_lr and \ + sys_vars['t'] == 0): + self.lr = self.lr / 10.0 + self.optimizer.change_optim_param(**{'lr': self.lr}) + self.actor.compile( + loss='mse', + optimizer=self.optimizer.keras_optimizer) + self.critic.compile( + loss='mse', + optimizer=self.optimizer.keras_optimizer) + logger.info('Actor and critic models recompiled 
with new settings: ' + 'Learning rate: {}'.format(self.lr)) + + def train_critic(self, minibatch): + Q_vals = np.clip(self.critic.predict(minibatch['states']), + -self.clip_val, self.clip_val) + Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']), + -self.clip_val, self.clip_val) + Q_targets = minibatch['rewards'] + self.gamma * \ + (1 - minibatch['terminals']) * Q_next_vals.squeeze() + Q_targets = np.expand_dims(Q_targets, axis=1) + + actor_delta = Q_next_vals - Q_vals + loss = self.critic.train_on_batch(minibatch['states'], Q_targets) + + errors = abs(np.sum(Q_vals - Q_targets, axis=1)) + self.memory.update(errors) + return loss, actor_delta + + def train_actor(self, minibatch, actor_delta): + old_vals = self.actor.predict(minibatch['states']) + if self.env_spec['actions'] == 'continuous': + A_targets = np.zeros((actor_delta.shape[0], self.env_spec['action_dim'])) + for j in range(A_targets.shape[1]): + A_targets[:,j] = actor_delta.squeeze() + else: + A_targets = minibatch['actions'] * actor_delta + \ + (1 - minibatch['actions']) * old_vals + + loss = self.actor.train_on_batch(minibatch['states'], A_targets) + return loss + + def train_an_epoch(self): + minibatch = self.memory.rand_minibatch(self.batch_size) + critic_loss, actor_delta = self.train_critic(minibatch) + actor_loss = self.train_actor(minibatch, actor_delta) + return critic_loss + actor_loss + diff --git a/rl/experiment.py b/rl/experiment.py index 7ad3b73..3fc47af 100644 --- a/rl/experiment.py +++ b/rl/experiment.py @@ -469,4 +469,5 @@ def run(name_id_spec, times=1, param_selection=False, **kwargs): trial_data = trial.run() experiment_data = [trial_data] - return analyze_data(experiment_data) + # return analyze_data(experiment_data) + return diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py new file mode 100644 index 0000000..137d445 --- /dev/null +++ b/rl/policy/actor_critic.py @@ -0,0 +1,113 @@ +import numpy as np +from rl.policy.base_policy import Policy +from rl.util import log_self + +class ArgmaxPolicy(Policy): + + ''' + The argmax policy for actor critic agents + Agent takes the action with the highest + action score + ''' + + def __init__(self, env_spec, + **kwargs): # absorb generic param without breaking + super(ArgmaxPolicy, self).__init__(env_spec) + log_self(self) + + def select_action(self, state): + agent = self.agent + state = np.expand_dims(state, axis=0) + A_score = agent.actor.predict(state)[0] # extract from batch predict + assert A_score.ndim == 1 + action = np.argmax(A_score) + return action + + def update(self, sys_vars): + pass + +class SoftmaxPolicy(Policy): + + ''' + The softmax policy for actor critic agents + Action is drawn from the prob dist generated + by softmax(acion_scores) + ''' + + def __init__(self, env_spec, + **kwargs): # absorb generic param without breaking + super(SoftmaxPolicy, self).__init__(env_spec) + self.clip_val = 500 + log_self(self) + + def select_action(self, state): + agent = self.agent + state = np.expand_dims(state, axis=0) + A_score = agent.actor.predict(state)[0] # extract from batch predict + assert A_score.ndim == 1 + A_score = A_score.astype('float32') # fix precision nan issue + A_score = A_score - np.amax(A_score) # prevent overflow + exp_values = np.exp( + np.clip(A_score, -self.clip_val, self.clip_val)) + assert not np.isnan(exp_values).any() + probs = np.array(exp_values / np.sum(exp_values)) + probs /= probs.sum() # renormalize to prevent floating pt error + action = np.random.choice(agent.env_spec['actions'], p=probs) + return 
action + + def update(self, sys_vars): + pass + +class GaussianPolicy(Policy): + + ''' + Continuous policy for actor critic models + Output of the actor network is the mean action + along each dimension. Action chosen is the mean + plus some noise parameterized by the variance + ''' + + def __init__(self, env_spec, + variance=1.0, + **kwargs): # absorb generic param without breaking + super(GaussianPolicy, self).__init__() + self.variance = variance + log_self(self) + + def select_action(self, state): + agent = self.agent + state = np.expand_dims(state, axis=0) + a_mean = agent.actor.predict(state)[0] # extract from batch predict + action = a_mean + np.random.normal(loc=0.0, + scale=self.variance, size=a_mean.shape) + return action + + def update(self, sys_vars): + pass + +class BoundedPolicy(Policy): + + ''' + The bounded policy for actor critic agents + and continous, bounded policy spaces + Action bounded above and below by + - action_bound, + action_bound + ''' + + def __init__(self, env_spec, + action_bound=1.0, + **kwargs): # absorb generic param without breaking + super(BoundedPolicy, self).__init__(env_spec) + self.action_bound = action_bound + log_self(self) + + def select_action(self, state): + agent = self.agent + state = np.expand_dims(state, axis=0) + A_score = agent.actor.predict(state)[0] # extract from batch predict + assert A_score.ndim == 1 + action = np.tanh(A_score) * self.action_bound + return action + + def update(self, sys_vars): + pass \ No newline at end of file diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 394a3c0..eff2ae9 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -80,6 +80,32 @@ ] } }, + "ac_dqn": { + "problem": "CartPole-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.001, 0.005, 0.01, 0.02], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [16], + [32], + [64], + [16, 8], + [32, 16] + ] + } + }, "rand_dqn": { "problem": "CartPole-v0", "Agent": "DQN", @@ -658,18 +684,18 @@ }, "pendulum": { "problem": "Pendulum-v0", - "Agent": "DDPG", + "Agent": "ActorCritic", "HyperOptimizer": "RandomSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "OUNoise", + "Policy": "BoundedPolicy", "PreProcessor": "NoPreProcessor", "param": { "max_evals": 40, "n_epoch": 1, "lr": 0.001, "gamma": 0.99, - "hidden_layers": [400, 300], + "hidden_layers": [256, 128], "hidden_layers_activation": "relu", "exploration_anneal_episodes": 500 }, From c42d99117f90873c305bc85640789bfdf2a6120f Mon Sep 17 00:00:00 2001 From: lgraesser Date: Sat, 8 Apr 2017 17:25:43 -0400 Subject: [PATCH 02/43] Uncommenting analyze data --- rl/experiment.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rl/experiment.py b/rl/experiment.py index 3fc47af..7ad3b73 100644 --- a/rl/experiment.py +++ b/rl/experiment.py @@ -469,5 +469,4 @@ def run(name_id_spec, times=1, param_selection=False, **kwargs): trial_data = trial.run() experiment_data = [trial_data] - # return analyze_data(experiment_data) - return + return analyze_data(experiment_data) From 6f76d42e065167358d8c23912f3e8d27036baabc Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 9 Apr 2017 10:41:55 
-0400 Subject: [PATCH 03/43] style fix, scehdule ac experiment --- rl/agent/actor_critic.py | 49 ++++++++++++++------------- rl/policy/actor_critic.py | 26 ++++++++------ rl/spec/classic_experiment_specs.json | 6 ++-- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py index 13eb3f1..945f2c9 100644 --- a/rl/agent/actor_critic.py +++ b/rl/agent/actor_critic.py @@ -1,6 +1,6 @@ import numpy as np from rl.agent.dqn import DQN -from rl.util import logger, log_self +from rl.util import logger class ActorCritic(DQN): @@ -35,17 +35,17 @@ def __init__(self, env_spec, self.load_model = load_model super(ActorCritic, self).__init__(env_spec, - train_per_n_new_exp, - gamma, lr, - epi_change_lr, - batch_size, n_epoch, hidden_layers, - hidden_layers_activation, - output_layer_activation, - auto_architecture, - num_hidden_layers, - first_hidden_layer_size, - num_initial_channels, - **kwargs) + train_per_n_new_exp, + gamma, lr, + epi_change_lr, + batch_size, n_epoch, hidden_layers, + hidden_layers_activation, + output_layer_activation, + auto_architecture, + num_hidden_layers, + first_hidden_layer_size, + num_initial_channels, + **kwargs) def build_model(self): self.build_actor() @@ -66,8 +66,8 @@ def build_critic(self): critic = self.Sequential() super(ActorCritic, self).build_hidden_layers(critic) critic.add(self.Dense(1, - init='lecun_uniform', - activation=self.output_layer_activation)) + init='lecun_uniform', + activation=self.output_layer_activation)) logger.info("Critic summary") critic.summary() self.critic = critic @@ -88,7 +88,7 @@ def recompile_model(self, sys_vars): Compiling does not affect the model weights ''' if self.epi_change_lr is not None: - if (sys_vars['epi'] == self.epi_change_lr and \ + if (sys_vars['epi'] == self.epi_change_lr and sys_vars['t'] == 0): self.lr = self.lr / 10.0 self.optimizer.change_optim_param(**{'lr': self.lr}) @@ -98,14 +98,15 @@ def recompile_model(self, sys_vars): self.critic.compile( loss='mse', optimizer=self.optimizer.keras_optimizer) - logger.info('Actor and critic models recompiled with new settings: ' - 'Learning rate: {}'.format(self.lr)) + logger.info( + 'Actor and critic models recompiled with new settings: ' + 'Learning rate: {}'.format(self.lr)) def train_critic(self, minibatch): Q_vals = np.clip(self.critic.predict(minibatch['states']), - -self.clip_val, self.clip_val) + -self.clip_val, self.clip_val) Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']), - -self.clip_val, self.clip_val) + -self.clip_val, self.clip_val) Q_targets = minibatch['rewards'] + self.gamma * \ (1 - minibatch['terminals']) * Q_next_vals.squeeze() Q_targets = np.expand_dims(Q_targets, axis=1) @@ -113,19 +114,20 @@ def train_critic(self, minibatch): actor_delta = Q_next_vals - Q_vals loss = self.critic.train_on_batch(minibatch['states'], Q_targets) - errors = abs(np.sum(Q_vals - Q_targets, axis=1)) + errors = abs(np.sum(Q_vals - Q_targets, axis=1)) self.memory.update(errors) return loss, actor_delta def train_actor(self, minibatch, actor_delta): old_vals = self.actor.predict(minibatch['states']) if self.env_spec['actions'] == 'continuous': - A_targets = np.zeros((actor_delta.shape[0], self.env_spec['action_dim'])) + A_targets = np.zeros( + (actor_delta.shape[0], self.env_spec['action_dim'])) for j in range(A_targets.shape[1]): - A_targets[:,j] = actor_delta.squeeze() + A_targets[:, j] = actor_delta.squeeze() else: A_targets = minibatch['actions'] * actor_delta + \ - (1 - minibatch['actions']) * 
old_vals + (1 - minibatch['actions']) * old_vals loss = self.actor.train_on_batch(minibatch['states'], A_targets) return loss @@ -135,4 +137,3 @@ def train_an_epoch(self): critic_loss, actor_delta = self.train_critic(minibatch) actor_loss = self.train_actor(minibatch, actor_delta) return critic_loss + actor_loss - diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index 137d445..e2664ef 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -2,14 +2,15 @@ from rl.policy.base_policy import Policy from rl.util import log_self + class ArgmaxPolicy(Policy): - + ''' The argmax policy for actor critic agents Agent takes the action with the highest action score ''' - + def __init__(self, env_spec, **kwargs): # absorb generic param without breaking super(ArgmaxPolicy, self).__init__(env_spec) @@ -26,6 +27,7 @@ def select_action(self, state): def update(self, sys_vars): pass + class SoftmaxPolicy(Policy): ''' @@ -58,6 +60,7 @@ def select_action(self, state): def update(self, sys_vars): pass + class GaussianPolicy(Policy): ''' @@ -67,9 +70,9 @@ class GaussianPolicy(Policy): plus some noise parameterized by the variance ''' - def __init__(self, env_spec, - variance=1.0, - **kwargs): # absorb generic param without breaking + def __init__(self, env_spec, + variance=1.0, + **kwargs): # absorb generic param without breaking super(GaussianPolicy, self).__init__() self.variance = variance log_self(self) @@ -78,13 +81,14 @@ def select_action(self, state): agent = self.agent state = np.expand_dims(state, axis=0) a_mean = agent.actor.predict(state)[0] # extract from batch predict - action = a_mean + np.random.normal(loc=0.0, - scale=self.variance, size=a_mean.shape) + action = a_mean + np.random.normal( + loc=0.0, scale=self.variance, size=a_mean.shape) return action - + def update(self, sys_vars): pass + class BoundedPolicy(Policy): ''' @@ -93,9 +97,9 @@ class BoundedPolicy(Policy): Action bounded above and below by - action_bound, + action_bound ''' - + def __init__(self, env_spec, - action_bound=1.0, + action_bound=1.0, **kwargs): # absorb generic param without breaking super(BoundedPolicy, self).__init__(env_spec) self.action_bound = action_bound @@ -110,4 +114,4 @@ def select_action(self, state): return action def update(self, sys_vars): - pass \ No newline at end of file + pass diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index eff2ae9..1fd86c7 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -98,11 +98,9 @@ "lr": [0.001, 0.005, 0.01, 0.02], "gamma": [0.95, 0.97, 0.99, 0.999], "hidden_layers": [ - [16], - [32], [64], - [16, 8], - [32, 16] + [128], + [64, 32] ] } }, From 2f5c64228495c068a67b7da733924526189e307c Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 9 Apr 2017 10:45:48 -0400 Subject: [PATCH 04/43] stylefix --- rl/policy/actor_critic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index e2664ef..603927f 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -7,7 +7,7 @@ class ArgmaxPolicy(Policy): ''' The argmax policy for actor critic agents - Agent takes the action with the highest + Agent takes the action with the highest action score ''' From 0c4bf3a71264af90283b2530d774d38bcad49caf Mon Sep 17 00:00:00 2001 From: lgraesser Date: Sun, 9 Apr 2017 20:40:01 -0400 Subject: [PATCH 05/43] Adding AC specs --- rl/spec/classic_experiment_specs.json | 23 +++++++++++++++++++++++ 1 
file changed, 23 insertions(+) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 1fd86c7..4e06aac 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -103,6 +103,29 @@ [64, 32] ] } + }, + "ac_dqn_2": { + "problem": "CartPole-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "ArgmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } }, "rand_dqn": { "problem": "CartPole-v0", From 1be0cb34e2d8c9ad1ecc7715cdf051c9f29d5898 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 9 Apr 2017 21:57:53 -0400 Subject: [PATCH 06/43] add component locks for ActorCritic and DDPG --- rl/spec/component_locks.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index c657604..cae3eca 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -12,6 +12,33 @@ "DoubleDQNEpsilonGreedyPolicy" ] }, + "actor_critic": { + "type": "mutex", + "details": "actor critic uses custom Q computation in its policy", + "head": "Agent", + "Agent": [ + "ActorCritic" + ], + "Policy": [ + "ArgmaxPolicy", + "BoundedPolicy", + "GaussianPolicy", + "SoftmaxPolicy" + ] + }, + "ddpg": { + "type": "mutex", + "details": "ddpg uses white-noise policy", + "head": "Agent", + "Agent": [ + "DDPG" + ], + "Policy": [ + "AnnealedGaussian", + "GaussianWhiteNoise", + "OUNoise" + ] + }, "discrete_action": { "type": "subset", "details": "discrete components cannot work in continuous action space", From e8aab7cbbe859315394fcbe42f84c51e296845e8 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 9 Apr 2017 22:47:05 -0400 Subject: [PATCH 07/43] schedule ac on cartpole and pendulum --- rl/spec/classic_experiment_specs.json | 113 +++++++++++++++++++------- 1 file changed, 85 insertions(+), 28 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 8b983b8..0391672 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -80,13 +80,13 @@ ] } }, - "ac_dqn": { + "cartpole_ac_argmax": { "problem": "CartPole-v0", "Agent": "ActorCritic", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "SoftmaxPolicy", + "Policy": "ArgmaxPolicy", "PreProcessor": "NoPreProcessor", "param": { "lr": 0.02, @@ -95,22 +95,21 @@ "hidden_layers_activation": "sigmoid" }, "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], "gamma": [0.95, 0.97, 0.99, 0.999], "hidden_layers": [ [64], - [128], - [64, 32] + [128] ] } }, - "ac_dqn_2": { + "cartpole_ac_softmax": { "problem": "CartPole-v0", "Agent": "ActorCritic", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "ArgmaxPolicy", + "Policy": "SoftmaxPolicy", "PreProcessor": "NoPreProcessor", "param": { "lr": 0.02, @@ -703,38 +702,96 @@ "exploration_anneal_episodes": [200, 400] } }, - "pendulum": { + "pendulum_ac_argmax": { "problem": "Pendulum-v0", "Agent": "ActorCritic", - "HyperOptimizer": "RandomSearch", + "HyperOptimizer": "GridSearch", + 
"Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "ArgmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "pendulum_ac_softmax": { + "problem": "Pendulum-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "pendulum_ac_gaussian": { + "problem": "Pendulum-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "GaussianPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "pendulum_ac_bounded": { + "problem": "Pendulum-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", "Policy": "BoundedPolicy", "PreProcessor": "NoPreProcessor", "param": { - "max_evals": 40, - "n_epoch": 1, - "lr": 0.001, + "lr": 0.02, "gamma": 0.99, - "hidden_layers": [256, 128], - "hidden_layers_activation": "relu", - "exploration_anneal_episodes": 500 + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" }, "param_range": { - "lr": { - "min": 0.0005, - "max": 0.05 - }, - "gamma": { - "min": 0.90, - "max": 0.9999 - }, + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], "hidden_layers": [ - [400], - [200, 100], - [400, 300] - ], - "exploration_anneal_episodes": [200, 400, 600] + [64], + [128] + ] } }, "mountain_dqn": { From 28d3cc82fc36ba5ab1b02906438f6a985c8cfe30 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 9 Apr 2017 23:10:43 -0400 Subject: [PATCH 08/43] reorder component locks --- rl/spec/component_locks.json | 26 +++++++++++++------------- rl/util.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index cae3eca..9e65035 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -12,31 +12,31 @@ "DoubleDQNEpsilonGreedyPolicy" ] }, - "actor_critic": { + "ddpg": { "type": "mutex", - "details": "actor critic uses custom Q computation in its policy", + "details": "ddpg uses white-noise policy", "head": "Agent", "Agent": [ - "ActorCritic" + "DDPG" ], "Policy": [ - "ArgmaxPolicy", - "BoundedPolicy", - "GaussianPolicy", - "SoftmaxPolicy" + "AnnealedGaussian", + "GaussianWhiteNoise", + "OUNoise" ] }, - "ddpg": { + "actor_critic": { "type": "mutex", - "details": "ddpg uses white-noise policy", + "details": "actor critic uses custom Q computation in its policy", "head": "Agent", "Agent": [ - "DDPG" + "ActorCritic" ], "Policy": [ - "AnnealedGaussian", - "GaussianWhiteNoise", - "OUNoise" + "ArgmaxPolicy", + "BoundedPolicy", + "GaussianPolicy", + "SoftmaxPolicy" ] }, 
"discrete_action": { diff --git a/rl/util.py b/rl/util.py index 422df42..915f3aa 100644 --- a/rl/util.py +++ b/rl/util.py @@ -57,7 +57,7 @@ def check_lock(lock_name, lock, experiment_spec): raise ValueError( 'All components need to be of the same set, ' 'check component lock "{}" and your spec "{}"'.format( - bin_rest_list, experiment_spec['experiment_name'])) + lock_name, experiment_spec['experiment_name'])) bin_rest = bin_rest_list[0] lock_sig = [bin_head, bin_rest] From 1565a7add1776b6df4a58310fbe323ee3c099e8d Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 9 Apr 2017 23:28:20 -0400 Subject: [PATCH 09/43] add ac discrete component lock, fix and check all ac specs --- rl/policy/actor_critic.py | 2 +- rl/spec/classic_experiment_specs.json | 46 --------------------------- rl/spec/component_locks.json | 29 +++++++++++++++++ 3 files changed, 30 insertions(+), 47 deletions(-) diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index 603927f..5d0912a 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -73,7 +73,7 @@ class GaussianPolicy(Policy): def __init__(self, env_spec, variance=1.0, **kwargs): # absorb generic param without breaking - super(GaussianPolicy, self).__init__() + super(GaussianPolicy, self).__init__(env_spec) self.variance = variance log_self(self) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 0391672..d627a16 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -702,52 +702,6 @@ "exploration_anneal_episodes": [200, 400] } }, - "pendulum_ac_argmax": { - "problem": "Pendulum-v0", - "Agent": "ActorCritic", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "ArgmaxPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid" - }, - "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05, 0.1], - "gamma": [0.95, 0.97, 0.99, 0.999], - "hidden_layers": [ - [64], - [128] - ] - } - }, - "pendulum_ac_softmax": { - "problem": "Pendulum-v0", - "Agent": "ActorCritic", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "SoftmaxPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid" - }, - "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05, 0.1], - "gamma": [0.95, 0.97, 0.99, 0.999], - "hidden_layers": [ - [64], - [128] - ] - } - }, "pendulum_ac_gaussian": { "problem": "Pendulum-v0", "Agent": "ActorCritic", diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 9e65035..8e386be 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -39,6 +39,35 @@ "SoftmaxPolicy" ] }, + "actor_critic_discrete": { + "type": "subset", + "details": "actor critic discrete components cannot work in continuous action space", + "head": "problem", + "problem": [ + "Acrobot-v1", + "AirRaid-v0", + "Alien-v0", + "Assault-v0", + "Breakout-v0", + "CartPole-v0", + "CartPole-v1", + "DevBreakout-v0", + "DevCartPole-v0", + "FlappyBird-v0", + "LunarLander-v2", + "MountainCar-v0", + "MsPacman-v0", + "Pong-v0", + "Qbert-v0", + "Snake-v0", + "SpaceInvader-v0", + "TestPassCartPole-v0" + ], + "Policy": [ + "ArgmaxPolicy", + "SoftmaxPolicy" + ] + }, "discrete_action": { "type": "subset", "details": "discrete components cannot work in 
continuous action space", From e9ad662c4f38cc5b0f69a14924e08b84998c756e Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 10 Apr 2017 07:36:54 -0400 Subject: [PATCH 10/43] add variance to pendulum gaussian search, narrow search space --- rl/spec/classic_experiment_specs.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index d627a16..68da654 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -717,8 +717,9 @@ "hidden_layers_activation": "sigmoid" }, "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05, 0.1], - "gamma": [0.95, 0.97, 0.99, 0.999], + "lr": [0.005, 0.01, 0.05], + "gamma": [0.97, 0.99, 0.999], + "variance": [0.1, 0.5, 1.0], "hidden_layers": [ [64], [128] From c3c4ffb9fb09f20d3a0a694f073ed0e66009f194 Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 10 Apr 2017 23:30:06 -0400 Subject: [PATCH 11/43] add action bounds to env_spec --- rl/agent/q_table.py | 4 +++- rl/experiment.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/rl/agent/q_table.py b/rl/agent/q_table.py index 436ef71..6618463 100644 --- a/rl/agent/q_table.py +++ b/rl/agent/q_table.py @@ -71,7 +71,9 @@ def compile_model(self): def pixelate_state_space(self, resolution=10): '''chunk up the state space hypercube to specified resolution''' - state_bounds = self.env_spec['state_bounds'] + state_bounds = np.transpose( + [self.env_spec['state_bound_low'], + self.env_spec['state_bound_high']]) self.state_pixels = [np.linspace(*sb, num=resolution+1) for sb in state_bounds] return self.state_pixels diff --git a/rl/experiment.py b/rl/experiment.py index 7ad3b73..3023394 100644 --- a/rl/experiment.py +++ b/rl/experiment.py @@ -133,17 +133,23 @@ def set_env_spec(self): if env.action_space.__class__.__name__ == 'Box': # continuous action_dim = env.action_space.shape[0] actions = 'continuous' + action_low = env.action_space.low + action_high = env.action_space.high else: action_dim = env.action_space.n actions = list(range(env.action_space.n)) + action_low = 0 + action_high = 1 env_spec = { 'problem': PROBLEMS[self.problem], 'state_dim': state_dim, - 'state_bounds': np.transpose( - [env.observation_space.low, env.observation_space.high]), + 'state_bound_low': env.observation_space.low, + 'state_bound_high': env.observation_space.high, 'action_dim': action_dim, 'actions': actions, + 'action_bound_low': action_low, + 'action_bound_high': action_high, 'reward_range': env.reward_range, 'timestep_limit': env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') From 7eaed264a8220e673e78cc8e38713a0e631f6110 Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 10 Apr 2017 23:41:14 -0400 Subject: [PATCH 12/43] fix boundedpolicy to auto-bound from env-spec --- rl/policy/actor_critic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py index 5d0912a..900ac7d 100644 --- a/rl/policy/actor_critic.py +++ b/rl/policy/actor_critic.py @@ -99,17 +99,16 @@ class BoundedPolicy(Policy): ''' def __init__(self, env_spec, - action_bound=1.0, **kwargs): # absorb generic param without breaking super(BoundedPolicy, self).__init__(env_spec) - self.action_bound = action_bound + self.action_bound = env_spec['action_bound_high'] + assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] log_self(self) def select_action(self, state): agent = self.agent state = np.expand_dims(state, axis=0) A_score = 
agent.actor.predict(state)[0] # extract from batch predict - assert A_score.ndim == 1 action = np.tanh(A_score) * self.action_bound return action From 34b1b58163f8de2fb7b700ab9b520af7f7f637c9 Mon Sep 17 00:00:00 2001 From: lgraesser Date: Mon, 10 Apr 2017 23:58:16 -0400 Subject: [PATCH 13/43] Adding Acrobot specs --- rl/spec/classic_experiment_specs.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 4e06aac..ccbf948 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -544,11 +544,11 @@ "exploration_anneal_episodes": 200 }, "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.95, 0.97, 0.99, 0.999], + "lr": [0.001], + "gamma": [0.99, 0.999], + "max_mem_len": [50000, 100000], "hidden_layers": [ [100], - [200], [200, 100], [400, 200, 100] ] From 3bd227f0e84c63bfd386e6481f9587c1a77c7a2c Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 11 Apr 2017 00:16:01 -0400 Subject: [PATCH 14/43] schedule other ac experiments --- rl/spec/box2d_experiment_specs.json | 24 ++++ rl/spec/classic_experiment_specs.json | 190 ++++++++++++++++---------- 2 files changed, 142 insertions(+), 72 deletions(-) diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index 37fdedc..c8b0d0e 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -241,5 +241,29 @@ [400, 200, 100] ] } + }, + "lunar_ac_softmax": { + "problem": "LunarLander-v2", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05, 0.1], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [400, 200], + [800, 400], + [400, 200, 100] + ] + } } } diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 68da654..f2be34c 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -80,52 +80,6 @@ ] } }, - "cartpole_ac_argmax": { - "problem": "CartPole-v0", - "Agent": "ActorCritic", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "ArgmaxPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid" - }, - "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05, 0.1], - "gamma": [0.95, 0.97, 0.99, 0.999], - "hidden_layers": [ - [64], - [128] - ] - } - }, - "cartpole_ac_softmax": { - "problem": "CartPole-v0", - "Agent": "ActorCritic", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "SoftmaxPolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid" - }, - "param_range": { - "lr": [0.005, 0.01, 0.02, 0.05, 0.1], - "gamma": [0.95, 0.97, 0.99, 0.999], - "hidden_layers": [ - [64], - [128] - ] - } - }, "rand_dqn": { "problem": "CartPole-v0", "Agent": "DQN", @@ -361,6 +315,52 @@ ] } }, + "cartpole_ac_argmax": { + "problem": "CartPole-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + 
"Optimizer": "AdamOptimizer", + "Policy": "ArgmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "cartpole_ac_softmax": { + "problem": "CartPole-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, "dqn_v1": { "problem": "CartPole-v1", "Agent": "DQN", @@ -475,54 +475,52 @@ ] } }, - "freeze_dqn": { - "problem": "CartPole-v0", - "Agent": "FreezeDQN", + "offpol_sarsa_v1": { + "problem": "CartPole-v1", + "Agent": "OffPolicySarsa", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", "Policy": "BoltzmannPolicy", "PreProcessor": "NoPreProcessor", "param": { - "lr": 0.001, - "gamma": 0.99, - "hidden_layers": [32], + "lr": 0.02, + "gamma": 0.999, + "hidden_layers": [128], "hidden_layers_activation": "sigmoid", "exploration_anneal_episodes": 10 }, "param_range": { "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.95, 0.96, 0.97, 0.99], + "gamma": [0.97, 0.99, 0.999], "hidden_layers": [ - [8], - [16], - [32] + [32], + [64], + [128], + [32, 16] ] } }, - "offpol_sarsa_v1": { + "cartpole_v1_ac_softmax": { "problem": "CartPole-v1", - "Agent": "OffPolicySarsa", + "Agent": "ActorCritic", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "BoltzmannPolicy", + "Policy": "SoftmaxPolicy", "PreProcessor": "NoPreProcessor", "param": { "lr": 0.02, - "gamma": 0.999, - "hidden_layers": [128], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" }, "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], "gamma": [0.97, 0.99, 0.999], "hidden_layers": [ - [32], [64], - [128], - [32, 16] + [128] ] } }, @@ -702,6 +700,30 @@ "exploration_anneal_episodes": [200, 400] } }, + "acrobot_ac_softmax": { + "problem": "Acrobot-v1", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05, 0.1], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [100], + [200], + [200, 100] + ] + } + }, "pendulum_ac_gaussian": { "problem": "Pendulum-v0", "Agent": "ActorCritic", @@ -711,8 +733,8 @@ "Policy": "GaussianPolicy", "PreProcessor": "NoPreProcessor", "param": { - "lr": 0.02, - "gamma": 0.99, + "lr": 0.05, + "gamma": 0.999, "hidden_layers": [64], "hidden_layers_activation": "sigmoid" }, @@ -735,9 +757,9 @@ "Policy": "BoundedPolicy", "PreProcessor": "NoPreProcessor", "param": { - "lr": 0.02, - "gamma": 0.99, - "hidden_layers": [64], + "lr": 0.05, + "gamma": 0.999, + "hidden_layers": [400, 300], "hidden_layers_activation": "sigmoid" 
}, "param_range": { @@ -858,5 +880,29 @@ [400] ] } + }, + "mountain_ac_softmax": { + "problem": "MountainCar-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05, 0.1], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [200], + [400, 200], + [400, 200, 100] + ] + } } } From e8e6877b3a45ee93325bf3c581d830aae308d438 Mon Sep 17 00:00:00 2001 From: lgraesser Date: Tue, 11 Apr 2017 00:24:10 -0400 Subject: [PATCH 15/43] Fixing mem len param --- rl/spec/classic_experiment_specs.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 398da61..b72e5a8 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -540,7 +540,8 @@ "gamma": 0.999, "hidden_layers": [200, 100], "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 200 + "exploration_anneal_episodes": 200, + "max_mem_len" : 50000 }, "param_range": { "lr": [0.001], From 75198e4c3a626e551560045f5cac686f6cd78b73 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 11 Apr 2017 08:11:25 -0400 Subject: [PATCH 16/43] add ddpg fix attempt --- rl/agent/ddpg.py | 1 + rl/policy/noise.py | 21 +++++++++++++++++---- rl/spec/classic_experiment_specs.json | 26 +++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py index 50e06be..61f046e 100644 --- a/rl/agent/ddpg.py +++ b/rl/agent/ddpg.py @@ -1,3 +1,4 @@ +import numpy as np from rl.agent.dqn import DQN from rl.util import logger, clone_model, clone_optimizer diff --git a/rl/policy/noise.py b/rl/policy/noise.py index 4532f46..0252e47 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -15,8 +15,16 @@ def __init__(self, env_spec, mu, sigma, sigma_min, **kwargs): # absorb generic param without breaking super(AnnealedGaussian, self).__init__(env_spec) - self.size = self.env_spec['action_dim'] - self.n_steps_annealing = self.env_spec['timestep_limit'] / 2 + # epsilon-greedy * noise + self.init_e = 1.0 + self.final_e = 0.0 + self.e = self.init_e + self.exploration_anneal_episodes = 100 + + self.size = env_spec['action_dim'] + self.action_bound = env_spec['action_bound_high'] + assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] + self.n_steps_annealing = env_spec['timestep_limit'] / 2 self.mu = mu self.sigma = sigma self.n_steps = 0 @@ -39,7 +47,8 @@ def select_action(self, state): agent = self.agent state = np.expand_dims(state, axis=0) if self.env_spec['actions'] == 'continuous': - action = agent.actor.predict(state)[0] + self.sample() + action = agent.actor.predict( + state)[0] * self.action_bound + self.sample() * self.e else: Q_state = agent.actor.predict(state)[0] assert Q_state.ndim == 1 @@ -48,7 +57,11 @@ def select_action(self, state): return action def update(self, sys_vars): - pass + epi = sys_vars['epi'] + rise = self.final_e - self.init_e + slope = rise / float(self.exploration_anneal_episodes) + self.e = max(slope * epi + self.init_e, self.final_e) + return self.e class GaussianWhiteNoise(AnnealedGaussian): diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index d18193f..3bfec51 100644 --- 
a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -539,7 +539,7 @@ "hidden_layers": [200, 100], "hidden_layers_activation": "sigmoid", "exploration_anneal_episodes": 200, - "max_mem_len" : 50000 + "max_mem_len": 50000 }, "param_range": { "lr": [0.001], @@ -772,6 +772,30 @@ ] } }, + "pendulum_ddpg": { + "problem": "Pendulum-v0", + "Agent": "DDPG", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "GaussianWhiteNoise", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.01, + "gamma": 0.999, + "hidden_layers": [400, 300], + "hidden_layers_activation": "sigmoid", + "output_layer_activation": "tanh" + }, + "param_range": { + "lr": [0.001, 0.01], + "gamma": [0.999], + "hidden_layers": [ + [400], + [400, 300] + ] + } + }, "mountain_dqn": { "problem": "MountainCar-v0", "Agent": "DQN", From 47a5f8c181695a93c7eed7f5b643832dbc71bb85 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 12:03:08 -0400 Subject: [PATCH 17/43] ddpg with bounded actions --- rl/policy/noise.py | 30 ++++++++++++++++++++++++++- rl/spec/classic_experiment_specs.json | 6 +++--- rl/spec/component_locks.json | 3 ++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/rl/policy/noise.py b/rl/policy/noise.py index 0252e47..7f82664 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -1,8 +1,36 @@ import numpy as np -from rl.util import logger +from rl.util import logger, log_self from rl.policy.base_policy import Policy +class DDPGBoundedPolicy(Policy): + + ''' + The bounded policy for actor critic agents + and continous, bounded policy spaces + Action bounded above and below by + - action_bound, + action_bound + ''' + + def __init__(self, env_spec, + **kwargs): # absorb generic param without breaking + super(DDPGBoundedPolicy, self).__init__(env_spec) + self.action_bound = env_spec['action_bound_high'] + assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] + log_self(self) + + def select_action(self, state): + agent = self.agent + state = np.expand_dims(state, axis=0) + A_score = agent.actor.predict(state)[0] # extract from batch predict + # action = np.tanh(A_score) * self.action_bound + action = A_score * self.action_bound + return action + + def update(self, sys_vars): + pass + + class AnnealedGaussian(Policy): ''' diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index f769c13..bfbcae0 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -801,13 +801,13 @@ "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "GaussianWhiteNoise", + "Policy": "DDPGBoundedPolicy", "PreProcessor": "NoPreProcessor", "param": { "lr": 0.01, "gamma": 0.999, - "hidden_layers": [400, 300], - "hidden_layers_activation": "sigmoid", + "hidden_layers": [200], + "hidden_layers_activation": "relu", "output_layer_activation": "tanh" }, "param_range": { diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 8e386be..117454c 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -22,7 +22,8 @@ "Policy": [ "AnnealedGaussian", "GaussianWhiteNoise", - "OUNoise" + "OUNoise", + "DDPGBoundedPolicy" ] }, "actor_critic": { From 56e7f957a1187ee58226287beed40188d6e1d752 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 15:06:29 -0400 Subject: [PATCH 18/43] permami broken with reshape to len manually --- 
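Shape notes for this patch: the tflearn feed_dicts expect states of shape
(batch, state_dim), actions of shape (batch, action_dim) and Q targets of
shape (batch, 1). Reshaping inputs to (len(inputs), 1), as the diff below
does, only holds when state_dim == 1, and Pendulum-v0 has state_dim == 3.
A minimal NumPy-only sketch of the intended batch shapes for the critic
target y_i = r + gamma * (1 - terminal) * Q'(s', mu'(s')); the linear
network stand-ins and dimensions here are illustrative assumptions, not
the tflearn networks in the diff:

    import numpy as np

    # assumed Pendulum-v0 dimensions, for illustration only
    s_dim, a_dim, batch_size, gamma = 3, 1, 64, 0.99

    # fake minibatch with the same keys rand_minibatch returns
    minibatch = {
        'states': np.random.randn(batch_size, s_dim),
        'actions': np.random.randn(batch_size, a_dim),
        'rewards': np.random.randn(batch_size),
        'terminals': np.random.randint(0, 2, size=batch_size),
        'next_states': np.random.randn(batch_size, s_dim),
    }

    # random linear maps standing in for the target actor and target critic
    W_actor = np.random.randn(s_dim, a_dim)
    W_critic = np.random.randn(s_dim + a_dim, 1)

    def actor_predict_target(states):
        # (batch, s_dim) -> (batch, a_dim), bounded like the tanh output layer
        return np.tanh(states @ W_actor)

    def critic_predict_target(states, actions):
        # (batch, s_dim) and (batch, a_dim) -> (batch, 1)
        return np.hstack([states, actions]) @ W_critic

    target_q = critic_predict_target(
        minibatch['next_states'],
        actor_predict_target(minibatch['next_states']))      # (batch, 1)
    y_i = minibatch['rewards'] + gamma * \
        (1 - minibatch['terminals']) * target_q.squeeze()     # (batch,)
    y_i = y_i.reshape(-1, 1)                                  # (batch, 1)

    assert minibatch['states'].shape == (batch_size, s_dim)
    assert minibatch['actions'].reshape(-1, a_dim).shape == (batch_size, a_dim)
    assert y_i.shape == (batch_size, 1)

These are the shapes the critic train/predict feed_dicts need; reshaping
states to (len(inputs), 1) collapses the state_dim axis, which the
follow-up commit works around by feeding states directly and reshaping
only actions to (-1, self.a_dim) and targets to (-1, 1).
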
rl/agent/permami_ddpg.py | 491 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 491 insertions(+) create mode 100644 rl/agent/permami_ddpg.py diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py new file mode 100644 index 0000000..6d0ec8f --- /dev/null +++ b/rl/agent/permami_ddpg.py @@ -0,0 +1,491 @@ +""" +DDPG implementation from https://github.com/pemami4911/deep-rl/blob/master/ddpg/ddpg.py +Implementation of DDPG - Deep Deterministic Policy Gradient +Algorithm and hyperparameter details can be found here: + http://arxiv.org/pdf/1509.02971v2.pdf +The algorithm is tested on the Pendulum-v0 OpenAI gym task +and developed with tflearn + Tensorflow +Author: Patrick Emami +""" +from rl.agent.base_agent import Agent +import tensorflow as tf +import numpy as np +# import gym +# from gym import wrappers +import tflearn + +# from replay_buffer import ReplayBuffer + +# ========================== +# Training Parameters +# ========================== +# Max training steps +MAX_EPISODES = 50000 +# Max episode length +MAX_EP_STEPS = 1000 +# Base learning rate for the Actor network +ACTOR_LEARNING_RATE = 0.0001 +# Base learning rate for the Critic Network +CRITIC_LEARNING_RATE = 0.001 +# Discount factor +GAMMA = 0.99 +# Soft target update param +TAU = 0.001 + +# # =========================== +# # Utility Parameters +# # =========================== +# # Render gym env during training +# RENDER_ENV = True +# # Use Gym Monitor +# GYM_MONITOR_EN = True +# # Gym environment +# ENV_NAME = 'Pendulum-v0' +# # Directory for storing gym results +# MONITOR_DIR = './results/gym_ddpg' +# # Directory for storing tensorboard summary results +# SUMMARY_DIR = './results/tf_ddpg' +# RANDOM_SEED = 1234 +# Size of replay buffer +BUFFER_SIZE = 10000 +MINIBATCH_SIZE = 64 + +# =========================== +# Actor and Critic DNNs +# =========================== + + +class ActorNetwork(object): + """ + Input to the network is the state, output is the action + under a deterministic policy. + The output layer activation is a tanh to keep the action + between -2 and 2 + """ + + def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.action_bound = action_bound + self.learning_rate = learning_rate + self.tau = tau + + # Actor Network + self.inputs, self.out, self.scaled_out = self.create_actor_network() + + self.network_params = tf.trainable_variables() + + # Target Network + self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network() + + self.target_network_params = tf.trainable_variables()[ + len(self.network_params):] + + # Op for periodically updating target network with online network + # weights + self.update_target_network_params = \ + [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + + tf.multiply(self.target_network_params[i], 1. 
- self.tau)) + for i in range(len(self.target_network_params))] + + # This gradient will be provided by the critic network + self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim]) + + # Combine the gradients here + self.actor_gradients = tf.gradients( + self.scaled_out, self.network_params, -self.action_gradient) + + # Optimization Op + self.optimize = tf.train.AdamOptimizer(self.learning_rate).\ + apply_gradients(zip(self.actor_gradients, self.network_params)) + + self.num_trainable_vars = len( + self.network_params) + len(self.target_network_params) + + def create_actor_network(self): + inputs = tflearn.input_data(shape=[None, self.s_dim]) + net = tflearn.fully_connected(inputs, 400, activation='relu') + net = tflearn.fully_connected(net, 300, activation='relu') + # Final layer weights are init to Uniform[-3e-3, 3e-3] + w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003) + out = tflearn.fully_connected( + net, self.a_dim, activation='tanh', weights_init=w_init) + # Scale output to -action_bound to action_bound + scaled_out = tf.multiply(out, self.action_bound) + return inputs, out, scaled_out + + def train(self, inputs, a_gradient): + self.sess.run(self.optimize, feed_dict={ + self.inputs: np.reshape(inputs, (len(inputs), 1)), + # self.inputs: inputs, + self.action_gradient: a_gradient + }) + + def predict(self, inputs): + return self.sess.run(self.scaled_out, feed_dict={ + self.inputs: np.reshape(inputs, (len(inputs), 1)) + # self.inputs: inputs + }) + + def predict_target(self, inputs): + return self.sess.run(self.target_scaled_out, feed_dict={ + self.target_inputs: np.reshape(inputs, (len(inputs), 1)) + # self.target_inputs: inputs + }) + + def update_target_network(self): + self.sess.run(self.update_target_network_params) + + def get_num_trainable_vars(self): + return self.num_trainable_vars + + +class CriticNetwork(object): + """ + Input to the network is the state and action, output is Q(s,a). + The action must be obtained from the output of the Actor network. + """ + + def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.learning_rate = learning_rate + self.tau = tau + + # Create the critic network + self.inputs, self.action, self.out = self.create_critic_network() + + self.network_params = tf.trainable_variables()[num_actor_vars:] + + # Target Network + self.target_inputs, self.target_action, self.target_out = self.create_critic_network() + + self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):] + + # Op for periodically updating target network with online network + # weights with regularization + self.update_target_network_params = \ + [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau)) + for i in range(len(self.target_network_params))] + + # Network target (y_i) + self.predicted_q_value = tf.placeholder(tf.float32, [None, 1]) + + # Define loss and optimization Op + self.loss = tflearn.mean_square(self.predicted_q_value, self.out) + self.optimize = tf.train.AdamOptimizer( + self.learning_rate).minimize(self.loss) + + # Get the gradient of the net w.r.t. the action. + # For each action in the minibatch (i.e., for each x in xs), + # this will sum up the gradients of each critic output in the minibatch + # w.r.t. that action. Each output is independent of all + # actions except for one. 
+ self.action_grads = tf.gradients(self.out, self.action) + + def create_critic_network(self): + inputs = tflearn.input_data(shape=[None, self.s_dim]) + action = tflearn.input_data(shape=[None, self.a_dim]) + net = tflearn.fully_connected(inputs, 400, activation='relu') + + # Add the action tensor in the 2nd hidden layer + # Use two temp layers to get the corresponding weights and biases + t1 = tflearn.fully_connected(net, 300) + t2 = tflearn.fully_connected(action, 300) + + net = tflearn.activation( + tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu') + + # linear layer connected to 1 output representing Q(s,a) + # Weights are init to Uniform[-3e-3, 3e-3] + w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003) + out = tflearn.fully_connected(net, 1, weights_init=w_init) + return inputs, action, out + + def train(self, inputs, action, predicted_q_value): + return self.sess.run([self.out, self.optimize], feed_dict={ + self.inputs: np.reshape(inputs, (len(inputs), 1)), + self.action: np.reshape(action, (len(inputs), 1)), + self.predicted_q_value: np.reshape(predicted_q_value, (len(inputs), 1)), + # self.inputs: inputs, + # self.action: action, + # self.predicted_q_value: predicted_q_value + }) + + def predict(self, inputs, action): + return self.sess.run(self.out, feed_dict={ + self.inputs: np.reshape(inputs, (len(inputs), 1)), + self.action: np.reshape(action, (len(inputs), 1)), + # self.inputs: inputs, + # self.action: action + }) + + def predict_target(self, inputs, action): + return self.sess.run(self.target_out, feed_dict={ + self.target_inputs: np.reshape(inputs, (len(inputs), 1)), + self.target_action: np.reshape(action, (len(inputs), 1)), + # self.target_inputs: inputs, + # self.target_action: action + }) + + def action_gradients(self, inputs, actions): + return self.sess.run(self.action_grads, feed_dict={ + self.inputs: np.reshape(inputs, (len(inputs), 1)), + self.action: np.reshape(action, (len(inputs), 1)), + # self.inputs: inputs, + # self.action: actions + }) + + def update_target_network(self): + self.sess.run(self.update_target_network_params) + + + +class PermamiDDPG(Agent): + + ''' + The PermamiDDPG agent (algo), from https://arxiv.org/abs/1509.02971 + reference: https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html + https://github.com/matthiasplappert/keras-rl + ''' + + def __init__(self, *args, **kwargs): + # import only when needed to contain side-effects + # from keras.layers import Dense, Merge + # from keras.models import Sequential + # from keras import backend as K + # self.Dense = Dense + # self.Merge = Merge + # self.Sequential = Sequential + self.sess = tf.Session() + self.epi = 0 + self.n_epoch = 1 + self.batch_size = 64 + + # self.TAU = 0.001 # for target network updates + super(PermamiDDPG, self).__init__(*args, **kwargs) + self.build_model() + self.sess.run(tf.global_variables_initializer()) + + + def build_model(self): + state_dim = self.env_spec['state_dim'] + action_dim = self.env_spec['action_dim'] + action_bound = self.env_spec['action_bound_high'] + self.actor = ActorNetwork(self.sess, state_dim, action_dim, action_bound, + ACTOR_LEARNING_RATE, TAU) + self.critic = CriticNetwork(self.sess, state_dim, action_dim, + CRITIC_LEARNING_RATE, TAU, self.actor.get_num_trainable_vars()) + + def compile_model(self): + pass + + def select_action(self, state): + i = self.epi + action = self.actor.predict(np.reshape(state, (1, 3))) + (1. / (1. 
+ i)) + return action + + def update(self, sys_vars): + self.epi = sys_vars['epi'] + # Update target networks + self.actor.update_target_network() + self.critic.update_target_network() + return + + def to_train(self, sys_vars): + # return sys_vars['t'] > MINIBATCH_SIZE + return True + + def train_an_epoch(self): + minibatch = self.memory.rand_minibatch(self.batch_size) + + target_q = self.critic.predict_target( + minibatch['next_states'], self.actor.predict_target(minibatch['next_states'])) + + y_i = minibatch['rewards'] + self.gamma * \ + (1 - minibatch['terminals']) * target_q + + predicted_q_value, _ = self.critic.train( + minibatch['states'], + minibatch['actions'], + y_i) + # np.reshape(y_i, (self.batch_size, 1))) + + ep_ave_max_q += np.amax(predicted_q_value) + + + # Update the actor policy using the sampled gradient + a_outs = self.actor.predict(minibatch['states']) + grads = self.critic.action_gradients(minibatch['states'], a_outs) + actor_loss = self.actor.train(minibatch['states'], grads[0]) + # return actor_loss + return + + # (Q_states, _states, Q_next_states_max) = self.compute_Q_states( + # minibatch) + # Q_targets = self.compute_Q_targets( + # minibatch, Q_states, Q_next_states_max) + + # loss = self.model.train_on_batch(minibatch['states'], Q_targets) + + # errors = abs(np.sum(Q_states - Q_targets, axis=1)) + # self.memory.update(errors) + # return loss + + def train(self, sys_vars): + ''' + Training is for the Q function (NN) only + otherwise (e.g. policy) see self.update() + step 1,2,3,4 of algo. + ''' + loss_total = 0 + for _epoch in range(self.n_epoch): + loss = self.train_an_epoch() + loss_total += loss + avg_loss = loss_total / self.n_epoch + sys_vars['loss'].append(avg_loss) + return avg_loss + +# =========================== +# Tensorflow Summary Ops +# =========================== + + +# def build_summaries(): +# episode_reward = tf.Variable(0.) +# tf.summary.scalar("Reward", episode_reward) +# episode_ave_max_q = tf.Variable(0.) +# tf.summary.scalar("Qmax Value", episode_ave_max_q) + +# summary_vars = [episode_reward, episode_ave_max_q] +# summary_ops = tf.summary.merge_all() + +# return summary_ops, summary_vars + +# # =========================== +# # Agent Training +# # =========================== + + +# def train(sess, env, actor, critic): + +# # Set up summary Ops +# summary_ops, summary_vars = build_summaries() + +# sess.run(tf.global_variables_initializer()) +# writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) + +# # Initialize target network weights +# actor.update_target_network() +# critic.update_target_network() + +# # Initialize replay memory +# replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) + +# for i in xrange(MAX_EPISODES): + +# s = env.reset() + +# ep_reward = 0 +# ep_ave_max_q = 0 + +# for j in xrange(MAX_EP_STEPS): + +# if RENDER_ENV: +# env.render() + +# # # Added exploration noise +# # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. 
+ i)) + +# # s2, r, terminal, info = env.step(a[0]) + +# replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, +# terminal, np.reshape(s2, (actor.s_dim,))) + +# # Keep adding experience to the memory until +# # there are at least minibatch size samples +# if replay_buffer.size() > MINIBATCH_SIZE: +# # s_batch, a_batch, r_batch, t_batch, s2_batch = \ +# # replay_buffer.sample_batch(MINIBATCH_SIZE) + +# # # Calculate targets +# # target_q = critic.predict_target( +# # s2_batch, actor.predict_target(s2_batch)) + +# # y_i = [] +# # for k in xrange(MINIBATCH_SIZE): +# # if t_batch[k]: +# # y_i.append(r_batch[k]) +# # else: +# # y_i.append(r_batch[k] + GAMMA * target_q[k]) + +# # # Update the critic given the targets +# # predicted_q_value, _ = critic.train( +# # s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) + +# # ep_ave_max_q += np.amax(predicted_q_value) + +# # # Update the actor policy using the sampled gradient +# # a_outs = actor.predict(s_batch) +# # grads = critic.action_gradients(s_batch, a_outs) +# # actor.train(s_batch, grads[0]) + +# # # Update target networks +# # actor.update_target_network() +# # critic.update_target_network() + +# s = s2 +# ep_reward += r + +# # if terminal: + +# # summary_str = sess.run(summary_ops, feed_dict={ +# # summary_vars[0]: ep_reward, +# # summary_vars[1]: ep_ave_max_q / float(j) +# # }) + +# # writer.add_summary(summary_str, i) +# # writer.flush() + +# # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \ +# # '| Qmax: %.4f' % (ep_ave_max_q / float(j)) + +# # break + + +# def main(_): +# with tf.Session() as sess: + +# env = gym.make(ENV_NAME) +# np.random.seed(RANDOM_SEED) +# tf.set_random_seed(RANDOM_SEED) +# env.seed(RANDOM_SEED) + +# # state_dim = env.observation_space.shape[0] +# # action_dim = env.action_space.shape[0] +# # action_bound = env.action_space.high +# # # Ensure action bound is symmetric +# # assert (env.action_space.high == -env.action_space.low) + +# # actor = ActorNetwork(sess, state_dim, action_dim, action_bound, +# # ACTOR_LEARNING_RATE, TAU) + +# # critic = CriticNetwork(sess, state_dim, action_dim, +# # CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) + +# if GYM_MONITOR_EN: +# if not RENDER_ENV: +# env = wrappers.Monitor( +# env, MONITOR_DIR, video_callable=False, force=True) +# else: +# env = wrappers.Monitor(env, MONITOR_DIR, force=True) + +# train(sess, env, actor, critic) + +# if GYM_MONITOR_EN: +# env.monitor.close() + +# if __name__ == '__main__': +# tf.app.run() \ No newline at end of file From 6215058f187aa8bc77a7bbd1ed71ba6e3589cd96 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 18:48:24 -0400 Subject: [PATCH 19/43] fixing permami shape one at a time; absolutely disgusting code --- rl/agent/permami_ddpg.py | 71 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py index 6d0ec8f..ea6e1c8 100644 --- a/rl/agent/permami_ddpg.py +++ b/rl/agent/permami_ddpg.py @@ -117,21 +117,30 @@ def create_actor_network(self): def train(self, inputs, a_gradient): self.sess.run(self.optimize, feed_dict={ - self.inputs: np.reshape(inputs, (len(inputs), 1)), - # self.inputs: inputs, + self.inputs: inputs, self.action_gradient: a_gradient }) def predict(self, inputs): + # print('inputs') + # print('inputs') + # print('inputs') + # print('inputs') + # print(inputs.shape) + # print(inputs) return self.sess.run(self.scaled_out, feed_dict={ - self.inputs: np.reshape(inputs, 
(len(inputs), 1)) - # self.inputs: inputs + self.inputs: inputs }) def predict_target(self, inputs): + print('inputs') + print('inputs') + print('inputs') + print('inputs') + print(inputs.shape) + print(inputs) return self.sess.run(self.target_scaled_out, feed_dict={ - self.target_inputs: np.reshape(inputs, (len(inputs), 1)) - # self.target_inputs: inputs + self.target_inputs: inputs }) def update_target_network(self): @@ -205,37 +214,35 @@ def create_critic_network(self): return inputs, action, out def train(self, inputs, action, predicted_q_value): + print('train shapes') + print('train shapes') + print('train shapes') + print(inputs.shape) + print(action.shape) + print(predicted_q_value.shape) + print(predicted_q_value) return self.sess.run([self.out, self.optimize], feed_dict={ - self.inputs: np.reshape(inputs, (len(inputs), 1)), - self.action: np.reshape(action, (len(inputs), 1)), - self.predicted_q_value: np.reshape(predicted_q_value, (len(inputs), 1)), - # self.inputs: inputs, - # self.action: action, - # self.predicted_q_value: predicted_q_value + self.inputs: inputs, + self.action: np.reshape(action, (-1, self.a_dim)), + self.predicted_q_value: np.reshape(predicted_q_value[0], (-1, 1)) }) def predict(self, inputs, action): return self.sess.run(self.out, feed_dict={ - self.inputs: np.reshape(inputs, (len(inputs), 1)), - self.action: np.reshape(action, (len(inputs), 1)), - # self.inputs: inputs, - # self.action: action + self.inputs: inputs, + self.action: np.reshape(action, (-1, self.a_dim)) }) def predict_target(self, inputs, action): return self.sess.run(self.target_out, feed_dict={ - self.target_inputs: np.reshape(inputs, (len(inputs), 1)), - self.target_action: np.reshape(action, (len(inputs), 1)), - # self.target_inputs: inputs, - # self.target_action: action + self.target_inputs: inputs, + self.target_action: np.reshape(action, (-1, self.a_dim)) }) - def action_gradients(self, inputs, actions): + def action_gradients(self, inputs, action): return self.sess.run(self.action_grads, feed_dict={ - self.inputs: np.reshape(inputs, (len(inputs), 1)), - self.action: np.reshape(action, (len(inputs), 1)), - # self.inputs: inputs, - # self.action: actions + self.inputs: inputs, + self.action: np.reshape(action, (-1, self.a_dim)) }) def update_target_network(self): @@ -263,6 +270,7 @@ def __init__(self, *args, **kwargs): self.epi = 0 self.n_epoch = 1 self.batch_size = 64 + self.gamma = 0.99 # self.TAU = 0.001 # for target network updates super(PermamiDDPG, self).__init__(*args, **kwargs) @@ -274,6 +282,8 @@ def build_model(self): state_dim = self.env_spec['state_dim'] action_dim = self.env_spec['action_dim'] action_bound = self.env_spec['action_bound_high'] + self.s_dim = state_dim + self.a_dim = action_dim self.actor = ActorNetwork(self.sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) self.critic = CriticNetwork(self.sess, state_dim, action_dim, @@ -284,7 +294,7 @@ def compile_model(self): def select_action(self, state): i = self.epi - action = self.actor.predict(np.reshape(state, (1, 3))) + (1. / (1. + i)) + action = self.actor.predict(np.reshape(state, (-1, self.s_dim))) + (1. / (1. 
+ i)) return action def update(self, sys_vars): @@ -300,9 +310,10 @@ def to_train(self, sys_vars): def train_an_epoch(self): minibatch = self.memory.rand_minibatch(self.batch_size) - + s2_batch = np.reshape(minibatch['next_states'], (-1, self.s_dim)) target_q = self.critic.predict_target( - minibatch['next_states'], self.actor.predict_target(minibatch['next_states'])) + s2_batch, + self.actor.predict_target(s2_batch)) y_i = minibatch['rewards'] + self.gamma * \ (1 - minibatch['terminals']) * target_q @@ -313,7 +324,7 @@ def train_an_epoch(self): y_i) # np.reshape(y_i, (self.batch_size, 1))) - ep_ave_max_q += np.amax(predicted_q_value) + # ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient @@ -343,7 +354,7 @@ def train(self, sys_vars): loss_total = 0 for _epoch in range(self.n_epoch): loss = self.train_an_epoch() - loss_total += loss + # loss_total += loss avg_loss = loss_total / self.n_epoch sys_vars['loss'].append(avg_loss) return avg_loss From 434de8bc92a28c87b7c51746394c995269a4c3fa Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 19:05:43 -0400 Subject: [PATCH 20/43] disgusting ddpg hack running --- rl/agent/permami_ddpg.py | 7 ++++++- rl/spec/classic_experiment_specs.json | 24 ++++++++++++++++++++++++ rl/spec/component_locks.json | 3 ++- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py index ea6e1c8..87c2920 100644 --- a/rl/agent/permami_ddpg.py +++ b/rl/agent/permami_ddpg.py @@ -295,7 +295,12 @@ def compile_model(self): def select_action(self, state): i = self.epi action = self.actor.predict(np.reshape(state, (-1, self.s_dim))) + (1. / (1. + i)) - return action + print('action shape') + print('action shape') + print('action shape') + print(action) + print(action.shape) + return action[0] def update(self, sys_vars): self.epi = sys_vars['epi'] diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index bfbcae0..0d4acc7 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -819,6 +819,30 @@ ] } }, + "pendulum_permami_ddpg": { + "problem": "Pendulum-v0", + "Agent": "PermamiDDPG", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "DDPGBoundedPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.01, + "gamma": 0.999, + "hidden_layers": [200], + "hidden_layers_activation": "relu", + "output_layer_activation": "tanh" + }, + "param_range": { + "lr": [0.001, 0.01], + "gamma": [0.999], + "hidden_layers": [ + [400], + [400, 300] + ] + } + }, "mountain_dqn": { "problem": "MountainCar-v0", "Agent": "DQN", diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 117454c..25eaae3 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -17,7 +17,8 @@ "details": "ddpg uses white-noise policy", "head": "Agent", "Agent": [ - "DDPG" + "DDPG", + "PermamiDDPG" ], "Policy": [ "AnnealedGaussian", From 4fa1dbc24e9b2b3e9b02863639144b316ff36093 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 19:07:23 -0400 Subject: [PATCH 21/43] comment out print --- rl/agent/permami_ddpg.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py index 87c2920..3e8de7d 100644 --- a/rl/agent/permami_ddpg.py +++ b/rl/agent/permami_ddpg.py @@ -133,12 +133,12 @@ def predict(self, inputs): }) 
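    # predict() runs the online actor network (self.scaled_out), while
    # predict_target() below runs the target actor (self.target_scaled_out),
    # whose weights only track the online network through the tau-weighted
    # update_target_network_params op; its output is what forms the
    # critic's TD target in train_an_epoch.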
def predict_target(self, inputs): - print('inputs') - print('inputs') - print('inputs') - print('inputs') - print(inputs.shape) - print(inputs) + # print('inputs') + # print('inputs') + # print('inputs') + # print('inputs') + # print(inputs.shape) + # print(inputs) return self.sess.run(self.target_scaled_out, feed_dict={ self.target_inputs: inputs }) @@ -214,13 +214,13 @@ def create_critic_network(self): return inputs, action, out def train(self, inputs, action, predicted_q_value): - print('train shapes') - print('train shapes') - print('train shapes') - print(inputs.shape) - print(action.shape) - print(predicted_q_value.shape) - print(predicted_q_value) + # print('train shapes') + # print('train shapes') + # print('train shapes') + # print(inputs.shape) + # print(action.shape) + # print(predicted_q_value.shape) + # print(predicted_q_value) return self.sess.run([self.out, self.optimize], feed_dict={ self.inputs: inputs, self.action: np.reshape(action, (-1, self.a_dim)), @@ -295,11 +295,11 @@ def compile_model(self): def select_action(self, state): i = self.epi action = self.actor.predict(np.reshape(state, (-1, self.s_dim))) + (1. / (1. + i)) - print('action shape') - print('action shape') - print('action shape') - print(action) - print(action.shape) + # print('action shape') + # print('action shape') + # print('action shape') + # print(action) + # print(action.shape) return action[0] def update(self, sys_vars): From a6914012126a460988a4b4b9ee6f865a0f702c42 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 21:16:46 -0400 Subject: [PATCH 22/43] fucking got it, culprit was predicted_q_val shape --- rl/agent/permami_ddpg.py | 61 ++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py index 3e8de7d..37067b8 100644 --- a/rl/agent/permami_ddpg.py +++ b/rl/agent/permami_ddpg.py @@ -59,6 +59,7 @@ class ActorNetwork(object): """ Input to the network is the state, output is the action under a deterministic policy. + The output layer activation is a tanh to keep the action between -2 and 2 """ @@ -122,23 +123,11 @@ def train(self, inputs, a_gradient): }) def predict(self, inputs): - # print('inputs') - # print('inputs') - # print('inputs') - # print('inputs') - # print(inputs.shape) - # print(inputs) return self.sess.run(self.scaled_out, feed_dict={ self.inputs: inputs }) def predict_target(self, inputs): - # print('inputs') - # print('inputs') - # print('inputs') - # print('inputs') - # print(inputs.shape) - # print(inputs) return self.sess.run(self.target_scaled_out, feed_dict={ self.target_inputs: inputs }) @@ -154,6 +143,7 @@ class CriticNetwork(object): """ Input to the network is the state and action, output is Q(s,a). The action must be obtained from the output of the Actor network. 
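    Besides Q(s,a), the critic also exposes d(Q)/d(action) via
    action_gradients(), which is the signal fed back to the actor
    for its policy update.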
+ """ def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars): @@ -214,35 +204,28 @@ def create_critic_network(self): return inputs, action, out def train(self, inputs, action, predicted_q_value): - # print('train shapes') - # print('train shapes') - # print('train shapes') - # print(inputs.shape) - # print(action.shape) - # print(predicted_q_value.shape) - # print(predicted_q_value) return self.sess.run([self.out, self.optimize], feed_dict={ self.inputs: inputs, - self.action: np.reshape(action, (-1, self.a_dim)), - self.predicted_q_value: np.reshape(predicted_q_value[0], (-1, 1)) + self.action: action, + self.predicted_q_value: predicted_q_value }) def predict(self, inputs, action): return self.sess.run(self.out, feed_dict={ self.inputs: inputs, - self.action: np.reshape(action, (-1, self.a_dim)) + self.action: action }) def predict_target(self, inputs, action): return self.sess.run(self.target_out, feed_dict={ self.target_inputs: inputs, - self.target_action: np.reshape(action, (-1, self.a_dim)) + self.target_action: action }) - def action_gradients(self, inputs, action): + def action_gradients(self, inputs, actions): return self.sess.run(self.action_grads, feed_dict={ self.inputs: inputs, - self.action: np.reshape(action, (-1, self.a_dim)) + self.action: actions }) def update_target_network(self): @@ -310,32 +293,38 @@ def update(self, sys_vars): return def to_train(self, sys_vars): - # return sys_vars['t'] > MINIBATCH_SIZE - return True + return self.memory.size() > MINIBATCH_SIZE + # return True def train_an_epoch(self): minibatch = self.memory.rand_minibatch(self.batch_size) + s_batch = np.reshape(minibatch['states'], (-1, self.s_dim)) + a_batch = np.reshape(minibatch['actions'], (-1, self.a_dim)) s2_batch = np.reshape(minibatch['next_states'], (-1, self.s_dim)) + target_q = self.critic.predict_target( s2_batch, self.actor.predict_target(s2_batch)) y_i = minibatch['rewards'] + self.gamma * \ - (1 - minibatch['terminals']) * target_q + (1 - minibatch['terminals']) * np.reshape(target_q, (-1)) + y_i = np.reshape(y_i, (-1, 1)) predicted_q_value, _ = self.critic.train( - minibatch['states'], - minibatch['actions'], - y_i) - # np.reshape(y_i, (self.batch_size, 1))) + s_batch, a_batch, y_i) + # minibatch['states'], + # minibatch['actions'], + # y_i) + # # np.reshape(y_i, (self.batch_size, 1))) - # ep_ave_max_q += np.amax(predicted_q_value) + ep_ave_max_q = np.amax(predicted_q_value) + print('epi: ' + self.epi + ' Q_max: '+str(ep_ave_max_q)) # Update the actor policy using the sampled gradient - a_outs = self.actor.predict(minibatch['states']) - grads = self.critic.action_gradients(minibatch['states'], a_outs) - actor_loss = self.actor.train(minibatch['states'], grads[0]) + a_outs = self.actor.predict(s_batch) + grads = self.critic.action_gradients(s_batch, a_outs) + self.actor.train(s_batch, grads[0]) # return actor_loss return From 5a827c354c21d391591a0311d37b5d6095123a0f Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Apr 2017 23:51:15 -0400 Subject: [PATCH 23/43] fix permami typo --- rl/agent/permami_ddpg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py index 37067b8..1307263 100644 --- a/rl/agent/permami_ddpg.py +++ b/rl/agent/permami_ddpg.py @@ -317,8 +317,8 @@ def train_an_epoch(self): # y_i) # # np.reshape(y_i, (self.batch_size, 1))) - ep_ave_max_q = np.amax(predicted_q_value) - print('epi: ' + self.epi + ' Q_max: '+str(ep_ave_max_q)) + # ep_ave_max_q = 
np.amax(predicted_q_value) + # print('epi: ' + str(self.epi) + ' Q_max: '+str(ep_ave_max_q)) # Update the actor policy using the sampled gradient From fd17088a9a1708dd46eba61f9b2823db32dbfee2 Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 17 Apr 2017 08:57:40 -0400 Subject: [PATCH 24/43] runnable ddpg2 from permami, still not working yet --- rl/agent/ddpg_2.py | 366 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 rl/agent/ddpg_2.py diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py new file mode 100644 index 0000000..c75b399 --- /dev/null +++ b/rl/agent/ddpg_2.py @@ -0,0 +1,366 @@ +""" +DDPG implementation from https://github.com/pemami4911/deep-rl/blob/master/ddpg/ddpg.py +Implementation of DDPG - Deep Deterministic Policy Gradient +Algorithm and hyperparameter details can be found here: + http://arxiv.org/pdf/1509.02971v2.pdf +The algorithm is tested on the Pendulum-v0 OpenAI gym task +and developed with tflearn + Tensorflow +Author: Patrick Emami +""" +import numpy as np +from rl.agent.dqn import DQN +from rl.util import logger, clone_model, clone_optimizer + +from rl.agent.base_agent import Agent +import tensorflow as tf +# import gym +# from gym import wrappers +import tflearn + +# from replay_buffer import ReplayBuffer + +# ========================== +# Training Parameters +# ========================== +# Max training steps +MAX_EPISODES = 50000 +# Max episode length +MAX_EP_STEPS = 1000 +# Base learning rate for the Actor network +ACTOR_LEARNING_RATE = 0.001 +# Base learning rate for the Critic Network +CRITIC_LEARNING_RATE = 0.001 +# Discount factor +GAMMA = 0.99 +# Soft target update param +TAU = 0.001 + +# # =========================== +# # Utility Parameters +# # =========================== +# # Render gym env during training +# RENDER_ENV = True +# # Use Gym Monitor +# GYM_MONITOR_EN = True +# # Gym environment +# ENV_NAME = 'Pendulum-v0' +# # Directory for storing gym results +# MONITOR_DIR = './results/gym_ddpg' +# # Directory for storing tensorboard summary results +# SUMMARY_DIR = './results/tf_ddpg' +# RANDOM_SEED = 1234 +# Size of replay buffer +BUFFER_SIZE = 10000 +MINIBATCH_SIZE = 64 + +# =========================== +# Actor and Critic DNNs +# =========================== + + +class ActorNetwork(DQN): + """ + Input to the network is the state, output is the action + under a deterministic policy. + + The output layer activation is a tanh to keep the action + between -2 and 2 + """ + + def __init__(self, *args, **kwargs): + from keras import backend as K + self.K = K + self.tf = self.K.tf + self.sess = self.K.get_session() + self.tau = 0.001 + super(ActorNetwork, self).__init__(*args, **kwargs) + + def build_model(self): + self.model = super(ActorNetwork, self).build_model() + self.target_model = clone_model(self.model) + + self.actor_state = self.model.inputs[0] + self.out = self.model.output + self.network_params = self.model.trainable_weights + + self.target_actor_state = self.target_model.inputs[0] + self.target_out = self.target_model.output + self.target_network_params = self.target_model.trainable_weights + + # Op for updating target network + self.update_target_network_op = [] + for i, t_w in enumerate(self.target_network_params): + op = t_w.assign( + self.tf.multiply(self.tau, self.network_params[i]) + self.tf.multiply(1. 
- self.tau, t_w)) + self.update_target_network_op.append(op) + + # will be fed as self.action_gradient: critic_grads + self.action_gradient = self.tf.placeholder( + self.tf.float32, [None, self.env_spec['action_dim']]) + + # final gradients op for actor network + # TODO need to scale out + self.actor_gradients = self.tf.gradients( + self.out, self.network_params, -self.action_gradient) + + # Optimization Op + self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients( + zip(self.actor_gradients, self.network_params)) + return self.model + + def compile_model(self): + pass + + def recompile_model(self, sys_vars): + pass + + def update(self): + self.sess.run(self.update_target_network_op) + + def train(self, inputs, a_gradient): + self.sess.run(self.optimize, feed_dict={ + self.actor_state: inputs, + self.action_gradient: a_gradient + }) + + def predict(self, inputs): + return self.sess.run(self.out, feed_dict={ + self.actor_state: inputs + }) + + def predict_target(self, inputs): + return self.sess.run(self.target_out, feed_dict={ + self.target_actor_state: inputs + }) + + +class CriticNetwork(DQN): + """ + Input to the network is the state and action, output is Q(s,a). + The action must be obtained from the output of the Actor network. + + """ + def __init__(self, *args, **kwargs): + from keras.layers import Dense, Merge + from keras import backend as K + self.Merge = Merge + self.K = K + self.tf = self.K.tf + self.sess = self.K.get_session() + self.tau = 0.001 + super(CriticNetwork, self).__init__(*args, **kwargs) + + + def build_critic_models(self): + state_branch = self.Sequential() + state_branch.add(self.Dense( + self.hidden_layers[0], + input_shape=(self.env_spec['state_dim'],), + activation=self.hidden_layers_activation, + init='lecun_uniform')) + + action_branch = self.Sequential() + action_branch.add(self.Dense( + self.hidden_layers[0], + input_shape=(self.env_spec['action_dim'],), + activation=self.hidden_layers_activation, + init='lecun_uniform')) + + input_layer = self.Merge([state_branch, action_branch], mode='concat') + + model = self.Sequential() + model.add(input_layer) + + if (len(self.hidden_layers) > 1): + for i in range(1, len(self.hidden_layers)): + model.add(self.Dense( + self.hidden_layers[i], + init='lecun_uniform', + activation=self.hidden_layers_activation)) + + model.add(self.Dense(1, + init='lecun_uniform', + activation=self.output_layer_activation)) + logger.info('Critic model summary') + model.summary() + self.model = model + + logger.info("Model built") + return self.model + + + def mean_squared_error(self, y_true, y_pred): + return self.K.mean(self.K.square(y_pred - y_true), axis=-1) + + def build_model(self): + self.model = self.build_critic_models() + self.target_model = clone_model(self.model) + + self.critic_state = self.model.inputs[0] + self.critic_action = self.model.inputs[1] + self.out = self.model.output + self.network_params = self.model.trainable_weights + + self.target_critic_state = self.target_model.inputs[0] + self.target_critic_action = self.target_model.inputs[1] + self.target_out = self.target_model.output + self.target_network_params = self.target_model.trainable_weights + + # Op for updating target network + self.update_target_network_op = [] + for i, t_w in enumerate(self.target_network_params): + op = t_w.assign( + self.tf.multiply(self.tau, self.network_params[i]) + self.tf.multiply(1. 
- self.tau, t_w)) + self.update_target_network_op.append(op) + + # custom loss and optimization Op + self.q_prime = self.tf.placeholder(self.tf.float32, [None, 1]) + self.loss = self.mean_squared_error(self.q_prime, self.out) + self.optimize = self.tf.train.AdamOptimizer(self.lr).minimize(self.loss) + + self.action_gradient = self.tf.gradients(self.out, self.critic_action) + return self.model + + + def train(self, inputs, action, q_prime): + return self.sess.run([self.out, self.optimize], feed_dict={ + self.critic_state: inputs, + self.critic_action: action, + self.q_prime: q_prime + }) + + def predict(self, inputs, action): + return self.sess.run(self.out, feed_dict={ + self.critic_state: inputs, + self.critic_action: action + }) + + def predict_target(self, inputs, action): + return self.sess.run(self.target_out, feed_dict={ + self.target_critic_state: inputs, + self.target_critic_action: action + }) + + def action_gradients(self, inputs, actions): + return self.sess.run(self.action_gradient, feed_dict={ + self.critic_state: inputs, + self.critic_action: actions + }) + + def update(self): + self.sess.run(self.update_target_network_op) + + +class DDPG2(Agent): + + ''' + The DDPG2 agent (algo), from https://arxiv.org/abs/1509.02971 + reference: https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html + https://github.com/matthiasplappert/keras-rl + ''' + + def __init__(self, *args, **kwargs): + # import only when needed to contain side-effects + # from keras.layers import Dense, Merge + # from keras.models import Sequential + from keras import backend as K + # self.Dense = Dense + # self.Merge = Merge + # self.Sequential = Sequential + self.K = K + self.sess = self.K.get_session() + + self.epi = 0 + self.n_epoch = 1 + self.batch_size = 64 + self.gamma = 0.99 + + # self.TAU = 0.001 # for target network updates + super(DDPG2, self).__init__(*args, **kwargs) + self.build_model(*args, **kwargs) + self.sess.run(tf.global_variables_initializer()) + + def build_model(self, *args, **kwargs): + self.actor = ActorNetwork(*args, **kwargs) + self.critic = CriticNetwork(*args, **kwargs) + + def compile_model(self): + pass + + def select_action(self, state): + i = self.epi + action = self.actor.predict(np.reshape( + state, (-1, self.env_spec['state_dim']))) + (1. / (1. 
+ i)) + # print('action shape') + # print('action shape') + # print('action shape') + # print(action) + # print(action.shape) + return action[0] + + def update(self, sys_vars): + self.epi = sys_vars['epi'] + # Update target networks + self.actor.update() + self.critic.update() + return + + def to_train(self, sys_vars): + return self.memory.size() > MINIBATCH_SIZE + # return True + + def train_an_epoch(self): + minibatch = self.memory.rand_minibatch(self.batch_size) + s_batch = np.reshape(minibatch['states'], (-1, self.env_spec['state_dim'])) + a_batch = np.reshape(minibatch['actions'], (-1, self.env_spec['action_dim'])) + s2_batch = np.reshape(minibatch['next_states'], (-1, self.env_spec['state_dim'])) + + target_q = self.critic.predict_target( + s2_batch, + self.actor.predict_target(s2_batch)) + + y = minibatch['rewards'] + self.gamma * \ + (1 - minibatch['terminals']) * np.reshape(target_q, (-1)) + y = np.reshape(y, (-1, 1)) + + predicted_q_value, _ = self.critic.train( + s_batch, a_batch, y) + # minibatch['states'], + # minibatch['actions'], + # y) + # # np.reshape(y, (self.batch_size, 1))) + + # ep_ave_max_q = np.amax(predicted_q_value) + # print('epi: ' + str(self.epi) + ' Q_max: '+str(ep_ave_max_q)) + + # Update the actor policy using the sampled gradient + a_outs = self.actor.predict(s_batch) + grads = self.critic.action_gradients(s_batch, a_outs) + self.actor.train(s_batch, grads[0]) + # return actor_loss + return + + # (Q_states, _states, Q_next_states_max) = self.compute_Q_states( + # minibatch) + # Q_targets = self.compute_Q_targets( + # minibatch, Q_states, Q_next_states_max) + + # loss = self.model.train_on_batch(minibatch['states'], Q_targets) + + # errors = abs(np.sum(Q_states - Q_targets, axis=1)) + # self.memory.update(errors) + # return loss + + def train(self, sys_vars): + ''' + Training is for the Q function (NN) only + otherwise (e.g. policy) see self.update() + step 1,2,3,4 of algo. 
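        Roughly: sample a minibatch, fit the critic to
        y = r + gamma * (1 - terminal) * Q'(s', mu'(s')),
        take the critic's d(Q)/d(action) at mu(s) and apply it to the actor;
        the target networks are soft-updated in self.update().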
+ ''' + loss_total = 0 + for _epoch in range(self.n_epoch): + loss = self.train_an_epoch() + # loss_total += loss + avg_loss = loss_total / self.n_epoch + sys_vars['loss'].append(avg_loss) + return avg_loss From bc0e3f99094f05cf69f66907c393f00667196d1c Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 17 Apr 2017 21:38:32 -0400 Subject: [PATCH 25/43] DDPG2 WORKING AT LAST --- rl/agent/ddpg_2.py | 19 +++++++++++------- rl/spec/classic_experiment_specs.json | 29 +++++++++++++++++++++++++-- rl/spec/component_locks.json | 1 + 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index c75b399..40b5044 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -81,10 +81,12 @@ def build_model(self): self.actor_state = self.model.inputs[0] self.out = self.model.output + self.scaled_out = self.tf.multiply(self.out, self.env_spec['action_bound_high']) self.network_params = self.model.trainable_weights self.target_actor_state = self.target_model.inputs[0] self.target_out = self.target_model.output + self.target_scaled_out = self.tf.multiply(self.target_out, self.env_spec['action_bound_high']) self.target_network_params = self.target_model.trainable_weights # Op for updating target network @@ -101,7 +103,7 @@ def build_model(self): # final gradients op for actor network # TODO need to scale out self.actor_gradients = self.tf.gradients( - self.out, self.network_params, -self.action_gradient) + self.scaled_out, self.network_params, -self.action_gradient) # Optimization Op self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients( @@ -124,12 +126,12 @@ def train(self, inputs, a_gradient): }) def predict(self, inputs): - return self.sess.run(self.out, feed_dict={ + return self.sess.run(self.scaled_out, feed_dict={ self.actor_state: inputs }) def predict_target(self, inputs): - return self.sess.run(self.target_out, feed_dict={ + return self.sess.run(self.target_scaled_out, feed_dict={ self.target_actor_state: inputs }) @@ -180,7 +182,7 @@ def build_critic_models(self): model.add(self.Dense(1, init='lecun_uniform', - activation=self.output_layer_activation)) + activation='linear')) logger.info('Critic model summary') model.summary() self.model = model @@ -311,9 +313,12 @@ def to_train(self, sys_vars): def train_an_epoch(self): minibatch = self.memory.rand_minibatch(self.batch_size) - s_batch = np.reshape(minibatch['states'], (-1, self.env_spec['state_dim'])) - a_batch = np.reshape(minibatch['actions'], (-1, self.env_spec['action_dim'])) - s2_batch = np.reshape(minibatch['next_states'], (-1, self.env_spec['state_dim'])) + s_batch = minibatch['states'] + a_batch = minibatch['actions'] + s2_batch = minibatch['next_states'] + # s_batch = np.reshape(minibatch['states'], (-1, self.env_spec['state_dim'])) + # a_batch = np.reshape(minibatch['actions'], (-1, self.env_spec['action_dim'])) + # s2_batch = np.reshape(minibatch['next_states'], (-1, self.env_spec['state_dim'])) target_q = self.critic.predict_target( s2_batch, diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 0d4acc7..3c7f347 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -804,9 +804,10 @@ "Policy": "DDPGBoundedPolicy", "PreProcessor": "NoPreProcessor", "param": { - "lr": 0.01, + "batch_size": 64, + "lr": 0.001, "gamma": 0.999, - "hidden_layers": [200], + "hidden_layers": [400, 300], "hidden_layers_activation": "relu", "output_layer_activation": "tanh" }, @@ -843,6 +844,30 @@ ] } }, + 
"pendulum_ddpg2": { + "problem": "Pendulum-v0", + "Agent": "DDPG2", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "DDPGBoundedPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.001, + "gamma": 0.999, + "hidden_layers": [400, 300], + "hidden_layers_activation": "relu", + "output_layer_activation": "tanh" + }, + "param_range": { + "lr": [0.001, 0.01], + "gamma": [0.999], + "hidden_layers": [ + [400], + [400, 300] + ] + } + }, "mountain_dqn": { "problem": "MountainCar-v0", "Agent": "DQN", diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 25eaae3..761b5df 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -18,6 +18,7 @@ "head": "Agent", "Agent": [ "DDPG", + "DDPG2", "PermamiDDPG" ], "Policy": [ From ea48d795e2d132ae15af190870f93fe8a619e25c Mon Sep 17 00:00:00 2001 From: kengz Date: Mon, 17 Apr 2017 23:51:07 -0400 Subject: [PATCH 26/43] refactor ddpg and rename methods, variables properly --- rl/agent/ddpg_2.py | 279 ++++++++++++++++----------------------------- 1 file changed, 99 insertions(+), 180 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index 40b5044..e46fbe5 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -1,71 +1,15 @@ -""" -DDPG implementation from https://github.com/pemami4911/deep-rl/blob/master/ddpg/ddpg.py -Implementation of DDPG - Deep Deterministic Policy Gradient -Algorithm and hyperparameter details can be found here: - http://arxiv.org/pdf/1509.02971v2.pdf -The algorithm is tested on the Pendulum-v0 OpenAI gym task -and developed with tflearn + Tensorflow -Author: Patrick Emami -""" import numpy as np +from rl.agent.base_agent import Agent from rl.agent.dqn import DQN from rl.util import logger, clone_model, clone_optimizer -from rl.agent.base_agent import Agent -import tensorflow as tf -# import gym -# from gym import wrappers -import tflearn - -# from replay_buffer import ReplayBuffer - -# ========================== -# Training Parameters -# ========================== -# Max training steps -MAX_EPISODES = 50000 -# Max episode length -MAX_EP_STEPS = 1000 -# Base learning rate for the Actor network -ACTOR_LEARNING_RATE = 0.001 -# Base learning rate for the Critic Network -CRITIC_LEARNING_RATE = 0.001 -# Discount factor -GAMMA = 0.99 -# Soft target update param -TAU = 0.001 - -# # =========================== -# # Utility Parameters -# # =========================== -# # Render gym env during training -# RENDER_ENV = True -# # Use Gym Monitor -# GYM_MONITOR_EN = True -# # Gym environment -# ENV_NAME = 'Pendulum-v0' -# # Directory for storing gym results -# MONITOR_DIR = './results/gym_ddpg' -# # Directory for storing tensorboard summary results -# SUMMARY_DIR = './results/tf_ddpg' -# RANDOM_SEED = 1234 -# Size of replay buffer -BUFFER_SIZE = 10000 -MINIBATCH_SIZE = 64 - -# =========================== -# Actor and Critic DNNs -# =========================== - - -class ActorNetwork(DQN): - """ - Input to the network is the state, output is the action - under a deterministic policy. 
- - The output layer activation is a tanh to keep the action - between -2 and 2 - """ + +class Actor(DQN): + ''' + Actor of DDPG, with its network and target network + input is states, output is action + very similar to DQN + ''' def __init__(self, *args, **kwargs): from keras import backend as K @@ -73,39 +17,42 @@ def __init__(self, *args, **kwargs): self.tf = self.K.tf self.sess = self.K.get_session() self.tau = 0.001 - super(ActorNetwork, self).__init__(*args, **kwargs) + super(Actor, self).__init__(*args, **kwargs) def build_model(self): - self.model = super(ActorNetwork, self).build_model() + self.model = super(Actor, self).build_model() self.target_model = clone_model(self.model) - self.actor_state = self.model.inputs[0] + self.actor_states = self.model.inputs[0] self.out = self.model.output - self.scaled_out = self.tf.multiply(self.out, self.env_spec['action_bound_high']) + self.scaled_out = self.tf.multiply( + self.out, self.env_spec['action_bound_high']) self.network_params = self.model.trainable_weights - self.target_actor_state = self.target_model.inputs[0] + self.target_actor_states = self.target_model.inputs[0] self.target_out = self.target_model.output - self.target_scaled_out = self.tf.multiply(self.target_out, self.env_spec['action_bound_high']) + self.target_scaled_out = self.tf.multiply( + self.target_out, self.env_spec['action_bound_high']) self.target_network_params = self.target_model.trainable_weights # Op for updating target network self.update_target_network_op = [] for i, t_w in enumerate(self.target_network_params): op = t_w.assign( - self.tf.multiply(self.tau, self.network_params[i]) + self.tf.multiply(1. - self.tau, t_w)) + self.tf.multiply( + self.tau, self.network_params[i] + ) + self.tf.multiply(1. - self.tau, t_w)) self.update_target_network_op.append(op) # will be fed as self.action_gradient: critic_grads self.action_gradient = self.tf.placeholder( self.tf.float32, [None, self.env_spec['action_dim']]) - # final gradients op for actor network - # TODO need to scale out + # actor model gradient op, to be fed from critic self.actor_gradients = self.tf.gradients( self.scaled_out, self.network_params, -self.action_gradient) - # Optimization Op + # Optimization op self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients( zip(self.actor_gradients, self.network_params)) return self.model @@ -119,29 +66,31 @@ def recompile_model(self, sys_vars): def update(self): self.sess.run(self.update_target_network_op) - def train(self, inputs, a_gradient): - self.sess.run(self.optimize, feed_dict={ - self.actor_state: inputs, - self.action_gradient: a_gradient - }) - - def predict(self, inputs): + def predict(self, states): return self.sess.run(self.scaled_out, feed_dict={ - self.actor_state: inputs + self.actor_states: states }) - def predict_target(self, inputs): + def target_predict(self, next_states): return self.sess.run(self.target_scaled_out, feed_dict={ - self.target_actor_state: inputs + self.target_actor_states: next_states + }) + + def train(self, states, critic_action_gradient): + self.sess.run(self.optimize, feed_dict={ + self.actor_states: states, + self.action_gradient: critic_action_gradient }) -class CriticNetwork(DQN): - """ - Input to the network is the state and action, output is Q(s,a). - The action must be obtained from the output of the Actor network. 
+class Critic(DQN): + + ''' + Critic of DDPG, with its network and target network + input is states and actions, output is Q value + the action is from Actor + ''' - """ def __init__(self, *args, **kwargs): from keras.layers import Dense, Merge from keras import backend as K @@ -150,8 +99,7 @@ def __init__(self, *args, **kwargs): self.tf = self.K.tf self.sess = self.K.get_session() self.tau = 0.001 - super(CriticNetwork, self).__init__(*args, **kwargs) - + super(Critic, self).__init__(*args, **kwargs) def build_critic_models(self): state_branch = self.Sequential() @@ -190,7 +138,6 @@ def build_critic_models(self): logger.info("Model built") return self.model - def mean_squared_error(self, y_true, y_pred): return self.K.mean(self.K.square(y_pred - y_true), axis=-1) @@ -198,13 +145,13 @@ def build_model(self): self.model = self.build_critic_models() self.target_model = clone_model(self.model) - self.critic_state = self.model.inputs[0] - self.critic_action = self.model.inputs[1] + self.critic_states = self.model.inputs[0] + self.critic_actions = self.model.inputs[1] self.out = self.model.output self.network_params = self.model.trainable_weights - self.target_critic_state = self.target_model.inputs[0] - self.target_critic_action = self.target_model.inputs[1] + self.target_critic_states = self.target_model.inputs[0] + self.target_critic_actions = self.target_model.inputs[1] self.target_out = self.target_model.output self.target_network_params = self.target_model.trainable_weights @@ -212,92 +159,87 @@ def build_model(self): self.update_target_network_op = [] for i, t_w in enumerate(self.target_network_params): op = t_w.assign( - self.tf.multiply(self.tau, self.network_params[i]) + self.tf.multiply(1. - self.tau, t_w)) + self.tf.multiply( + self.tau, self.network_params[i] + ) + self.tf.multiply(1. 
- self.tau, t_w)) self.update_target_network_op.append(op) # custom loss and optimization Op - self.q_prime = self.tf.placeholder(self.tf.float32, [None, 1]) - self.loss = self.mean_squared_error(self.q_prime, self.out) - self.optimize = self.tf.train.AdamOptimizer(self.lr).minimize(self.loss) + self.y = self.tf.placeholder(self.tf.float32, [None, 1]) + self.loss = self.mean_squared_error(self.y, self.out) + self.optimize = self.tf.train.AdamOptimizer( + self.lr).minimize(self.loss) - self.action_gradient = self.tf.gradients(self.out, self.critic_action) + self.action_gradient = self.tf.gradients(self.out, self.critic_actions) return self.model + def update(self): + self.sess.run(self.update_target_network_op) - def train(self, inputs, action, q_prime): - return self.sess.run([self.out, self.optimize], feed_dict={ - self.critic_state: inputs, - self.critic_action: action, - self.q_prime: q_prime - }) + def get_action_gradient(self, states, actions): + return self.sess.run(self.action_gradient, feed_dict={ + self.critic_states: states, + self.critic_actions: actions + })[0] - def predict(self, inputs, action): - return self.sess.run(self.out, feed_dict={ - self.critic_state: inputs, - self.critic_action: action - }) + # def predict(self, inputs, action): + # return self.sess.run(self.out, feed_dict={ + # self.critic_states: inputs, + # self.critic_actions: action + # }) - def predict_target(self, inputs, action): + def target_predict(self, next_states, mu_prime): return self.sess.run(self.target_out, feed_dict={ - self.target_critic_state: inputs, - self.target_critic_action: action + self.target_critic_states: next_states, + self.target_critic_actions: mu_prime }) - def action_gradients(self, inputs, actions): - return self.sess.run(self.action_gradient, feed_dict={ - self.critic_state: inputs, - self.critic_action: actions + def train(self, states, actions, y): + return self.sess.run([self.out, self.optimize], feed_dict={ + self.critic_states: states, + self.critic_actions: actions, + self.y: y }) - def update(self): - self.sess.run(self.update_target_network_op) - class DDPG2(Agent): ''' - The DDPG2 agent (algo), from https://arxiv.org/abs/1509.02971 - reference: https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html - https://github.com/matthiasplappert/keras-rl + DDPG Algorithm, from https://arxiv.org/abs/1509.02971 + has Actor, Critic, and each has its own target network + Implementation referred from https://github.com/pemami4911/deep-rl ''' def __init__(self, *args, **kwargs): # import only when needed to contain side-effects - # from keras.layers import Dense, Merge - # from keras.models import Sequential from keras import backend as K - # self.Dense = Dense - # self.Merge = Merge - # self.Sequential = Sequential self.K = K self.sess = self.K.get_session() + # TODO absorb properly self.epi = 0 self.n_epoch = 1 self.batch_size = 64 self.gamma = 0.99 - # self.TAU = 0.001 # for target network updates super(DDPG2, self).__init__(*args, **kwargs) self.build_model(*args, **kwargs) - self.sess.run(tf.global_variables_initializer()) + self.sess.run(self.K.tf.global_variables_initializer()) def build_model(self, *args, **kwargs): - self.actor = ActorNetwork(*args, **kwargs) - self.critic = CriticNetwork(*args, **kwargs) + # TODO prolly wanna unify self.tf + self.actor = Actor(*args, **kwargs) + self.critic = Critic(*args, **kwargs) def compile_model(self): pass def select_action(self, state): + # TODO externalize to policy i = self.epi + # TODO can we use expand dims? 
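        # (np.expand_dims(state, axis=0) would give the same (1, state_dim)
        # batch shape as the reshape below)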
action = self.actor.predict(np.reshape( state, (-1, self.env_spec['state_dim']))) + (1. / (1. + i)) - # print('action shape') - # print('action shape') - # print('action shape') - # print(action) - # print(action.shape) return action[0] def update(self, sys_vars): @@ -305,63 +247,40 @@ def update(self, sys_vars): # Update target networks self.actor.update() self.critic.update() - return def to_train(self, sys_vars): - return self.memory.size() > MINIBATCH_SIZE - # return True + return self.memory.size() > self.batch_size def train_an_epoch(self): minibatch = self.memory.rand_minibatch(self.batch_size) - s_batch = minibatch['states'] - a_batch = minibatch['actions'] - s2_batch = minibatch['next_states'] - # s_batch = np.reshape(minibatch['states'], (-1, self.env_spec['state_dim'])) - # a_batch = np.reshape(minibatch['actions'], (-1, self.env_spec['action_dim'])) - # s2_batch = np.reshape(minibatch['next_states'], (-1, self.env_spec['state_dim'])) - - target_q = self.critic.predict_target( - s2_batch, - self.actor.predict_target(s2_batch)) + # train critic + mu_prime = self.actor.target_predict(minibatch['next_states']) + q_prime = self.critic.target_predict( + minibatch['next_states'], mu_prime) + # TODO double check reshape, justify y = minibatch['rewards'] + self.gamma * \ - (1 - minibatch['terminals']) * np.reshape(target_q, (-1)) + (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) y = np.reshape(y, (-1, 1)) + # TODO want this to be loss predicted_q_value, _ = self.critic.train( - s_batch, a_batch, y) - # minibatch['states'], - # minibatch['actions'], - # y) - # # np.reshape(y, (self.batch_size, 1))) - - # ep_ave_max_q = np.amax(predicted_q_value) - # print('epi: ' + str(self.epi) + ' Q_max: '+str(ep_ave_max_q)) + minibatch['states'], minibatch['actions'], y) + # train actor # Update the actor policy using the sampled gradient - a_outs = self.actor.predict(s_batch) - grads = self.critic.action_gradients(s_batch, a_outs) - self.actor.train(s_batch, grads[0]) + actions = self.actor.predict(minibatch['states']) + critic_action_gradient = self.critic.get_action_gradient( + minibatch['states'], actions) + # TODO rename all function args consistently + # TODO want this to be loss too + actor_loss = self.actor.train( + minibatch['states'], critic_action_gradient) # return actor_loss + # loss = critic_loss + actor_loss return - # (Q_states, _states, Q_next_states_max) = self.compute_Q_states( - # minibatch) - # Q_targets = self.compute_Q_targets( - # minibatch, Q_states, Q_next_states_max) - - # loss = self.model.train_on_batch(minibatch['states'], Q_targets) - - # errors = abs(np.sum(Q_states - Q_targets, axis=1)) - # self.memory.update(errors) - # return loss - def train(self, sys_vars): - ''' - Training is for the Q function (NN) only - otherwise (e.g. policy) see self.update() - step 1,2,3,4 of algo. 
- ''' loss_total = 0 for _epoch in range(self.n_epoch): loss = self.train_an_epoch() From 366f2298c5f9328922b10f4381d317b0444bbace Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 00:31:29 -0400 Subject: [PATCH 27/43] use tf losses; return critic_loss from run --- rl/agent/ddpg_2.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index e46fbe5..a59f366 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -77,7 +77,7 @@ def target_predict(self, next_states): }) def train(self, states, critic_action_gradient): - self.sess.run(self.optimize, feed_dict={ + return self.sess.run(self.optimize, feed_dict={ self.actor_states: states, self.action_gradient: critic_action_gradient }) @@ -138,9 +138,6 @@ def build_critic_models(self): logger.info("Model built") return self.model - def mean_squared_error(self, y_true, y_pred): - return self.K.mean(self.K.square(y_pred - y_true), axis=-1) - def build_model(self): self.model = self.build_critic_models() self.target_model = clone_model(self.model) @@ -166,7 +163,7 @@ def build_model(self): # custom loss and optimization Op self.y = self.tf.placeholder(self.tf.float32, [None, 1]) - self.loss = self.mean_squared_error(self.y, self.out) + self.loss = self.tf.losses.mean_squared_error(self.y, self.out) self.optimize = self.tf.train.AdamOptimizer( self.lr).minimize(self.loss) @@ -195,7 +192,7 @@ def target_predict(self, next_states, mu_prime): }) def train(self, states, actions, y): - return self.sess.run([self.out, self.optimize], feed_dict={ + return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ self.critic_states: states, self.critic_actions: actions, self.y: y @@ -238,8 +235,8 @@ def select_action(self, state): # TODO externalize to policy i = self.epi # TODO can we use expand dims? - action = self.actor.predict(np.reshape( - state, (-1, self.env_spec['state_dim']))) + (1. / (1. + i)) + action = self.actor.predict( + np.expand_dims(state, axis=0)) + (1. / (1. 
+ i)) return action[0] def update(self, sys_vars): @@ -258,13 +255,14 @@ def train_an_epoch(self): mu_prime = self.actor.target_predict(minibatch['next_states']) q_prime = self.critic.target_predict( minibatch['next_states'], mu_prime) - # TODO double check reshape, justify + # reshape for element-wise multiplication + # to feed into network, y shape needs to be (?, 1) y = minibatch['rewards'] + self.gamma * \ (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) y = np.reshape(y, (-1, 1)) # TODO want this to be loss - predicted_q_value, _ = self.critic.train( + predicted_q_value, _, critic_loss = self.critic.train( minibatch['states'], minibatch['actions'], y) # train actor @@ -272,10 +270,10 @@ def train_an_epoch(self): actions = self.actor.predict(minibatch['states']) critic_action_gradient = self.critic.get_action_gradient( minibatch['states'], actions) - # TODO rename all function args consistently # TODO want this to be loss too actor_loss = self.actor.train( minibatch['states'], critic_action_gradient) + # return actor_loss # loss = critic_loss + actor_loss return From 1e79a98c5f51caa616485f46d69cbeefcfe17451 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 00:44:13 -0400 Subject: [PATCH 28/43] restore critic_loss --- rl/agent/ddpg_2.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index a59f366..0b2b834 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -234,7 +234,6 @@ def compile_model(self): def select_action(self, state): # TODO externalize to policy i = self.epi - # TODO can we use expand dims? action = self.actor.predict( np.expand_dims(state, axis=0)) + (1. / (1. + i)) return action[0] @@ -261,8 +260,7 @@ def train_an_epoch(self): (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) y = np.reshape(y, (-1, 1)) - # TODO want this to be loss - predicted_q_value, _, critic_loss = self.critic.train( + _, _, critic_loss = self.critic.train( minibatch['states'], minibatch['actions'], y) # train actor @@ -270,19 +268,18 @@ def train_an_epoch(self): actions = self.actor.predict(minibatch['states']) critic_action_gradient = self.critic.get_action_gradient( minibatch['states'], actions) - # TODO want this to be loss too - actor_loss = self.actor.train( + # currently cant be gotten + _actor_loss = self.actor.train( minibatch['states'], critic_action_gradient) - # return actor_loss - # loss = critic_loss + actor_loss - return + loss = critic_loss + return loss def train(self, sys_vars): loss_total = 0 for _epoch in range(self.n_epoch): loss = self.train_an_epoch() - # loss_total += loss + loss_total += loss avg_loss = loss_total / self.n_epoch sys_vars['loss'].append(avg_loss) return avg_loss From c4d21b8fa1cd8ec1e0061c2792dbb2beb7efc4b0 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 01:15:32 -0400 Subject: [PATCH 29/43] source ddpg main class from dqn; propagate some param settings properly --- rl/agent/ddpg_2.py | 49 ++++++++++++++++++---------------------------- rl/agent/dqn.py | 2 +- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index 0b2b834..8ba8752 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -1,5 +1,4 @@ import numpy as np -from rl.agent.base_agent import Agent from rl.agent.dqn import DQN from rl.util import logger, clone_model, clone_optimizer @@ -11,12 +10,12 @@ class Actor(DQN): very similar to DQN ''' - def __init__(self, *args, **kwargs): + def __init__(self, *args, tau=0.001, **kwargs): 
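        # tau is the soft-update coefficient for the target network,
        # i.e. theta_target <- tau * theta + (1 - tau) * theta_target,
        # as built in update_target_network_op inside build_model()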
from keras import backend as K self.K = K self.tf = self.K.tf self.sess = self.K.get_session() - self.tau = 0.001 + self.tau = tau super(Actor, self).__init__(*args, **kwargs) def build_model(self): @@ -91,14 +90,14 @@ class Critic(DQN): the action is from Actor ''' - def __init__(self, *args, **kwargs): + def __init__(self, *args, tau=0.001, **kwargs): from keras.layers import Dense, Merge from keras import backend as K self.Merge = Merge self.K = K self.tf = self.K.tf self.sess = self.K.get_session() - self.tau = 0.001 + self.tau = tau super(Critic, self).__init__(*args, **kwargs) def build_critic_models(self): @@ -199,7 +198,7 @@ def train(self, states, actions, y): }) -class DDPG2(Agent): +class DDPG2(DQN): ''' DDPG Algorithm, from https://arxiv.org/abs/1509.02971 @@ -212,40 +211,39 @@ def __init__(self, *args, **kwargs): from keras import backend as K self.K = K self.sess = self.K.get_session() + self.actor = Actor(*args, **kwargs) + self.critic = Critic(*args, **kwargs) + self.sess.run(self.K.tf.global_variables_initializer()) + super(DDPG2, self).__init__(*args, **kwargs) - # TODO absorb properly + # TODO remove self.epi = 0 - self.n_epoch = 1 - self.batch_size = 64 - self.gamma = 0.99 - - super(DDPG2, self).__init__(*args, **kwargs) - self.build_model(*args, **kwargs) - self.sess.run(self.K.tf.global_variables_initializer()) - def build_model(self, *args, **kwargs): - # TODO prolly wanna unify self.tf - self.actor = Actor(*args, **kwargs) - self.critic = Critic(*args, **kwargs) + def build_model(self): + pass def compile_model(self): pass + def recompile_model(self): + pass + def select_action(self, state): # TODO externalize to policy i = self.epi action = self.actor.predict( np.expand_dims(state, axis=0)) + (1. / (1. + i)) return action[0] + # return self.policy.select_action(state) def update(self, sys_vars): + # TODO shd be in policy self.epi = sys_vars['epi'] # Update target networks self.actor.update() self.critic.update() - - def to_train(self, sys_vars): - return self.memory.size() > self.batch_size + self.policy.update(sys_vars) + self.update_n_epoch(sys_vars) def train_an_epoch(self): minibatch = self.memory.rand_minibatch(self.batch_size) @@ -274,12 +272,3 @@ def train_an_epoch(self): loss = critic_loss return loss - - def train(self, sys_vars): - loss_total = 0 - for _epoch in range(self.n_epoch): - loss = self.train_an_epoch() - loss_total += loss - avg_loss = loss_total / self.n_epoch - sys_vars['loss'].append(avg_loss) - return avg_loss diff --git a/rl/agent/dqn.py b/rl/agent/dqn.py index 7905270..26e3ae4 100644 --- a/rl/agent/dqn.py +++ b/rl/agent/dqn.py @@ -193,7 +193,7 @@ def train_an_epoch(self): loss = self.model.train_on_batch(minibatch['states'], Q_targets) - errors = abs(np.sum(Q_states - Q_targets, axis=1)) + errors = abs(np.sum(Q_states - Q_targets, axis=1)) self.memory.update(errors) return loss From 0be8381deaef83cd5cac71e50b3d45fb271e8dd0 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 01:18:33 -0400 Subject: [PATCH 30/43] add compatible spec --- rl/spec/classic_experiment_specs.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 3c7f347..f9a9995 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -853,6 +853,9 @@ "Policy": "DDPGBoundedPolicy", "PreProcessor": "NoPreProcessor", "param": { + "batch_size": 64, + "n_epoch": 1, + "tau": 0.001, "lr": 0.001, "gamma": 0.999, "hidden_layers": [400, 300], From 
7e4c28e62bdd6564315b581bad98d8d3df81ade8 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 01:26:03 -0400 Subject: [PATCH 31/43] externalize select action to policy --- rl/agent/ddpg_2.py | 12 +----------- rl/policy/noise.py | 24 ++++++++++++++++++++++++ rl/spec/classic_experiment_specs.json | 2 +- rl/spec/component_locks.json | 3 ++- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index 8ba8752..485de93 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -216,9 +216,6 @@ def __init__(self, *args, **kwargs): self.sess.run(self.K.tf.global_variables_initializer()) super(DDPG2, self).__init__(*args, **kwargs) - # TODO remove - self.epi = 0 - def build_model(self): pass @@ -229,16 +226,9 @@ def recompile_model(self): pass def select_action(self, state): - # TODO externalize to policy - i = self.epi - action = self.actor.predict( - np.expand_dims(state, axis=0)) + (1. / (1. + i)) - return action[0] - # return self.policy.select_action(state) + return self.policy.select_action(state) def update(self, sys_vars): - # TODO shd be in policy - self.epi = sys_vars['epi'] # Update target networks self.actor.update() self.critic.update() diff --git a/rl/policy/noise.py b/rl/policy/noise.py index 7f82664..503441f 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -3,6 +3,30 @@ from rl.policy.base_policy import Policy +class DDPGLinearNoisePolicy(Policy): + + ''' + policy with linearly decaying noise (1. / (1. + self.epi)) + TODO absorb under noise too + ''' + + def __init__(self, env_spec, + **kwargs): # absorb generic param without breaking + super(DDPGLinearNoisePolicy, self).__init__(env_spec) + self.epi = 0 # init + log_self(self) + + def select_action(self, state): + # TODO externalize to policy + # TODO also try to externalize bounded + action = self.agent.actor.predict( + np.expand_dims(state, axis=0)) + (1. / (1. + self.epi)) + return action[0] + + def update(self, sys_vars): + self.epi = sys_vars['epi'] + 1 + + class DDPGBoundedPolicy(Policy): ''' diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index f9a9995..c0f228a 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -850,7 +850,7 @@ "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "DDPGBoundedPolicy", + "Policy": "DDPGLinearNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 761b5df..aa865a7 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -25,7 +25,8 @@ "AnnealedGaussian", "GaussianWhiteNoise", "OUNoise", - "DDPGBoundedPolicy" + "DDPGBoundedPolicy", + "DDPGLinearNoisePolicy" ] }, "actor_critic": { From 402fb716ff2619cde911e9ad2e0d56856c78f5e7 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:15:17 -0400 Subject: [PATCH 32/43] refactor noise policies for ddpg --- rl/policy/noise.py | 147 +++++++++++++------------- rl/spec/classic_experiment_specs.json | 5 +- rl/spec/component_locks.json | 6 +- 3 files changed, 82 insertions(+), 76 deletions(-) diff --git a/rl/policy/noise.py b/rl/policy/noise.py index 503441f..b671e17 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -3,86 +3,111 @@ from rl.policy.base_policy import Policy -class DDPGLinearNoisePolicy(Policy): +class NoisePolicy(Policy): ''' - policy with linearly decaying noise (1. / (1. 
+ self.epi)) - TODO absorb under noise too + The base class for noise policy for DDPG ''' def __init__(self, env_spec, **kwargs): # absorb generic param without breaking - super(DDPGLinearNoisePolicy, self).__init__(env_spec) - self.epi = 0 # init + super(NoisePolicy, self).__init__(env_spec) log_self(self) + def sample(self): + '''implement noise here''' + return 0 + def select_action(self, state): - # TODO externalize to policy - # TODO also try to externalize bounded - action = self.agent.actor.predict( - np.expand_dims(state, axis=0)) + (1. / (1. + self.epi)) - return action[0] + agent = self.agent + state = np.expand_dims(state, axis=0) + if self.env_spec['actions'] == 'continuous': + action = agent.actor.predict(state)[0] + self.sample() + else: + Q_state = agent.actor.predict(state)[0] + assert Q_state.ndim == 1 + action = np.argmax(Q_state) + return action def update(self, sys_vars): - self.epi = sys_vars['epi'] + 1 + pass -class DDPGBoundedPolicy(Policy): +class LinearNoisePolicy(NoisePolicy): ''' - The bounded policy for actor critic agents - and continous, bounded policy spaces - Action bounded above and below by - - action_bound, + action_bound + policy with linearly decaying noise (1. / (1. + self.epi)) ''' - def __init__(self, env_spec, + def __init__(self, env_spec, exploration_anneal_episodes=20, **kwargs): # absorb generic param without breaking - super(DDPGBoundedPolicy, self).__init__(env_spec) - self.action_bound = env_spec['action_bound_high'] - assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] + super(LinearNoisePolicy, self).__init__(env_spec) + self.exploration_anneal_episodes = exploration_anneal_episodes + self.n_step = 0 # init log_self(self) - def select_action(self, state): - agent = self.agent - state = np.expand_dims(state, axis=0) - A_score = agent.actor.predict(state)[0] # extract from batch predict - # action = np.tanh(A_score) * self.action_bound - action = A_score * self.action_bound - return action + def sample(self): + noise = (1. / (1. + self.n_step)) + return noise def update(self, sys_vars): - pass + epi = sys_vars['epi'] + if epi >= self.exploration_anneal_episodes: + self.n_step = np.inf # noise divide to zero + else: + self.n_step = sys_vars['epi'] + +# class DDPGBoundedPolicy(NoisePolicy): -class AnnealedGaussian(Policy): +# ''' +# The bounded policy for actor critic agents +# and continous, bounded policy spaces +# Action bounded above and below by +# - action_bound, + action_bound +# ''' + +# def __init__(self, env_spec, +# **kwargs): # absorb generic param without breaking +# super(DDPGBoundedPolicy, self).__init__(env_spec) +# self.action_bound = env_spec['action_bound_high'] +# assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] +# log_self(self) + +# def sample(self): +# return 0 + +# def select_action(self, state): +# agent = self.agent +# state = np.expand_dims(state, axis=0) +# A_score = agent.actor.predict(state)[0] # extract from batch predict +# # action = np.tanh(A_score) * self.action_bound +# action = A_score * self.action_bound +# return action + +# def update(self, sys_vars): +# pass + + +class AnnealedGaussian(LinearNoisePolicy): ''' - Noise policy, mainly for DDPG. 
- Original inspiration from + Base class of random noise policy for DDPG + Adopted from https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py ''' - def __init__(self, env_spec, + def __init__(self, env_spec, exploration_anneal_episodes, mu, sigma, sigma_min, **kwargs): # absorb generic param without breaking - super(AnnealedGaussian, self).__init__(env_spec) - # epsilon-greedy * noise - self.init_e = 1.0 - self.final_e = 0.0 - self.e = self.init_e - self.exploration_anneal_episodes = 100 - + super(AnnealedGaussian, self).__init__( + env_spec, exploration_anneal_episodes) self.size = env_spec['action_dim'] - self.action_bound = env_spec['action_bound_high'] - assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] - self.n_steps_annealing = env_spec['timestep_limit'] / 2 self.mu = mu self.sigma = sigma - self.n_steps = 0 if sigma_min is not None: - self.m = -float(sigma - sigma_min) / float(self.n_steps_annealing) + self.m = -(sigma - sigma_min) / self.exploration_anneal_episodes self.c = sigma self.sigma_min = sigma_min else: @@ -92,41 +117,21 @@ def __init__(self, env_spec, @property def current_sigma(self): - sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c) + sigma = max(self.sigma_min, self.m * self.n_step + self.c) return sigma - def select_action(self, state): - agent = self.agent - state = np.expand_dims(state, axis=0) - if self.env_spec['actions'] == 'continuous': - action = agent.actor.predict( - state)[0] * self.action_bound + self.sample() * self.e - else: - Q_state = agent.actor.predict(state)[0] - assert Q_state.ndim == 1 - action = np.argmax(Q_state) - logger.info(str(Q_state)+' '+str(action)) - return action - - def update(self, sys_vars): - epi = sys_vars['epi'] - rise = self.final_e - self.init_e - slope = rise / float(self.exploration_anneal_episodes) - self.e = max(slope * epi + self.init_e, self.final_e) - return self.e - class GaussianWhiteNoise(AnnealedGaussian): - def __init__(self, env_spec, + def __init__(self, env_spec, exploration_anneal_episodes=20, mu=0., sigma=.3, sigma_min=None, **kwargs): # absorb generic param without breaking super(GaussianWhiteNoise, self).__init__( - env_spec, mu, sigma, sigma_min) + env_spec, exploration_anneal_episodes, + mu, sigma, sigma_min) def sample(self): sample = np.random.normal(self.mu, self.current_sigma, self.size) - self.n_steps += 1 return sample @@ -137,11 +142,12 @@ class OUNoise(AnnealedGaussian): http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab ''' - def __init__(self, env_spec, + def __init__(self, env_spec, exploration_anneal_episodes=20, theta=.15, mu=0., sigma=.3, dt=1e-2, x0=None, sigma_min=None, **kwargs): # absorb generic param without breaking super(OUNoise, self).__init__( - env_spec, mu, sigma, sigma_min, + env_spec, exploration_anneal_episodes, + mu, sigma, sigma_min, **kwargs) self.theta = theta self.mu = mu @@ -157,5 +163,4 @@ def sample(self): (self.mu - self.x_prev) * self.dt + self.current_sigma * \ np.sqrt(self.dt) * np.random.normal(size=self.size) self.x_prev = x - self.n_steps += 1 return x diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index c0f228a..56cde50 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -850,13 +850,14 @@ "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "DDPGLinearNoisePolicy", + "Policy": "LinearNoisePolicy", "PreProcessor": 
"NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, "tau": 0.001, - "lr": 0.001, + "lr": 0.0001, + "critic_lr": 0.001, "gamma": 0.999, "hidden_layers": [400, 300], "hidden_layers_activation": "relu", diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index aa865a7..6dc0e2f 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -23,10 +23,10 @@ ], "Policy": [ "AnnealedGaussian", - "GaussianWhiteNoise", - "OUNoise", "DDPGBoundedPolicy", - "DDPGLinearNoisePolicy" + "GaussianWhiteNoise", + "LinearNoisePolicy", + "OUNoise" ] }, "actor_critic": { From 22c331f3a7e4eab7cfc29c8a54bb153483e739cd Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:15:38 -0400 Subject: [PATCH 33/43] separate critic_lr for Critic --- rl/agent/ddpg_2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index 485de93..be80418 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -90,7 +90,7 @@ class Critic(DQN): the action is from Actor ''' - def __init__(self, *args, tau=0.001, **kwargs): + def __init__(self, *args, tau=0.001, critic_lr=0.001, **kwargs): from keras.layers import Dense, Merge from keras import backend as K self.Merge = Merge @@ -98,6 +98,7 @@ def __init__(self, *args, tau=0.001, **kwargs): self.tf = self.K.tf self.sess = self.K.get_session() self.tau = tau + self.critic_lr = critic_lr # suggestion: 10 x actor_lr super(Critic, self).__init__(*args, **kwargs) def build_critic_models(self): @@ -164,7 +165,7 @@ def build_model(self): self.y = self.tf.placeholder(self.tf.float32, [None, 1]) self.loss = self.tf.losses.mean_squared_error(self.y, self.out) self.optimize = self.tf.train.AdamOptimizer( - self.lr).minimize(self.loss) + self.critic_lr).minimize(self.loss) self.action_gradient = self.tf.gradients(self.out, self.critic_actions) return self.model From 9ab35e9cc715b87781bae85d5e229e5e26f65f08 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:24:06 -0400 Subject: [PATCH 34/43] rename base to NoNoisePolicy as proper --- rl/agent/ddpg_2.py | 2 +- rl/policy/noise.py | 11 ++++++----- rl/spec/component_locks.json | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg_2.py index be80418..0462ca7 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg_2.py @@ -130,7 +130,7 @@ def build_critic_models(self): model.add(self.Dense(1, init='lecun_uniform', - activation='linear')) + activation='linear')) # fixed logger.info('Critic model summary') model.summary() self.model = model diff --git a/rl/policy/noise.py b/rl/policy/noise.py index b671e17..75459b8 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -3,19 +3,20 @@ from rl.policy.base_policy import Policy -class NoisePolicy(Policy): +class NoNoisePolicy(Policy): ''' The base class for noise policy for DDPG + default is no noise ''' def __init__(self, env_spec, **kwargs): # absorb generic param without breaking - super(NoisePolicy, self).__init__(env_spec) + super(NoNoisePolicy, self).__init__(env_spec) log_self(self) def sample(self): - '''implement noise here''' + '''implement noise here, default is none''' return 0 def select_action(self, state): @@ -33,7 +34,7 @@ def update(self, sys_vars): pass -class LinearNoisePolicy(NoisePolicy): +class LinearNoisePolicy(NoNoisePolicy): ''' policy with linearly decaying noise (1. / (1. 
+ self.epi)) @@ -58,7 +59,7 @@ def update(self, sys_vars): self.n_step = sys_vars['epi'] -# class DDPGBoundedPolicy(NoisePolicy): +# class DDPGBoundedPolicy(NoNoisePolicy): # ''' # The bounded policy for actor critic agents diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 6dc0e2f..9a271f5 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -22,10 +22,10 @@ "PermamiDDPG" ], "Policy": [ - "AnnealedGaussian", "DDPGBoundedPolicy", "GaussianWhiteNoise", "LinearNoisePolicy", + "NoNoisePolicy", "OUNoise" ] }, From fb6059d82ad85528775f66a6eaea51a80bee5103 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:28:40 -0400 Subject: [PATCH 35/43] remove DDPGBoundedPolicy, already built in to DDPG --- rl/policy/noise.py | 31 --------------------------- rl/spec/classic_experiment_specs.json | 5 +++-- rl/spec/component_locks.json | 1 - 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/rl/policy/noise.py b/rl/policy/noise.py index 75459b8..d8e4fd0 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -59,37 +59,6 @@ def update(self, sys_vars): self.n_step = sys_vars['epi'] -# class DDPGBoundedPolicy(NoNoisePolicy): - -# ''' -# The bounded policy for actor critic agents -# and continous, bounded policy spaces -# Action bounded above and below by -# - action_bound, + action_bound -# ''' - -# def __init__(self, env_spec, -# **kwargs): # absorb generic param without breaking -# super(DDPGBoundedPolicy, self).__init__(env_spec) -# self.action_bound = env_spec['action_bound_high'] -# assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] -# log_self(self) - -# def sample(self): -# return 0 - -# def select_action(self, state): -# agent = self.agent -# state = np.expand_dims(state, axis=0) -# A_score = agent.actor.predict(state)[0] # extract from batch predict -# # action = np.tanh(A_score) * self.action_bound -# action = A_score * self.action_bound -# return action - -# def update(self, sys_vars): -# pass - - class AnnealedGaussian(LinearNoisePolicy): ''' diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 56cde50..d8c223d 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -801,7 +801,7 @@ "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "DDPGBoundedPolicy", + "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, @@ -826,7 +826,7 @@ "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "DDPGBoundedPolicy", + "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { "lr": 0.01, @@ -858,6 +858,7 @@ "tau": 0.001, "lr": 0.0001, "critic_lr": 0.001, + "exploration_anneal_episodes": 50, "gamma": 0.999, "hidden_layers": [400, 300], "hidden_layers_activation": "relu", diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 9a271f5..9330457 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -22,7 +22,6 @@ "PermamiDDPG" ], "Policy": [ - "DDPGBoundedPolicy", "GaussianWhiteNoise", "LinearNoisePolicy", "NoNoisePolicy", From a1524d10c1821262606525b30e19bac0d8f07486 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:37:21 -0400 Subject: [PATCH 36/43] remove useless ddpg examples --- rl/agent/ddpg.py | 187 ---------- rl/agent/permami_ddpg.py | 496 -------------------------- 
rl/spec/classic_experiment_specs.json | 55 +-- rl/spec/component_locks.json | 4 +- rl/spec/problems.json | 2 +- 5 files changed, 6 insertions(+), 738 deletions(-) delete mode 100644 rl/agent/ddpg.py delete mode 100644 rl/agent/permami_ddpg.py diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py deleted file mode 100644 index 61f046e..0000000 --- a/rl/agent/ddpg.py +++ /dev/null @@ -1,187 +0,0 @@ -import numpy as np -from rl.agent.dqn import DQN -from rl.util import logger, clone_model, clone_optimizer - - -class DDPG(DQN): - - ''' - The DDPG agent (algo), from https://arxiv.org/abs/1509.02971 - reference: https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html - https://github.com/matthiasplappert/keras-rl - ''' - - def __init__(self, *args, **kwargs): - # import only when needed to contain side-effects - from keras.layers import Dense, Merge - from keras.models import Sequential - from keras import backend as K - self.Dense = Dense - self.Merge = Merge - self.Sequential = Sequential - self.K = K - - self.TAU = 0.001 # for target network updates - super(DDPG, self).__init__(*args, **kwargs) - - def compile(self, memory, optimizer, policy, preprocessor): - # override to make 4 optimizers - self.optimizer = optimizer - # clone for actor, critic networks - self.optimizer.actor_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - self.optimizer.target_actor_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - self.optimizer.critic_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - self.optimizer.target_critic_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - del self.optimizer.keras_optimizer - - super(DDPG, self).compile(memory, self.optimizer, policy, preprocessor) - - def build_actor_models(self): - model = self.Sequential() - self.build_hidden_layers(model) - model.add(self.Dense(self.env_spec['action_dim'], - init='lecun_uniform', - activation=self.output_layer_activation)) - logger.info('Actor model summary') - model.summary() - self.actor = model - self.target_actor = clone_model(self.actor) - - def build_critic_models(self): - state_branch = self.Sequential() - state_branch.add(self.Dense( - self.hidden_layers[0], - input_shape=(self.env_spec['state_dim'],), - activation=self.hidden_layers_activation, - init='lecun_uniform')) - - action_branch = self.Sequential() - action_branch.add(self.Dense( - self.hidden_layers[0], - input_shape=(self.env_spec['action_dim'],), - activation=self.hidden_layers_activation, - init='lecun_uniform')) - - input_layer = self.Merge([state_branch, action_branch], mode='concat') - - model = self.Sequential() - model.add(input_layer) - - if (len(self.hidden_layers) > 1): - for i in range(1, len(self.hidden_layers)): - model.add(self.Dense( - self.hidden_layers[i], - init='lecun_uniform', - activation=self.hidden_layers_activation)) - - model.add(self.Dense(1, - init='lecun_uniform', - activation=self.output_layer_activation)) - logger.info('Critic model summary') - model.summary() - self.critic = model - self.target_critic = clone_model(self.critic) - - def build_model(self): - self.build_actor_models() - self.build_critic_models() - - def custom_critic_loss(self, y_true, y_pred): - return self.K.mean(self.K.square(y_true - y_pred)) - - def compile_model(self): - self.actor_state = self.actor.inputs[0] - self.action_gradient = self.K.placeholder( - shape=(None, self.env_spec['action_dim'])) - self.actor_grads = self.K.tf.gradients( - self.actor.output, self.actor.trainable_weights, - 
-self.action_gradient) - self.actor_optimize = self.K.tf.train.AdamOptimizer( - self.lr).apply_gradients( - zip(self.actor_grads, self.actor.trainable_weights)) - - self.critic_state = self.critic.inputs[0] - self.critic_action = self.critic.inputs[1] - self.critic_action_grads = self.K.tf.gradients( - self.critic.output, self.critic_action) - - # self.actor.compile( - # loss='mse', - # optimizer=self.optimizer.actor_keras_optimizer) - self.target_actor.compile( - loss='mse', - optimizer=self.optimizer.target_actor_keras_optimizer) - logger.info("Actor Models compiled") - - self.critic.compile( - loss=self.custom_critic_loss, - optimizer=self.optimizer.critic_keras_optimizer) - self.target_critic.compile( - loss='mse', - optimizer=self.optimizer.target_critic_keras_optimizer) - logger.info("Critic Models compiled") - - def update(self, sys_vars): - '''Agent update apart from training the Q function''' - self.policy.update(sys_vars) - self.update_n_epoch(sys_vars) - - def train_critic(self, minibatch): - '''update critic network using K-mean loss''' - mu_prime = self.target_actor.predict(minibatch['next_states']) - Q_prime = self.target_critic.predict( - [minibatch['next_states'], mu_prime]) - y = minibatch['rewards'] + self.gamma * \ - (1 - minibatch['terminals']) * Q_prime - critic_loss = self.critic.train_on_batch( - [minibatch['states'], minibatch['actions']], y) - errors = abs(np.sum(Q_prime - y, axis=1)) - self.memory.update(errors) - return critic_loss - - def train_actor(self, minibatch): - '''update actor network using sampled gradient''' - actions = self.actor.predict(minibatch['states']) - # critic_grads = critic.gradients(minibatch['states'], actions) - critic_grads = self.K.get_session().run( - self.critic_action_grads, feed_dict={ - self.critic_state: minibatch['states'], - self.critic_action: actions - })[0] - - # actor.train(minibatch['states'], critic_grads) - self.K.get_session().run(self.actor_optimize, feed_dict={ - self.actor_state: minibatch['states'], - self.action_gradient: critic_grads - }) - actor_loss = 0 - return actor_loss - - def train_target_networks(self): - '''update both target networks''' - actor_weights = self.actor.get_weights() - target_actor_weights = self.target_actor.get_weights() - for i, _w in enumerate(actor_weights): - target_actor_weights[i] = self.TAU * actor_weights[i] + ( - 1 - self.TAU) * target_actor_weights[i] - self.target_actor.set_weights(target_actor_weights) - - critic_weights = self.critic.get_weights() - target_critic_weights = self.target_critic.get_weights() - for i, _w in enumerate(critic_weights): - target_critic_weights[i] = self.TAU * critic_weights[i] + ( - 1 - self.TAU) * target_critic_weights[i] - self.target_critic.set_weights(target_critic_weights) - - def train_an_epoch(self): - minibatch = self.memory.rand_minibatch(self.batch_size) - critic_loss = self.train_critic(minibatch) - actor_loss = self.train_actor(minibatch) - self.train_target_networks() - - loss = critic_loss + actor_loss - return loss diff --git a/rl/agent/permami_ddpg.py b/rl/agent/permami_ddpg.py deleted file mode 100644 index 1307263..0000000 --- a/rl/agent/permami_ddpg.py +++ /dev/null @@ -1,496 +0,0 @@ -""" -DDPG implementation from https://github.com/pemami4911/deep-rl/blob/master/ddpg/ddpg.py -Implementation of DDPG - Deep Deterministic Policy Gradient -Algorithm and hyperparameter details can be found here: - http://arxiv.org/pdf/1509.02971v2.pdf -The algorithm is tested on the Pendulum-v0 OpenAI gym task -and developed with tflearn + Tensorflow 
-Author: Patrick Emami -""" -from rl.agent.base_agent import Agent -import tensorflow as tf -import numpy as np -# import gym -# from gym import wrappers -import tflearn - -# from replay_buffer import ReplayBuffer - -# ========================== -# Training Parameters -# ========================== -# Max training steps -MAX_EPISODES = 50000 -# Max episode length -MAX_EP_STEPS = 1000 -# Base learning rate for the Actor network -ACTOR_LEARNING_RATE = 0.0001 -# Base learning rate for the Critic Network -CRITIC_LEARNING_RATE = 0.001 -# Discount factor -GAMMA = 0.99 -# Soft target update param -TAU = 0.001 - -# # =========================== -# # Utility Parameters -# # =========================== -# # Render gym env during training -# RENDER_ENV = True -# # Use Gym Monitor -# GYM_MONITOR_EN = True -# # Gym environment -# ENV_NAME = 'Pendulum-v0' -# # Directory for storing gym results -# MONITOR_DIR = './results/gym_ddpg' -# # Directory for storing tensorboard summary results -# SUMMARY_DIR = './results/tf_ddpg' -# RANDOM_SEED = 1234 -# Size of replay buffer -BUFFER_SIZE = 10000 -MINIBATCH_SIZE = 64 - -# =========================== -# Actor and Critic DNNs -# =========================== - - -class ActorNetwork(object): - """ - Input to the network is the state, output is the action - under a deterministic policy. - - The output layer activation is a tanh to keep the action - between -2 and 2 - """ - - def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau): - self.sess = sess - self.s_dim = state_dim - self.a_dim = action_dim - self.action_bound = action_bound - self.learning_rate = learning_rate - self.tau = tau - - # Actor Network - self.inputs, self.out, self.scaled_out = self.create_actor_network() - - self.network_params = tf.trainable_variables() - - # Target Network - self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network() - - self.target_network_params = tf.trainable_variables()[ - len(self.network_params):] - - # Op for periodically updating target network with online network - # weights - self.update_target_network_params = \ - [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + - tf.multiply(self.target_network_params[i], 1. 
- self.tau)) - for i in range(len(self.target_network_params))] - - # This gradient will be provided by the critic network - self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim]) - - # Combine the gradients here - self.actor_gradients = tf.gradients( - self.scaled_out, self.network_params, -self.action_gradient) - - # Optimization Op - self.optimize = tf.train.AdamOptimizer(self.learning_rate).\ - apply_gradients(zip(self.actor_gradients, self.network_params)) - - self.num_trainable_vars = len( - self.network_params) + len(self.target_network_params) - - def create_actor_network(self): - inputs = tflearn.input_data(shape=[None, self.s_dim]) - net = tflearn.fully_connected(inputs, 400, activation='relu') - net = tflearn.fully_connected(net, 300, activation='relu') - # Final layer weights are init to Uniform[-3e-3, 3e-3] - w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003) - out = tflearn.fully_connected( - net, self.a_dim, activation='tanh', weights_init=w_init) - # Scale output to -action_bound to action_bound - scaled_out = tf.multiply(out, self.action_bound) - return inputs, out, scaled_out - - def train(self, inputs, a_gradient): - self.sess.run(self.optimize, feed_dict={ - self.inputs: inputs, - self.action_gradient: a_gradient - }) - - def predict(self, inputs): - return self.sess.run(self.scaled_out, feed_dict={ - self.inputs: inputs - }) - - def predict_target(self, inputs): - return self.sess.run(self.target_scaled_out, feed_dict={ - self.target_inputs: inputs - }) - - def update_target_network(self): - self.sess.run(self.update_target_network_params) - - def get_num_trainable_vars(self): - return self.num_trainable_vars - - -class CriticNetwork(object): - """ - Input to the network is the state and action, output is Q(s,a). - The action must be obtained from the output of the Actor network. - - """ - - def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars): - self.sess = sess - self.s_dim = state_dim - self.a_dim = action_dim - self.learning_rate = learning_rate - self.tau = tau - - # Create the critic network - self.inputs, self.action, self.out = self.create_critic_network() - - self.network_params = tf.trainable_variables()[num_actor_vars:] - - # Target Network - self.target_inputs, self.target_action, self.target_out = self.create_critic_network() - - self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):] - - # Op for periodically updating target network with online network - # weights with regularization - self.update_target_network_params = \ - [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau)) - for i in range(len(self.target_network_params))] - - # Network target (y_i) - self.predicted_q_value = tf.placeholder(tf.float32, [None, 1]) - - # Define loss and optimization Op - self.loss = tflearn.mean_square(self.predicted_q_value, self.out) - self.optimize = tf.train.AdamOptimizer( - self.learning_rate).minimize(self.loss) - - # Get the gradient of the net w.r.t. the action. - # For each action in the minibatch (i.e., for each x in xs), - # this will sum up the gradients of each critic output in the minibatch - # w.r.t. that action. Each output is independent of all - # actions except for one. 
- self.action_grads = tf.gradients(self.out, self.action) - - def create_critic_network(self): - inputs = tflearn.input_data(shape=[None, self.s_dim]) - action = tflearn.input_data(shape=[None, self.a_dim]) - net = tflearn.fully_connected(inputs, 400, activation='relu') - - # Add the action tensor in the 2nd hidden layer - # Use two temp layers to get the corresponding weights and biases - t1 = tflearn.fully_connected(net, 300) - t2 = tflearn.fully_connected(action, 300) - - net = tflearn.activation( - tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu') - - # linear layer connected to 1 output representing Q(s,a) - # Weights are init to Uniform[-3e-3, 3e-3] - w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003) - out = tflearn.fully_connected(net, 1, weights_init=w_init) - return inputs, action, out - - def train(self, inputs, action, predicted_q_value): - return self.sess.run([self.out, self.optimize], feed_dict={ - self.inputs: inputs, - self.action: action, - self.predicted_q_value: predicted_q_value - }) - - def predict(self, inputs, action): - return self.sess.run(self.out, feed_dict={ - self.inputs: inputs, - self.action: action - }) - - def predict_target(self, inputs, action): - return self.sess.run(self.target_out, feed_dict={ - self.target_inputs: inputs, - self.target_action: action - }) - - def action_gradients(self, inputs, actions): - return self.sess.run(self.action_grads, feed_dict={ - self.inputs: inputs, - self.action: actions - }) - - def update_target_network(self): - self.sess.run(self.update_target_network_params) - - - -class PermamiDDPG(Agent): - - ''' - The PermamiDDPG agent (algo), from https://arxiv.org/abs/1509.02971 - reference: https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html - https://github.com/matthiasplappert/keras-rl - ''' - - def __init__(self, *args, **kwargs): - # import only when needed to contain side-effects - # from keras.layers import Dense, Merge - # from keras.models import Sequential - # from keras import backend as K - # self.Dense = Dense - # self.Merge = Merge - # self.Sequential = Sequential - self.sess = tf.Session() - self.epi = 0 - self.n_epoch = 1 - self.batch_size = 64 - self.gamma = 0.99 - - # self.TAU = 0.001 # for target network updates - super(PermamiDDPG, self).__init__(*args, **kwargs) - self.build_model() - self.sess.run(tf.global_variables_initializer()) - - - def build_model(self): - state_dim = self.env_spec['state_dim'] - action_dim = self.env_spec['action_dim'] - action_bound = self.env_spec['action_bound_high'] - self.s_dim = state_dim - self.a_dim = action_dim - self.actor = ActorNetwork(self.sess, state_dim, action_dim, action_bound, - ACTOR_LEARNING_RATE, TAU) - self.critic = CriticNetwork(self.sess, state_dim, action_dim, - CRITIC_LEARNING_RATE, TAU, self.actor.get_num_trainable_vars()) - - def compile_model(self): - pass - - def select_action(self, state): - i = self.epi - action = self.actor.predict(np.reshape(state, (-1, self.s_dim))) + (1. / (1. 
+ i)) - # print('action shape') - # print('action shape') - # print('action shape') - # print(action) - # print(action.shape) - return action[0] - - def update(self, sys_vars): - self.epi = sys_vars['epi'] - # Update target networks - self.actor.update_target_network() - self.critic.update_target_network() - return - - def to_train(self, sys_vars): - return self.memory.size() > MINIBATCH_SIZE - # return True - - def train_an_epoch(self): - minibatch = self.memory.rand_minibatch(self.batch_size) - s_batch = np.reshape(minibatch['states'], (-1, self.s_dim)) - a_batch = np.reshape(minibatch['actions'], (-1, self.a_dim)) - s2_batch = np.reshape(minibatch['next_states'], (-1, self.s_dim)) - - target_q = self.critic.predict_target( - s2_batch, - self.actor.predict_target(s2_batch)) - - y_i = minibatch['rewards'] + self.gamma * \ - (1 - minibatch['terminals']) * np.reshape(target_q, (-1)) - y_i = np.reshape(y_i, (-1, 1)) - - predicted_q_value, _ = self.critic.train( - s_batch, a_batch, y_i) - # minibatch['states'], - # minibatch['actions'], - # y_i) - # # np.reshape(y_i, (self.batch_size, 1))) - - # ep_ave_max_q = np.amax(predicted_q_value) - # print('epi: ' + str(self.epi) + ' Q_max: '+str(ep_ave_max_q)) - - - # Update the actor policy using the sampled gradient - a_outs = self.actor.predict(s_batch) - grads = self.critic.action_gradients(s_batch, a_outs) - self.actor.train(s_batch, grads[0]) - # return actor_loss - return - - # (Q_states, _states, Q_next_states_max) = self.compute_Q_states( - # minibatch) - # Q_targets = self.compute_Q_targets( - # minibatch, Q_states, Q_next_states_max) - - # loss = self.model.train_on_batch(minibatch['states'], Q_targets) - - # errors = abs(np.sum(Q_states - Q_targets, axis=1)) - # self.memory.update(errors) - # return loss - - def train(self, sys_vars): - ''' - Training is for the Q function (NN) only - otherwise (e.g. policy) see self.update() - step 1,2,3,4 of algo. - ''' - loss_total = 0 - for _epoch in range(self.n_epoch): - loss = self.train_an_epoch() - # loss_total += loss - avg_loss = loss_total / self.n_epoch - sys_vars['loss'].append(avg_loss) - return avg_loss - -# =========================== -# Tensorflow Summary Ops -# =========================== - - -# def build_summaries(): -# episode_reward = tf.Variable(0.) -# tf.summary.scalar("Reward", episode_reward) -# episode_ave_max_q = tf.Variable(0.) -# tf.summary.scalar("Qmax Value", episode_ave_max_q) - -# summary_vars = [episode_reward, episode_ave_max_q] -# summary_ops = tf.summary.merge_all() - -# return summary_ops, summary_vars - -# # =========================== -# # Agent Training -# # =========================== - - -# def train(sess, env, actor, critic): - -# # Set up summary Ops -# summary_ops, summary_vars = build_summaries() - -# sess.run(tf.global_variables_initializer()) -# writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) - -# # Initialize target network weights -# actor.update_target_network() -# critic.update_target_network() - -# # Initialize replay memory -# replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) - -# for i in xrange(MAX_EPISODES): - -# s = env.reset() - -# ep_reward = 0 -# ep_ave_max_q = 0 - -# for j in xrange(MAX_EP_STEPS): - -# if RENDER_ENV: -# env.render() - -# # # Added exploration noise -# # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. 
+ i)) - -# # s2, r, terminal, info = env.step(a[0]) - -# replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, -# terminal, np.reshape(s2, (actor.s_dim,))) - -# # Keep adding experience to the memory until -# # there are at least minibatch size samples -# if replay_buffer.size() > MINIBATCH_SIZE: -# # s_batch, a_batch, r_batch, t_batch, s2_batch = \ -# # replay_buffer.sample_batch(MINIBATCH_SIZE) - -# # # Calculate targets -# # target_q = critic.predict_target( -# # s2_batch, actor.predict_target(s2_batch)) - -# # y_i = [] -# # for k in xrange(MINIBATCH_SIZE): -# # if t_batch[k]: -# # y_i.append(r_batch[k]) -# # else: -# # y_i.append(r_batch[k] + GAMMA * target_q[k]) - -# # # Update the critic given the targets -# # predicted_q_value, _ = critic.train( -# # s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) - -# # ep_ave_max_q += np.amax(predicted_q_value) - -# # # Update the actor policy using the sampled gradient -# # a_outs = actor.predict(s_batch) -# # grads = critic.action_gradients(s_batch, a_outs) -# # actor.train(s_batch, grads[0]) - -# # # Update target networks -# # actor.update_target_network() -# # critic.update_target_network() - -# s = s2 -# ep_reward += r - -# # if terminal: - -# # summary_str = sess.run(summary_ops, feed_dict={ -# # summary_vars[0]: ep_reward, -# # summary_vars[1]: ep_ave_max_q / float(j) -# # }) - -# # writer.add_summary(summary_str, i) -# # writer.flush() - -# # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \ -# # '| Qmax: %.4f' % (ep_ave_max_q / float(j)) - -# # break - - -# def main(_): -# with tf.Session() as sess: - -# env = gym.make(ENV_NAME) -# np.random.seed(RANDOM_SEED) -# tf.set_random_seed(RANDOM_SEED) -# env.seed(RANDOM_SEED) - -# # state_dim = env.observation_space.shape[0] -# # action_dim = env.action_space.shape[0] -# # action_bound = env.action_space.high -# # # Ensure action bound is symmetric -# # assert (env.action_space.high == -env.action_space.low) - -# # actor = ActorNetwork(sess, state_dim, action_dim, action_bound, -# # ACTOR_LEARNING_RATE, TAU) - -# # critic = CriticNetwork(sess, state_dim, action_dim, -# # CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) - -# if GYM_MONITOR_EN: -# if not RENDER_ENV: -# env = wrappers.Monitor( -# env, MONITOR_DIR, video_callable=False, force=True) -# else: -# env = wrappers.Monitor(env, MONITOR_DIR, force=True) - -# train(sess, env, actor, critic) - -# if GYM_MONITOR_EN: -# env.monitor.close() - -# if __name__ == '__main__': -# tf.app.run() \ No newline at end of file diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index d8c223d..7878981 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -803,55 +803,6 @@ "Optimizer": "AdamOptimizer", "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", - "param": { - "batch_size": 64, - "lr": 0.001, - "gamma": 0.999, - "hidden_layers": [400, 300], - "hidden_layers_activation": "relu", - "output_layer_activation": "tanh" - }, - "param_range": { - "lr": [0.001, 0.01], - "gamma": [0.999], - "hidden_layers": [ - [400], - [400, 300] - ] - } - }, - "pendulum_permami_ddpg": { - "problem": "Pendulum-v0", - "Agent": "PermamiDDPG", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "LinearNoisePolicy", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.01, - "gamma": 0.999, - "hidden_layers": [200], - "hidden_layers_activation": "relu", - 
"output_layer_activation": "tanh" - }, - "param_range": { - "lr": [0.001, 0.01], - "gamma": [0.999], - "hidden_layers": [ - [400], - [400, 300] - ] - } - }, - "pendulum_ddpg2": { - "problem": "Pendulum-v0", - "Agent": "DDPG2", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "LinearNoisePolicy", - "PreProcessor": "NoPreProcessor", "param": { "batch_size": 64, "n_epoch": 1, @@ -865,8 +816,10 @@ "output_layer_activation": "tanh" }, "param_range": { - "lr": [0.001, 0.01], - "gamma": [0.999], + "tau": [0.001, 0.005, 0.01], + "lr": [0.0001, 0.001, 0.01], + "critic_lr": [0.001, 0.01], + "gamma": [0.97, 0.99, 0.999], "hidden_layers": [ [400], [400, 300] diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 9330457..7b963f7 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -17,9 +17,7 @@ "details": "ddpg uses white-noise policy", "head": "Agent", "Agent": [ - "DDPG", - "DDPG2", - "PermamiDDPG" + "DDPG" ], "Policy": [ "GaussianWhiteNoise", diff --git a/rl/spec/problems.json b/rl/spec/problems.json index 40afb3f..a0337ff 100644 --- a/rl/spec/problems.json +++ b/rl/spec/problems.json @@ -44,7 +44,7 @@ "Pendulum-v0": { "GYM_ENV_NAME": "Pendulum-v0", "SOLVED_MEAN_REWARD": null, - "MAX_EPISODES": 1000, + "MAX_EPISODES": 300, "REWARD_MEAN_LEN": 100 }, "LunarLander-v2": { From da54f53e0f1eda2f800ed9e504fb985ff5b22305 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:38:18 -0400 Subject: [PATCH 37/43] rename ddpg2 to ddpg --- rl/agent/{ddpg_2.py => ddpg.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename rl/agent/{ddpg_2.py => ddpg.py} (99%) diff --git a/rl/agent/ddpg_2.py b/rl/agent/ddpg.py similarity index 99% rename from rl/agent/ddpg_2.py rename to rl/agent/ddpg.py index 0462ca7..7e89f94 100644 --- a/rl/agent/ddpg_2.py +++ b/rl/agent/ddpg.py @@ -199,7 +199,7 @@ def train(self, states, actions, y): }) -class DDPG2(DQN): +class DDPG(DQN): ''' DDPG Algorithm, from https://arxiv.org/abs/1509.02971 @@ -215,7 +215,7 @@ def __init__(self, *args, **kwargs): self.actor = Actor(*args, **kwargs) self.critic = Critic(*args, **kwargs) self.sess.run(self.K.tf.global_variables_initializer()) - super(DDPG2, self).__init__(*args, **kwargs) + super(DDPG, self).__init__(*args, **kwargs) def build_model(self): pass From d30fe835750a6669672b78928558320941f21e50 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 08:57:25 -0400 Subject: [PATCH 38/43] stylefix --- rl/agent/ddpg.py | 21 +++++++++++---------- rl/policy/noise.py | 3 ++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py index 7e89f94..cb9efb8 100644 --- a/rl/agent/ddpg.py +++ b/rl/agent/ddpg.py @@ -1,6 +1,6 @@ import numpy as np from rl.agent.dqn import DQN -from rl.util import logger, clone_model, clone_optimizer +from rl.util import logger, clone_model class Actor(DQN): @@ -62,7 +62,7 @@ def compile_model(self): def recompile_model(self, sys_vars): pass - def update(self): + def update(self, sys_vars): self.sess.run(self.update_target_network_op) def predict(self, states): @@ -75,7 +75,7 @@ def target_predict(self, next_states): self.target_actor_states: next_states }) - def train(self, states, critic_action_gradient): + def train_tf(self, states, critic_action_gradient): return self.sess.run(self.optimize, feed_dict={ self.actor_states: states, self.action_gradient: critic_action_gradient @@ -93,6 +93,7 @@ class Critic(DQN): def __init__(self, *args, 
tau=0.001, critic_lr=0.001, **kwargs): from keras.layers import Dense, Merge from keras import backend as K + self.Dense = Dense self.Merge = Merge self.K = K self.tf = self.K.tf @@ -170,7 +171,7 @@ def build_model(self): self.action_gradient = self.tf.gradients(self.out, self.critic_actions) return self.model - def update(self): + def update(self, sys_vars): self.sess.run(self.update_target_network_op) def get_action_gradient(self, states, actions): @@ -191,7 +192,7 @@ def target_predict(self, next_states, mu_prime): self.target_critic_actions: mu_prime }) - def train(self, states, actions, y): + def train_tf(self, states, actions, y): return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ self.critic_states: states, self.critic_actions: actions, @@ -223,7 +224,7 @@ def build_model(self): def compile_model(self): pass - def recompile_model(self): + def recompile_model(self, sys_vars): pass def select_action(self, state): @@ -231,8 +232,8 @@ def select_action(self, state): def update(self, sys_vars): # Update target networks - self.actor.update() - self.critic.update() + self.actor.update(sys_vars) + self.critic.update(sys_vars) self.policy.update(sys_vars) self.update_n_epoch(sys_vars) @@ -249,7 +250,7 @@ def train_an_epoch(self): (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) y = np.reshape(y, (-1, 1)) - _, _, critic_loss = self.critic.train( + _, _, critic_loss = self.critic.train_tf( minibatch['states'], minibatch['actions'], y) # train actor @@ -258,7 +259,7 @@ def train_an_epoch(self): critic_action_gradient = self.critic.get_action_gradient( minibatch['states'], actions) # currently cant be gotten - _actor_loss = self.actor.train( + _actorloss = self.actor.train_tf( minibatch['states'], critic_action_gradient) loss = critic_loss diff --git a/rl/policy/noise.py b/rl/policy/noise.py index d8e4fd0..baa30c0 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -1,5 +1,5 @@ import numpy as np -from rl.util import logger, log_self +from rl.util import log_self from rl.policy.base_policy import Policy @@ -17,6 +17,7 @@ def __init__(self, env_spec, def sample(self): '''implement noise here, default is none''' + assert 'actions' in self.env_spec return 0 def select_action(self, state): From e469f72af0fae25c32253e5091e214e0bd86b32a Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 23:35:01 -0400 Subject: [PATCH 39/43] warn instead of break for component lock --- rl/util.py | 122 ++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/rl/util.py b/rl/util.py index 915f3aa..23e3e65 100644 --- a/rl/util.py +++ b/rl/util.py @@ -23,6 +23,65 @@ } +# parse_args to add flag +parser = argparse.ArgumentParser(description='Set flags for functions') +parser.add_argument("-b", "--blind", + help="dont render graphics", + action="store_const", + dest="render", + const=False, + default=True) +parser.add_argument("-d", "--debug", + help="activate debug log", + action="store_const", + dest="loglevel", + const=logging.DEBUG, + default=logging.INFO) +parser.add_argument("-e", "--experiment", + help="specify experiment to run", + action="store", + type=str, + nargs='?', + dest="experiment", + default="dev_dqn") +parser.add_argument("-p", "--param_selection", + help="run parameter selection if present", + action="store_true", + dest="param_selection", + default=False) +parser.add_argument("-q", "--quiet", + help="change log to warning level", + action="store_const", + dest="loglevel", + const=logging.WARNING, + 
default=logging.INFO) +parser.add_argument("-t", "--times", + help="number of times session is run", + action="store", + nargs='?', + type=int, + dest="times", + default=1) +parser.add_argument("-x", "--max_episodes", + help="manually set environment max episodes", + action="store", + nargs='?', + type=int, + dest="max_epis", + default=-1) +args = parser.parse_args([]) if environ.get('CI') else parser.parse_args() + +# Goddam python logger +logger = logging.getLogger(__name__) +handler = logging.StreamHandler(sys.stdout) +handler.setFormatter( + logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) +logger.setLevel(args.loglevel) +logger.addHandler(handler) +logger.propagate = False +environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # mute tf warnings on optimized setup + + def check_equal(iterator): '''check if list contains all the same elements''' iterator = iter(iterator) @@ -54,7 +113,7 @@ def check_lock(lock_name, lock, experiment_spec): # rest must all have the same signature rest_equal = check_equal(bin_rest_list) if not rest_equal: - raise ValueError( + logger.warn( 'All components need to be of the same set, ' 'check component lock "{}" and your spec "{}"'.format( lock_name, experiment_spec['experiment_name'])) @@ -63,7 +122,7 @@ def check_lock(lock_name, lock, experiment_spec): lock_sig = [bin_head, bin_rest] lock_valid = lock_sig in valid_lock_sig_list if not lock_valid: - raise ValueError( + logger.warn( 'Component lock violated: "{}", spec: "{}"'.format( lock_name, experiment_spec['experiment_name'])) return lock_valid @@ -127,65 +186,6 @@ def import_guard_asset(): PROBLEMS, EXPERIMENT_SPECS = import_guard_asset() -# parse_args to add flag -parser = argparse.ArgumentParser(description='Set flags for functions') -parser.add_argument("-b", "--blind", - help="dont render graphics", - action="store_const", - dest="render", - const=False, - default=True) -parser.add_argument("-d", "--debug", - help="activate debug log", - action="store_const", - dest="loglevel", - const=logging.DEBUG, - default=logging.INFO) -parser.add_argument("-e", "--experiment", - help="specify experiment to run", - action="store", - type=str, - nargs='?', - dest="experiment", - default="dev_dqn") -parser.add_argument("-p", "--param_selection", - help="run parameter selection if present", - action="store_true", - dest="param_selection", - default=False) -parser.add_argument("-q", "--quiet", - help="change log to warning level", - action="store_const", - dest="loglevel", - const=logging.WARNING, - default=logging.INFO) -parser.add_argument("-t", "--times", - help="number of times session is run", - action="store", - nargs='?', - type=int, - dest="times", - default=1) -parser.add_argument("-x", "--max_episodes", - help="manually set environment max episodes", - action="store", - nargs='?', - type=int, - dest="max_epis", - default=-1) -args = parser.parse_args([]) if environ.get('CI') else parser.parse_args() - -# Goddam python logger -logger = logging.getLogger(__name__) -handler = logging.StreamHandler(sys.stdout) -handler.setFormatter( - logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) -logger.setLevel(args.loglevel) -logger.addHandler(handler) -logger.propagate = False -environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # mute tf warnings on optimized setup - - def log_self(subject): max_info_len = 300 info = '{}, param: {}'.format( From 3b766a98083ea4e2b4298eef307c13f4295912c1 Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 23:37:51 -0400 Subject: [PATCH 40/43] mute double dqn recompile both 
models till performance is fixed --- rl/agent/double_dqn.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/rl/agent/double_dqn.py b/rl/agent/double_dqn.py index 92e104d..3fd859d 100644 --- a/rl/agent/double_dqn.py +++ b/rl/agent/double_dqn.py @@ -41,14 +41,15 @@ def switch_models(self): self.optimizer.keras_optimizer = self.optimizer.keras_optimizer_2 self.optimizer.keras_optimizer_2 = temp_optimizer - def recompile_model(self, sys_vars): - '''rotate and recompile both models''' - if self.epi_change_lr is not None: - self.switch_models() # to model_2 - super(DoubleDQN, self).recompile_model(sys_vars) - self.switch_models() # back to model - super(DoubleDQN, self).recompile_model(sys_vars) - return self.model + # def recompile_model(self, sys_vars): + # '''rotate and recompile both models''' + # # TODO fix this, double recompile breaks solving power + # if self.epi_change_lr is not None: + # self.switch_models() # to model_2 + # super(DoubleDQN, self).recompile_model(sys_vars) + # self.switch_models() # back to model + # super(DoubleDQN, self).recompile_model(sys_vars) + # return self.model def compute_Q_states(self, minibatch): (Q_states, Q_next_states_select, _max) = super( From c650b0476a6fa610a36a3ba4056c903e5e77334d Mon Sep 17 00:00:00 2001 From: kengz Date: Tue, 18 Apr 2017 23:47:11 -0400 Subject: [PATCH 41/43] fix graph rendering on single trial by mpl backend switching --- rl/analytics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rl/analytics.py b/rl/analytics.py index 63cbc80..4d24ad1 100644 --- a/rl/analytics.py +++ b/rl/analytics.py @@ -8,8 +8,10 @@ warnings.filterwarnings("ignore", module="matplotlib") -MPL_BACKEND = 'agg' if ( - environ.get('CI') or platform.system() == 'Darwin') else 'TkAgg' +if platform.system() == 'Darwin': + MPL_BACKEND = 'agg' if args.param_selection else 'macosx' +else: + MPL_BACKEND = 'TkAgg' STATS_COLS = [ 'best_session_epi', From be6c2a9472408a4043db968be931340681cc577d Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 19 Apr 2017 00:01:29 -0400 Subject: [PATCH 42/43] rename noise policies properly --- rl/policy/noise.py | 12 ++++++------ rl/spec/component_locks.json | 4 ++-- rl/spec/dev_experiment_specs.json | 25 ------------------------- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/rl/policy/noise.py b/rl/policy/noise.py index baa30c0..fec9507 100644 --- a/rl/policy/noise.py +++ b/rl/policy/noise.py @@ -60,7 +60,7 @@ def update(self, sys_vars): self.n_step = sys_vars['epi'] -class AnnealedGaussian(LinearNoisePolicy): +class AnnealedGaussianPolicy(LinearNoisePolicy): ''' Base class of random noise policy for DDPG @@ -71,7 +71,7 @@ class AnnealedGaussian(LinearNoisePolicy): def __init__(self, env_spec, exploration_anneal_episodes, mu, sigma, sigma_min, **kwargs): # absorb generic param without breaking - super(AnnealedGaussian, self).__init__( + super(AnnealedGaussianPolicy, self).__init__( env_spec, exploration_anneal_episodes) self.size = env_spec['action_dim'] self.mu = mu @@ -92,12 +92,12 @@ def current_sigma(self): return sigma -class GaussianWhiteNoise(AnnealedGaussian): +class GaussianWhiteNoisePolicy(AnnealedGaussianPolicy): def __init__(self, env_spec, exploration_anneal_episodes=20, mu=0., sigma=.3, sigma_min=None, **kwargs): # absorb generic param without breaking - super(GaussianWhiteNoise, self).__init__( + super(GaussianWhiteNoisePolicy, self).__init__( env_spec, exploration_anneal_episodes, mu, sigma, sigma_min) @@ -106,7 +106,7 @@ def sample(self): return 
sample -class OUNoise(AnnealedGaussian): +class OUNoisePolicy(AnnealedGaussianPolicy): ''' Based on @@ -116,7 +116,7 @@ class OUNoise(AnnealedGaussian): def __init__(self, env_spec, exploration_anneal_episodes=20, theta=.15, mu=0., sigma=.3, dt=1e-2, x0=None, sigma_min=None, **kwargs): # absorb generic param without breaking - super(OUNoise, self).__init__( + super(OUNoisePolicy, self).__init__( env_spec, exploration_anneal_episodes, mu, sigma, sigma_min, **kwargs) diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index 7b963f7..759ef0f 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -20,10 +20,10 @@ "DDPG" ], "Policy": [ - "GaussianWhiteNoise", + "GaussianWhiteNoisePolicy", "LinearNoisePolicy", "NoNoisePolicy", - "OUNoise" + "OUNoisePolicy" ] }, "actor_critic": { diff --git a/rl/spec/dev_experiment_specs.json b/rl/spec/dev_experiment_specs.json index be835e9..eadbd36 100644 --- a/rl/spec/dev_experiment_specs.json +++ b/rl/spec/dev_experiment_specs.json @@ -120,30 +120,5 @@ "gamma": [0.97, 0.99], "lr": [0.01, 0.1] } - }, - "dev": { - "problem": "CartPole-v0", - "Agent": "DDPG", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "OUNoise", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.01, - "decay": 0.0, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "output_layer_activation": "linear", - "exploration_anneal_episodes": 10, - "auto_architecture": false, - "num_hidden_layers": 3, - "first_hidden_layer_size": 512 - }, - "param_range": { - "gamma": [0.97, 0.99], - "lr": [0.01, 0.1] - } } } From 0b50a69c3c25ff30a212f03876bb028982a2672f Mon Sep 17 00:00:00 2001 From: kengz Date: Wed, 19 Apr 2017 00:03:57 -0400 Subject: [PATCH 43/43] add ac, ddpg tests --- test/test_advanced.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_advanced.py b/test/test_advanced.py index eec9607..7407e56 100644 --- a/test/test_advanced.py +++ b/test/test_advanced.py @@ -51,3 +51,13 @@ def test_breakout_dqn(cls): def test_breakout_double_dqn(cls): data_df = run('breakout_double_dqn') assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_cartpole_ac_argmax(cls): + data_df = run('cartpole_ac_argmax') + assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_pendulum_ddpg(cls): + data_df = run('pendulum_ddpg') + assert isinstance(data_df, pd.DataFrame)
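
For reference, the two mechanisms these DDPG patches keep touching are (1) soft target-network updates governed by tau and (2) annealed exploration noise (LinearNoisePolicy / OUNoisePolicy). Below is a minimal, framework-free sketch of both ideas in plain NumPy, assuming illustrative names and default values (soft_update, OUNoise, theta=0.15, dt=1e-2); these are not part of the repository's API, and the sketch only loosely mirrors the TensorFlow target-update ops and the noise policies defined in the patches above.

import numpy as np


def soft_update(target_weights, online_weights, tau=0.001):
    '''Polyak-average the online weights into the target weights:
    target <- tau * online + (1 - tau) * target, the same rule the
    Actor/Critic target-update ops above express in TensorFlow.'''
    return [tau * w + (1. - tau) * t
            for w, t in zip(online_weights, target_weights)]


class OUNoise(object):
    '''Ornstein-Uhlenbeck exploration noise with sigma annealed
    linearly per episode, loosely mirroring OUNoisePolicy above.'''

    def __init__(self, size, theta=0.15, mu=0., sigma=0.3,
                 sigma_min=0.05, anneal_episodes=20, dt=1e-2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.sigma = sigma
        self.sigma_min = sigma_min
        # per-episode slope of the sigma decay
        self.m = -(sigma - sigma_min) / float(anneal_episodes)
        self.epi = 0
        self.x_prev = np.zeros(size)

    @property
    def current_sigma(self):
        return max(self.sigma_min, self.m * self.epi + self.sigma)

    def sample(self):
        # discretized OU process: mean-reverting drift plus Gaussian noise
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.current_sigma * np.sqrt(self.dt) * \
            np.random.normal(size=self.size)
        self.x_prev = x
        return x

    def update(self, epi):
        # called once per episode, analogous to Policy.update(sys_vars)
        self.epi = epi


# usage sketch: perturb the actor's action, then soft-update the targets
# noise = OUNoise(size=action_dim)
# action = actor_prediction + noise.sample()
# target_weights = soft_update(target_weights, online_weights, tau=0.001)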