diff --git a/rl/agent/actor_critic.py b/rl/agent/actor_critic.py
new file mode 100644
index 0000000..945f2c9
--- /dev/null
+++ b/rl/agent/actor_critic.py
@@ -0,0 +1,139 @@
+import numpy as np
+from rl.agent.dqn import DQN
+from rl.util import logger
+
+
+class ActorCritic(DQN):
+
+    '''
+    Actor Critic algorithm. The actor's policy
+    is adjusted in the direction that will lead to
+    better actions, guided by the critic
+    Implementation adapted from
+    http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html
+
+    Assumes one of the policies in actor_critic.py is being used
+    '''
+
+    def __init__(self, env_spec,
+                 train_per_n_new_exp=1,
+                 gamma=0.95, lr=0.1,
+                 epi_change_lr=None,
+                 batch_size=16, n_epoch=5, hidden_layers=None,
+                 hidden_layers_activation='sigmoid',
+                 output_layer_activation='linear',
+                 auto_architecture=False,
+                 num_hidden_layers=3,
+                 first_hidden_layer_size=256,
+                 num_initial_channels=16,
+                 **kwargs):  # absorb generic param without breaking
+        # import only when needed to contain side-effects
+        from keras.layers.core import Dense
+        from keras.models import Sequential, load_model
+        self.Dense = Dense
+        self.Sequential = Sequential
+        self.load_model = load_model
+
+        super(ActorCritic, self).__init__(env_spec,
+                                          train_per_n_new_exp,
+                                          gamma, lr,
+                                          epi_change_lr,
+                                          batch_size, n_epoch, hidden_layers,
+                                          hidden_layers_activation,
+                                          output_layer_activation,
+                                          auto_architecture,
+                                          num_hidden_layers,
+                                          first_hidden_layer_size,
+                                          num_initial_channels,
+                                          **kwargs)
+
+    def build_model(self):
+        self.build_actor()
+        self.build_critic()
+        logger.info("Actor and critic models built")
+
+    def build_actor(self):
+        actor = self.Sequential()
+        super(ActorCritic, self).build_hidden_layers(actor)
+        actor.add(self.Dense(self.env_spec['action_dim'],
+                             init='lecun_uniform',
+                             activation=self.output_layer_activation))
+        logger.info("Actor summary")
+        actor.summary()
+        self.actor = actor
+
+    def build_critic(self):
+        critic = self.Sequential()
+        super(ActorCritic, self).build_hidden_layers(critic)
+        critic.add(self.Dense(1,
+                              init='lecun_uniform',
+                              activation=self.output_layer_activation))
+        logger.info("Critic summary")
+        critic.summary()
+        self.critic = critic
+
+    def compile_model(self):
+        self.actor.compile(
+            loss='mse',
+            optimizer=self.optimizer.keras_optimizer)
+        self.critic.compile(
+            loss='mse',
+            optimizer=self.optimizer.keras_optimizer)
+        logger.info("Actor and critic compiled")
+
+    def recompile_model(self, sys_vars):
+        '''
+        Option to change model optimizer settings
+        Currently only used for changing the learning rate
+        Compiling does not affect the model weights
+        '''
+        if self.epi_change_lr is not None:
+            if (sys_vars['epi'] == self.epi_change_lr and
+                    sys_vars['t'] == 0):
+                self.lr = self.lr / 10.0
+                self.optimizer.change_optim_param(**{'lr': self.lr})
+                self.actor.compile(
+                    loss='mse',
+                    optimizer=self.optimizer.keras_optimizer)
+                self.critic.compile(
+                    loss='mse',
+                    optimizer=self.optimizer.keras_optimizer)
+                logger.info(
+                    'Actor and critic models recompiled with new settings: '
+                    'Learning rate: {}'.format(self.lr))
+
+    def train_critic(self, minibatch):
+        Q_vals = np.clip(self.critic.predict(minibatch['states']),
+                         -self.clip_val, self.clip_val)
+        Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']),
+                              -self.clip_val, self.clip_val)
+        Q_targets = minibatch['rewards'] + self.gamma * \
+            (1 - minibatch['terminals']) * Q_next_vals.squeeze()
+        Q_targets = np.expand_dims(Q_targets, axis=1)
+
+        actor_delta = Q_next_vals - Q_vals
+        loss = self.critic.train_on_batch(minibatch['states'], Q_targets)
+
+        errors = abs(np.sum(Q_vals - Q_targets, axis=1))
+        self.memory.update(errors)
+        return loss, actor_delta
+
+    def train_actor(self, minibatch, actor_delta):
+        old_vals = self.actor.predict(minibatch['states'])
+        if self.env_spec['actions'] == 'continuous':
+            A_targets = np.zeros(
+                (actor_delta.shape[0], self.env_spec['action_dim']))
+            for j in range(A_targets.shape[1]):
+                A_targets[:, j] = actor_delta.squeeze()
+        else:
+            A_targets = minibatch['actions'] * actor_delta + \
+                (1 - minibatch['actions']) * old_vals
+
+        loss = self.actor.train_on_batch(minibatch['states'], A_targets)
+        return loss
+
+    def train_an_epoch(self):
+        minibatch = self.memory.rand_minibatch(self.batch_size)
+        critic_loss, actor_delta = self.train_critic(minibatch)
+        actor_loss = self.train_actor(minibatch, actor_delta)
+        return critic_loss + actor_loss
diff --git a/rl/agent/ddpg.py b/rl/agent/ddpg.py
index 50e06be..cb9efb8 100644
--- a/rl/agent/ddpg.py
+++ b/rl/agent/ddpg.py
@@ -1,54 +1,106 @@
+import numpy as np
 from rl.agent.dqn import DQN
-from rl.util import logger, clone_model, clone_optimizer
+from rl.util import logger, clone_model


-class DDPG(DQN):
+class Actor(DQN):
+    '''
+    Actor of DDPG, with its network and target network
+    input is states, output is action
+    very similar to DQN
+    '''
+
+    def __init__(self, *args, tau=0.001, **kwargs):
+        from keras import backend as K
+        self.K = K
+        self.tf = self.K.tf
+        self.sess = self.K.get_session()
+        self.tau = tau
+        super(Actor, self).__init__(*args, **kwargs)
+
+    def build_model(self):
+        self.model = super(Actor, self).build_model()
+        self.target_model = clone_model(self.model)
+
+        self.actor_states = self.model.inputs[0]
+        self.out = self.model.output
+        self.scaled_out = self.tf.multiply(
+            self.out, self.env_spec['action_bound_high'])
+        self.network_params = self.model.trainable_weights
+
+        self.target_actor_states = self.target_model.inputs[0]
+        self.target_out = self.target_model.output
+        self.target_scaled_out = self.tf.multiply(
+            self.target_out, self.env_spec['action_bound_high'])
+        self.target_network_params = self.target_model.trainable_weights
+
+        # Op for updating target network
+        self.update_target_network_op = []
+        for i, t_w in enumerate(self.target_network_params):
+            op = t_w.assign(
+                self.tf.multiply(
+                    self.tau, self.network_params[i]
+                ) + self.tf.multiply(1.
- self.tau, t_w)) + self.update_target_network_op.append(op) + + # will be fed as self.action_gradient: critic_grads + self.action_gradient = self.tf.placeholder( + self.tf.float32, [None, self.env_spec['action_dim']]) + + # actor model gradient op, to be fed from critic + self.actor_gradients = self.tf.gradients( + self.scaled_out, self.network_params, -self.action_gradient) + + # Optimization op + self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients( + zip(self.actor_gradients, self.network_params)) + return self.model + + def compile_model(self): + pass + + def recompile_model(self, sys_vars): + pass + + def update(self, sys_vars): + self.sess.run(self.update_target_network_op) + + def predict(self, states): + return self.sess.run(self.scaled_out, feed_dict={ + self.actor_states: states + }) + + def target_predict(self, next_states): + return self.sess.run(self.target_scaled_out, feed_dict={ + self.target_actor_states: next_states + }) + + def train_tf(self, states, critic_action_gradient): + return self.sess.run(self.optimize, feed_dict={ + self.actor_states: states, + self.action_gradient: critic_action_gradient + }) + + +class Critic(DQN): ''' - The DDPG agent (algo), from https://arxiv.org/abs/1509.02971 - reference: https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html - https://github.com/matthiasplappert/keras-rl + Critic of DDPG, with its network and target network + input is states and actions, output is Q value + the action is from Actor ''' - def __init__(self, *args, **kwargs): - # import only when needed to contain side-effects + def __init__(self, *args, tau=0.001, critic_lr=0.001, **kwargs): from keras.layers import Dense, Merge - from keras.models import Sequential from keras import backend as K self.Dense = Dense self.Merge = Merge - self.Sequential = Sequential self.K = K - - self.TAU = 0.001 # for target network updates - super(DDPG, self).__init__(*args, **kwargs) - - def compile(self, memory, optimizer, policy, preprocessor): - # override to make 4 optimizers - self.optimizer = optimizer - # clone for actor, critic networks - self.optimizer.actor_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - self.optimizer.target_actor_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - self.optimizer.critic_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - self.optimizer.target_critic_keras_optimizer = clone_optimizer( - self.optimizer.keras_optimizer) - del self.optimizer.keras_optimizer - - super(DDPG, self).compile(memory, self.optimizer, policy, preprocessor) - - def build_actor_models(self): - model = self.Sequential() - self.build_hidden_layers(model) - model.add(self.Dense(self.env_spec['action_dim'], - init='lecun_uniform', - activation=self.output_layer_activation)) - logger.info('Actor model summary') - model.summary() - self.actor = model - self.target_actor = clone_model(self.actor) + self.tf = self.K.tf + self.sess = self.K.get_session() + self.tau = tau + self.critic_lr = critic_lr # suggestion: 10 x actor_lr + super(Critic, self).__init__(*args, **kwargs) def build_critic_models(self): state_branch = self.Sequential() @@ -79,108 +131,136 @@ def build_critic_models(self): model.add(self.Dense(1, init='lecun_uniform', - activation=self.output_layer_activation)) + activation='linear')) # fixed logger.info('Critic model summary') model.summary() - self.critic = model - self.target_critic = clone_model(self.critic) + self.model = model + + logger.info("Model built") + return self.model def 
build_model(self): - self.build_actor_models() - self.build_critic_models() + self.model = self.build_critic_models() + self.target_model = clone_model(self.model) + + self.critic_states = self.model.inputs[0] + self.critic_actions = self.model.inputs[1] + self.out = self.model.output + self.network_params = self.model.trainable_weights - def custom_critic_loss(self, y_true, y_pred): - return self.K.mean(self.K.square(y_true - y_pred)) + self.target_critic_states = self.target_model.inputs[0] + self.target_critic_actions = self.target_model.inputs[1] + self.target_out = self.target_model.output + self.target_network_params = self.target_model.trainable_weights + + # Op for updating target network + self.update_target_network_op = [] + for i, t_w in enumerate(self.target_network_params): + op = t_w.assign( + self.tf.multiply( + self.tau, self.network_params[i] + ) + self.tf.multiply(1. - self.tau, t_w)) + self.update_target_network_op.append(op) + + # custom loss and optimization Op + self.y = self.tf.placeholder(self.tf.float32, [None, 1]) + self.loss = self.tf.losses.mean_squared_error(self.y, self.out) + self.optimize = self.tf.train.AdamOptimizer( + self.critic_lr).minimize(self.loss) + + self.action_gradient = self.tf.gradients(self.out, self.critic_actions) + return self.model + + def update(self, sys_vars): + self.sess.run(self.update_target_network_op) + + def get_action_gradient(self, states, actions): + return self.sess.run(self.action_gradient, feed_dict={ + self.critic_states: states, + self.critic_actions: actions + })[0] + + # def predict(self, inputs, action): + # return self.sess.run(self.out, feed_dict={ + # self.critic_states: inputs, + # self.critic_actions: action + # }) + + def target_predict(self, next_states, mu_prime): + return self.sess.run(self.target_out, feed_dict={ + self.target_critic_states: next_states, + self.target_critic_actions: mu_prime + }) + + def train_tf(self, states, actions, y): + return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ + self.critic_states: states, + self.critic_actions: actions, + self.y: y + }) + + +class DDPG(DQN): + + ''' + DDPG Algorithm, from https://arxiv.org/abs/1509.02971 + has Actor, Critic, and each has its own target network + Implementation referred from https://github.com/pemami4911/deep-rl + ''' + + def __init__(self, *args, **kwargs): + # import only when needed to contain side-effects + from keras import backend as K + self.K = K + self.sess = self.K.get_session() + self.actor = Actor(*args, **kwargs) + self.critic = Critic(*args, **kwargs) + self.sess.run(self.K.tf.global_variables_initializer()) + super(DDPG, self).__init__(*args, **kwargs) + + def build_model(self): + pass def compile_model(self): - self.actor_state = self.actor.inputs[0] - self.action_gradient = self.K.placeholder( - shape=(None, self.env_spec['action_dim'])) - self.actor_grads = self.K.tf.gradients( - self.actor.output, self.actor.trainable_weights, - -self.action_gradient) - self.actor_optimize = self.K.tf.train.AdamOptimizer( - self.lr).apply_gradients( - zip(self.actor_grads, self.actor.trainable_weights)) - - self.critic_state = self.critic.inputs[0] - self.critic_action = self.critic.inputs[1] - self.critic_action_grads = self.K.tf.gradients( - self.critic.output, self.critic_action) - - # self.actor.compile( - # loss='mse', - # optimizer=self.optimizer.actor_keras_optimizer) - self.target_actor.compile( - loss='mse', - optimizer=self.optimizer.target_actor_keras_optimizer) - logger.info("Actor Models compiled") - - 
self.critic.compile( - loss=self.custom_critic_loss, - optimizer=self.optimizer.critic_keras_optimizer) - self.target_critic.compile( - loss='mse', - optimizer=self.optimizer.target_critic_keras_optimizer) - logger.info("Critic Models compiled") + pass + + def recompile_model(self, sys_vars): + pass + + def select_action(self, state): + return self.policy.select_action(state) def update(self, sys_vars): - '''Agent update apart from training the Q function''' + # Update target networks + self.actor.update(sys_vars) + self.critic.update(sys_vars) self.policy.update(sys_vars) self.update_n_epoch(sys_vars) - def train_critic(self, minibatch): - '''update critic network using K-mean loss''' - mu_prime = self.target_actor.predict(minibatch['next_states']) - Q_prime = self.target_critic.predict( - [minibatch['next_states'], mu_prime]) - y = minibatch['rewards'] + self.gamma * \ - (1 - minibatch['terminals']) * Q_prime - critic_loss = self.critic.train_on_batch( - [minibatch['states'], minibatch['actions']], y) - errors = abs(np.sum(Q_prime - y, axis=1)) - self.memory.update(errors) - return critic_loss - - def train_actor(self, minibatch): - '''update actor network using sampled gradient''' - actions = self.actor.predict(minibatch['states']) - # critic_grads = critic.gradients(minibatch['states'], actions) - critic_grads = self.K.get_session().run( - self.critic_action_grads, feed_dict={ - self.critic_state: minibatch['states'], - self.critic_action: actions - })[0] - - # actor.train(minibatch['states'], critic_grads) - self.K.get_session().run(self.actor_optimize, feed_dict={ - self.actor_state: minibatch['states'], - self.action_gradient: critic_grads - }) - actor_loss = 0 - return actor_loss - - def train_target_networks(self): - '''update both target networks''' - actor_weights = self.actor.get_weights() - target_actor_weights = self.target_actor.get_weights() - for i, _w in enumerate(actor_weights): - target_actor_weights[i] = self.TAU * actor_weights[i] + ( - 1 - self.TAU) * target_actor_weights[i] - self.target_actor.set_weights(target_actor_weights) - - critic_weights = self.critic.get_weights() - target_critic_weights = self.target_critic.get_weights() - for i, _w in enumerate(critic_weights): - target_critic_weights[i] = self.TAU * critic_weights[i] + ( - 1 - self.TAU) * target_critic_weights[i] - self.target_critic.set_weights(target_critic_weights) - def train_an_epoch(self): minibatch = self.memory.rand_minibatch(self.batch_size) - critic_loss = self.train_critic(minibatch) - actor_loss = self.train_actor(minibatch) - self.train_target_networks() - loss = critic_loss + actor_loss + # train critic + mu_prime = self.actor.target_predict(minibatch['next_states']) + q_prime = self.critic.target_predict( + minibatch['next_states'], mu_prime) + # reshape for element-wise multiplication + # to feed into network, y shape needs to be (?, 1) + y = minibatch['rewards'] + self.gamma * \ + (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) + y = np.reshape(y, (-1, 1)) + + _, _, critic_loss = self.critic.train_tf( + minibatch['states'], minibatch['actions'], y) + + # train actor + # Update the actor policy using the sampled gradient + actions = self.actor.predict(minibatch['states']) + critic_action_gradient = self.critic.get_action_gradient( + minibatch['states'], actions) + # currently cant be gotten + _actorloss = self.actor.train_tf( + minibatch['states'], critic_action_gradient) + + loss = critic_loss return loss diff --git a/rl/agent/double_dqn.py b/rl/agent/double_dqn.py index 
92e104d..3fd859d 100644 --- a/rl/agent/double_dqn.py +++ b/rl/agent/double_dqn.py @@ -41,14 +41,15 @@ def switch_models(self): self.optimizer.keras_optimizer = self.optimizer.keras_optimizer_2 self.optimizer.keras_optimizer_2 = temp_optimizer - def recompile_model(self, sys_vars): - '''rotate and recompile both models''' - if self.epi_change_lr is not None: - self.switch_models() # to model_2 - super(DoubleDQN, self).recompile_model(sys_vars) - self.switch_models() # back to model - super(DoubleDQN, self).recompile_model(sys_vars) - return self.model + # def recompile_model(self, sys_vars): + # '''rotate and recompile both models''' + # # TODO fix this, double recompile breaks solving power + # if self.epi_change_lr is not None: + # self.switch_models() # to model_2 + # super(DoubleDQN, self).recompile_model(sys_vars) + # self.switch_models() # back to model + # super(DoubleDQN, self).recompile_model(sys_vars) + # return self.model def compute_Q_states(self, minibatch): (Q_states, Q_next_states_select, _max) = super( diff --git a/rl/agent/dqn.py b/rl/agent/dqn.py index 7905270..26e3ae4 100644 --- a/rl/agent/dqn.py +++ b/rl/agent/dqn.py @@ -193,7 +193,7 @@ def train_an_epoch(self): loss = self.model.train_on_batch(minibatch['states'], Q_targets) - errors = abs(np.sum(Q_states - Q_targets, axis=1)) + errors = abs(np.sum(Q_states - Q_targets, axis=1)) self.memory.update(errors) return loss diff --git a/rl/agent/q_table.py b/rl/agent/q_table.py index 436ef71..6618463 100644 --- a/rl/agent/q_table.py +++ b/rl/agent/q_table.py @@ -71,7 +71,9 @@ def compile_model(self): def pixelate_state_space(self, resolution=10): '''chunk up the state space hypercube to specified resolution''' - state_bounds = self.env_spec['state_bounds'] + state_bounds = np.transpose( + [self.env_spec['state_bound_low'], + self.env_spec['state_bound_high']]) self.state_pixels = [np.linspace(*sb, num=resolution+1) for sb in state_bounds] return self.state_pixels diff --git a/rl/analytics.py b/rl/analytics.py index 63cbc80..4d24ad1 100644 --- a/rl/analytics.py +++ b/rl/analytics.py @@ -8,8 +8,10 @@ warnings.filterwarnings("ignore", module="matplotlib") -MPL_BACKEND = 'agg' if ( - environ.get('CI') or platform.system() == 'Darwin') else 'TkAgg' +if platform.system() == 'Darwin': + MPL_BACKEND = 'agg' if args.param_selection else 'macosx' +else: + MPL_BACKEND = 'TkAgg' STATS_COLS = [ 'best_session_epi', diff --git a/rl/experiment.py b/rl/experiment.py index 7ad3b73..3023394 100644 --- a/rl/experiment.py +++ b/rl/experiment.py @@ -133,17 +133,23 @@ def set_env_spec(self): if env.action_space.__class__.__name__ == 'Box': # continuous action_dim = env.action_space.shape[0] actions = 'continuous' + action_low = env.action_space.low + action_high = env.action_space.high else: action_dim = env.action_space.n actions = list(range(env.action_space.n)) + action_low = 0 + action_high = 1 env_spec = { 'problem': PROBLEMS[self.problem], 'state_dim': state_dim, - 'state_bounds': np.transpose( - [env.observation_space.low, env.observation_space.high]), + 'state_bound_low': env.observation_space.low, + 'state_bound_high': env.observation_space.high, 'action_dim': action_dim, 'actions': actions, + 'action_bound_low': action_low, + 'action_bound_high': action_high, 'reward_range': env.reward_range, 'timestep_limit': env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') diff --git a/rl/policy/actor_critic.py b/rl/policy/actor_critic.py new file mode 100644 index 0000000..900ac7d --- /dev/null +++ b/rl/policy/actor_critic.py @@ -0,0 
+1,116 @@
+import numpy as np
+from rl.policy.base_policy import Policy
+from rl.util import log_self
+
+
+class ArgmaxPolicy(Policy):
+
+    '''
+    The argmax policy for actor critic agents
+    Agent takes the action with the highest
+    action score
+    '''
+
+    def __init__(self, env_spec,
+                 **kwargs):  # absorb generic param without breaking
+        super(ArgmaxPolicy, self).__init__(env_spec)
+        log_self(self)
+
+    def select_action(self, state):
+        agent = self.agent
+        state = np.expand_dims(state, axis=0)
+        A_score = agent.actor.predict(state)[0]  # extract from batch predict
+        assert A_score.ndim == 1
+        action = np.argmax(A_score)
+        return action
+
+    def update(self, sys_vars):
+        pass
+
+
+class SoftmaxPolicy(Policy):
+
+    '''
+    The softmax policy for actor critic agents
+    Action is drawn from the prob dist generated
+    by softmax(action_scores)
+    '''
+
+    def __init__(self, env_spec,
+                 **kwargs):  # absorb generic param without breaking
+        super(SoftmaxPolicy, self).__init__(env_spec)
+        self.clip_val = 500
+        log_self(self)
+
+    def select_action(self, state):
+        agent = self.agent
+        state = np.expand_dims(state, axis=0)
+        A_score = agent.actor.predict(state)[0]  # extract from batch predict
+        assert A_score.ndim == 1
+        A_score = A_score.astype('float32')  # fix precision nan issue
+        A_score = A_score - np.amax(A_score)  # prevent overflow
+        exp_values = np.exp(
+            np.clip(A_score, -self.clip_val, self.clip_val))
+        assert not np.isnan(exp_values).any()
+        probs = np.array(exp_values / np.sum(exp_values))
+        probs /= probs.sum()  # renormalize to prevent floating pt error
+        action = np.random.choice(agent.env_spec['actions'], p=probs)
+        return action
+
+    def update(self, sys_vars):
+        pass
+
+
+class GaussianPolicy(Policy):
+
+    '''
+    Continuous policy for actor critic models
+    Output of the actor network is the mean action
+    along each dimension. Action chosen is the mean
+    plus some noise parameterized by the variance
+    '''
+
+    def __init__(self, env_spec,
+                 variance=1.0,
+                 **kwargs):  # absorb generic param without breaking
+        super(GaussianPolicy, self).__init__(env_spec)
+        self.variance = variance
+        log_self(self)
+
+    def select_action(self, state):
+        agent = self.agent
+        state = np.expand_dims(state, axis=0)
+        a_mean = agent.actor.predict(state)[0]  # extract from batch predict
+        action = a_mean + np.random.normal(
+            loc=0.0, scale=self.variance, size=a_mean.shape)
+        return action
+
+    def update(self, sys_vars):
+        pass
+
+
+class BoundedPolicy(Policy):
+
+    '''
+    The bounded policy for actor critic agents
+    and continuous, bounded action spaces
+    Action bounded above and below by
+    - action_bound, + action_bound
+    '''
+
+    def __init__(self, env_spec,
+                 **kwargs):  # absorb generic param without breaking
+        super(BoundedPolicy, self).__init__(env_spec)
+        self.action_bound = env_spec['action_bound_high']
+        assert env_spec['action_bound_high'] == -env_spec['action_bound_low']
+        log_self(self)
+
+    def select_action(self, state):
+        agent = self.agent
+        state = np.expand_dims(state, axis=0)
+        A_score = agent.actor.predict(state)[0]  # extract from batch predict
+        action = np.tanh(A_score) * self.action_bound
+        return action
+
+    def update(self, sys_vars):
+        pass
diff --git a/rl/policy/noise.py b/rl/policy/noise.py
index 4532f46..fec9507 100644
--- a/rl/policy/noise.py
+++ b/rl/policy/noise.py
@@ -1,39 +1,24 @@
 import numpy as np
-from rl.util import logger
+from rl.util import log_self
 from rl.policy.base_policy import Policy


-class AnnealedGaussian(Policy):
+class NoNoisePolicy(Policy):

     '''
-    Noise policy, mainly for DDPG.
-    Original inspiration from
-    https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py
+    The base class for noise policy for DDPG
+    default is no noise
     '''

     def __init__(self, env_spec,
-                 mu, sigma, sigma_min,
                  **kwargs):  # absorb generic param without breaking
-        super(AnnealedGaussian, self).__init__(env_spec)
-        self.size = self.env_spec['action_dim']
-        self.n_steps_annealing = self.env_spec['timestep_limit'] / 2
-        self.mu = mu
-        self.sigma = sigma
-        self.n_steps = 0
-
-        if sigma_min is not None:
-            self.m = -float(sigma - sigma_min) / float(self.n_steps_annealing)
-            self.c = sigma
-            self.sigma_min = sigma_min
-        else:
-            self.m = 0.
-            self.c = sigma
-            self.sigma_min = sigma
+        super(NoNoisePolicy, self).__init__(env_spec)
+        log_self(self)

-    @property
-    def current_sigma(self):
-        sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
-        return sigma
+    def sample(self):
+        '''implement noise here, default is none'''
+        assert 'actions' in self.env_spec
+        return 0

     def select_action(self, state):
         agent = self.agent
@@ -44,39 +29,96 @@ def select_action(self, state):
         Q_state = agent.actor.predict(state)[0]
         assert Q_state.ndim == 1
         action = np.argmax(Q_state)
-        logger.info(str(Q_state)+' '+str(action))
         return action

     def update(self, sys_vars):
         pass


-class GaussianWhiteNoise(AnnealedGaussian):
+class LinearNoisePolicy(NoNoisePolicy):

-    def __init__(self, env_spec,
+    '''
+    policy with linearly decaying noise (1. / (1. + self.epi))
+    '''
+
+    def __init__(self, env_spec, exploration_anneal_episodes=20,
+                 **kwargs):  # absorb generic param without breaking
+        super(LinearNoisePolicy, self).__init__(env_spec)
+        self.exploration_anneal_episodes = exploration_anneal_episodes
+        self.n_step = 0  # init
+        log_self(self)
+
+    def sample(self):
+        noise = (1. / (1. + self.n_step))
+        return noise
+
+    def update(self, sys_vars):
+        epi = sys_vars['epi']
+        if epi >= self.exploration_anneal_episodes:
+            self.n_step = np.inf  # noise decays to zero
+        else:
+            self.n_step = sys_vars['epi']
+
+
+class AnnealedGaussianPolicy(LinearNoisePolicy):
+
+    '''
+    Base class of random noise policy for DDPG
+    Adapted from
+    https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py
+    '''
+
+    def __init__(self, env_spec, exploration_anneal_episodes,
+                 mu, sigma, sigma_min,
+                 **kwargs):  # absorb generic param without breaking
+        super(AnnealedGaussianPolicy, self).__init__(
+            env_spec, exploration_anneal_episodes)
+        self.size = env_spec['action_dim']
+        self.mu = mu
+        self.sigma = sigma
+
+        if sigma_min is not None:
+            self.m = -(sigma - sigma_min) / self.exploration_anneal_episodes
+            self.c = sigma
+            self.sigma_min = sigma_min
+        else:
+            self.m = 0.
+ self.c = sigma + self.sigma_min = sigma + + @property + def current_sigma(self): + sigma = max(self.sigma_min, self.m * self.n_step + self.c) + return sigma + + +class GaussianWhiteNoisePolicy(AnnealedGaussianPolicy): + + def __init__(self, env_spec, exploration_anneal_episodes=20, mu=0., sigma=.3, sigma_min=None, **kwargs): # absorb generic param without breaking - super(GaussianWhiteNoise, self).__init__( - env_spec, mu, sigma, sigma_min) + super(GaussianWhiteNoisePolicy, self).__init__( + env_spec, exploration_anneal_episodes, + mu, sigma, sigma_min) def sample(self): sample = np.random.normal(self.mu, self.current_sigma, self.size) - self.n_steps += 1 return sample -class OUNoise(AnnealedGaussian): +class OUNoisePolicy(AnnealedGaussianPolicy): ''' Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab ''' - def __init__(self, env_spec, + def __init__(self, env_spec, exploration_anneal_episodes=20, theta=.15, mu=0., sigma=.3, dt=1e-2, x0=None, sigma_min=None, **kwargs): # absorb generic param without breaking - super(OUNoise, self).__init__( - env_spec, mu, sigma, sigma_min, + super(OUNoisePolicy, self).__init__( + env_spec, exploration_anneal_episodes, + mu, sigma, sigma_min, **kwargs) self.theta = theta self.mu = mu @@ -92,5 +134,4 @@ def sample(self): (self.mu - self.x_prev) * self.dt + self.current_sigma * \ np.sqrt(self.dt) * np.random.normal(size=self.size) self.x_prev = x - self.n_steps += 1 return x diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json index 37fdedc..c8b0d0e 100644 --- a/rl/spec/box2d_experiment_specs.json +++ b/rl/spec/box2d_experiment_specs.json @@ -241,5 +241,29 @@ [400, 200, 100] ] } + }, + "lunar_ac_softmax": { + "problem": "LunarLander-v2", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05, 0.1], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [400, 200], + [800, 400], + [400, 200, 100] + ] + } } } diff --git a/rl/spec/classic_experiment_specs.json b/rl/spec/classic_experiment_specs.json index 3658017..7878981 100644 --- a/rl/spec/classic_experiment_specs.json +++ b/rl/spec/classic_experiment_specs.json @@ -338,6 +338,52 @@ ] } }, + "cartpole_ac_argmax": { + "problem": "CartPole-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "ArgmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "cartpole_ac_softmax": { + "problem": "CartPole-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": [0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, "dqn_v1": { "problem": 
"CartPole-v1", "Agent": "DQN", @@ -452,54 +498,52 @@ ] } }, - "freeze_dqn": { - "problem": "CartPole-v0", - "Agent": "FreezeDQN", + "offpol_sarsa_v1": { + "problem": "CartPole-v1", + "Agent": "OffPolicySarsa", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", "Policy": "BoltzmannPolicy", "PreProcessor": "NoPreProcessor", "param": { - "lr": 0.001, - "gamma": 0.99, - "hidden_layers": [32], + "lr": 0.02, + "gamma": 0.999, + "hidden_layers": [128], "hidden_layers_activation": "sigmoid", "exploration_anneal_episodes": 10 }, "param_range": { "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.95, 0.96, 0.97, 0.99], + "gamma": [0.97, 0.99, 0.999], "hidden_layers": [ - [8], - [16], - [32] + [32], + [64], + [128], + [32, 16] ] } }, - "offpol_sarsa_v1": { + "cartpole_v1_ac_softmax": { "problem": "CartPole-v1", - "Agent": "OffPolicySarsa", + "Agent": "ActorCritic", "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "BoltzmannPolicy", + "Policy": "SoftmaxPolicy", "PreProcessor": "NoPreProcessor", "param": { "lr": 0.02, - "gamma": 0.999, - "hidden_layers": [128], - "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 10 + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" }, "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], "gamma": [0.97, 0.99, 0.999], "hidden_layers": [ - [32], [64], - [128], - [32, 16] + [128] ] } }, @@ -517,14 +561,15 @@ "gamma": 0.999, "hidden_layers": [200, 100], "hidden_layers_activation": "sigmoid", - "exploration_anneal_episodes": 200 + "exploration_anneal_episodes": 200, + "max_mem_len": 50000 }, "param_range": { - "lr": [0.001, 0.005, 0.01, 0.02], - "gamma": [0.95, 0.97, 0.99, 0.999], + "lr": [0.001], + "gamma": [0.99, 0.999], + "max_mem_len": [50000, 100000], "hidden_layers": [ [100], - [200], [200, 100], [400, 200, 100] ] @@ -679,38 +724,106 @@ "exploration_anneal_episodes": [200, 400] } }, - "pendulum": { + "acrobot_ac_softmax": { + "problem": "Acrobot-v1", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05, 0.1], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [100], + [200], + [200, 100] + ] + } + }, + "pendulum_ac_gaussian": { + "problem": "Pendulum-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "GaussianPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.05, + "gamma": 0.999, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05], + "gamma": [0.97, 0.99, 0.999], + "variance": [0.1, 0.5, 1.0], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "pendulum_ac_bounded": { + "problem": "Pendulum-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "BoundedPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.05, + "gamma": 0.999, + "hidden_layers": [400, 300], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.02, 0.05, 0.1], + "gamma": 
[0.95, 0.97, 0.99, 0.999], + "hidden_layers": [ + [64], + [128] + ] + } + }, + "pendulum_ddpg": { "problem": "Pendulum-v0", "Agent": "DDPG", - "HyperOptimizer": "RandomSearch", + "HyperOptimizer": "GridSearch", "Memory": "LinearMemoryWithForgetting", "Optimizer": "AdamOptimizer", - "Policy": "OUNoise", + "Policy": "LinearNoisePolicy", "PreProcessor": "NoPreProcessor", "param": { - "max_evals": 40, + "batch_size": 64, "n_epoch": 1, - "lr": 0.001, - "gamma": 0.99, + "tau": 0.001, + "lr": 0.0001, + "critic_lr": 0.001, + "exploration_anneal_episodes": 50, + "gamma": 0.999, "hidden_layers": [400, 300], "hidden_layers_activation": "relu", - "exploration_anneal_episodes": 500 + "output_layer_activation": "tanh" }, "param_range": { - "lr": { - "min": 0.0005, - "max": 0.05 - }, - "gamma": { - "min": 0.90, - "max": 0.9999 - }, + "tau": [0.001, 0.005, 0.01], + "lr": [0.0001, 0.001, 0.01], + "critic_lr": [0.001, 0.01], + "gamma": [0.97, 0.99, 0.999], "hidden_layers": [ [400], - [200, 100], [400, 300] - ], - "exploration_anneal_episodes": [200, 400, 600] + ] } }, "mountain_dqn": { @@ -822,5 +935,29 @@ [400] ] } + }, + "mountain_ac_softmax": { + "problem": "MountainCar-v0", + "Agent": "ActorCritic", + "HyperOptimizer": "GridSearch", + "Memory": "LinearMemoryWithForgetting", + "Optimizer": "AdamOptimizer", + "Policy": "SoftmaxPolicy", + "PreProcessor": "NoPreProcessor", + "param": { + "lr": 0.02, + "gamma": 0.99, + "hidden_layers": [64], + "hidden_layers_activation": "sigmoid" + }, + "param_range": { + "lr": [0.005, 0.01, 0.05, 0.1], + "gamma": [0.97, 0.99, 0.999], + "hidden_layers": [ + [200], + [400, 200], + [400, 200, 100] + ] + } } } diff --git a/rl/spec/component_locks.json b/rl/spec/component_locks.json index c657604..759ef0f 100644 --- a/rl/spec/component_locks.json +++ b/rl/spec/component_locks.json @@ -12,6 +12,63 @@ "DoubleDQNEpsilonGreedyPolicy" ] }, + "ddpg": { + "type": "mutex", + "details": "ddpg uses white-noise policy", + "head": "Agent", + "Agent": [ + "DDPG" + ], + "Policy": [ + "GaussianWhiteNoisePolicy", + "LinearNoisePolicy", + "NoNoisePolicy", + "OUNoisePolicy" + ] + }, + "actor_critic": { + "type": "mutex", + "details": "actor critic uses custom Q computation in its policy", + "head": "Agent", + "Agent": [ + "ActorCritic" + ], + "Policy": [ + "ArgmaxPolicy", + "BoundedPolicy", + "GaussianPolicy", + "SoftmaxPolicy" + ] + }, + "actor_critic_discrete": { + "type": "subset", + "details": "actor critic discrete components cannot work in continuous action space", + "head": "problem", + "problem": [ + "Acrobot-v1", + "AirRaid-v0", + "Alien-v0", + "Assault-v0", + "Breakout-v0", + "CartPole-v0", + "CartPole-v1", + "DevBreakout-v0", + "DevCartPole-v0", + "FlappyBird-v0", + "LunarLander-v2", + "MountainCar-v0", + "MsPacman-v0", + "Pong-v0", + "Qbert-v0", + "Snake-v0", + "SpaceInvader-v0", + "TestPassCartPole-v0" + ], + "Policy": [ + "ArgmaxPolicy", + "SoftmaxPolicy" + ] + }, "discrete_action": { "type": "subset", "details": "discrete components cannot work in continuous action space", diff --git a/rl/spec/dev_experiment_specs.json b/rl/spec/dev_experiment_specs.json index be835e9..eadbd36 100644 --- a/rl/spec/dev_experiment_specs.json +++ b/rl/spec/dev_experiment_specs.json @@ -120,30 +120,5 @@ "gamma": [0.97, 0.99], "lr": [0.01, 0.1] } - }, - "dev": { - "problem": "CartPole-v0", - "Agent": "DDPG", - "HyperOptimizer": "GridSearch", - "Memory": "LinearMemoryWithForgetting", - "Optimizer": "AdamOptimizer", - "Policy": "OUNoise", - "PreProcessor": "NoPreProcessor", - "param": { - "lr": 0.01, - 
"decay": 0.0, - "gamma": 0.99, - "hidden_layers": [64], - "hidden_layers_activation": "sigmoid", - "output_layer_activation": "linear", - "exploration_anneal_episodes": 10, - "auto_architecture": false, - "num_hidden_layers": 3, - "first_hidden_layer_size": 512 - }, - "param_range": { - "gamma": [0.97, 0.99], - "lr": [0.01, 0.1] - } } } diff --git a/rl/spec/problems.json b/rl/spec/problems.json index 40afb3f..a0337ff 100644 --- a/rl/spec/problems.json +++ b/rl/spec/problems.json @@ -44,7 +44,7 @@ "Pendulum-v0": { "GYM_ENV_NAME": "Pendulum-v0", "SOLVED_MEAN_REWARD": null, - "MAX_EPISODES": 1000, + "MAX_EPISODES": 300, "REWARD_MEAN_LEN": 100 }, "LunarLander-v2": { diff --git a/rl/util.py b/rl/util.py index 422df42..23e3e65 100644 --- a/rl/util.py +++ b/rl/util.py @@ -23,6 +23,65 @@ } +# parse_args to add flag +parser = argparse.ArgumentParser(description='Set flags for functions') +parser.add_argument("-b", "--blind", + help="dont render graphics", + action="store_const", + dest="render", + const=False, + default=True) +parser.add_argument("-d", "--debug", + help="activate debug log", + action="store_const", + dest="loglevel", + const=logging.DEBUG, + default=logging.INFO) +parser.add_argument("-e", "--experiment", + help="specify experiment to run", + action="store", + type=str, + nargs='?', + dest="experiment", + default="dev_dqn") +parser.add_argument("-p", "--param_selection", + help="run parameter selection if present", + action="store_true", + dest="param_selection", + default=False) +parser.add_argument("-q", "--quiet", + help="change log to warning level", + action="store_const", + dest="loglevel", + const=logging.WARNING, + default=logging.INFO) +parser.add_argument("-t", "--times", + help="number of times session is run", + action="store", + nargs='?', + type=int, + dest="times", + default=1) +parser.add_argument("-x", "--max_episodes", + help="manually set environment max episodes", + action="store", + nargs='?', + type=int, + dest="max_epis", + default=-1) +args = parser.parse_args([]) if environ.get('CI') else parser.parse_args() + +# Goddam python logger +logger = logging.getLogger(__name__) +handler = logging.StreamHandler(sys.stdout) +handler.setFormatter( + logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) +logger.setLevel(args.loglevel) +logger.addHandler(handler) +logger.propagate = False +environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # mute tf warnings on optimized setup + + def check_equal(iterator): '''check if list contains all the same elements''' iterator = iter(iterator) @@ -54,16 +113,16 @@ def check_lock(lock_name, lock, experiment_spec): # rest must all have the same signature rest_equal = check_equal(bin_rest_list) if not rest_equal: - raise ValueError( + logger.warn( 'All components need to be of the same set, ' 'check component lock "{}" and your spec "{}"'.format( - bin_rest_list, experiment_spec['experiment_name'])) + lock_name, experiment_spec['experiment_name'])) bin_rest = bin_rest_list[0] lock_sig = [bin_head, bin_rest] lock_valid = lock_sig in valid_lock_sig_list if not lock_valid: - raise ValueError( + logger.warn( 'Component lock violated: "{}", spec: "{}"'.format( lock_name, experiment_spec['experiment_name'])) return lock_valid @@ -127,65 +186,6 @@ def import_guard_asset(): PROBLEMS, EXPERIMENT_SPECS = import_guard_asset() -# parse_args to add flag -parser = argparse.ArgumentParser(description='Set flags for functions') -parser.add_argument("-b", "--blind", - help="dont render graphics", - action="store_const", - dest="render", - const=False, - 
default=True) -parser.add_argument("-d", "--debug", - help="activate debug log", - action="store_const", - dest="loglevel", - const=logging.DEBUG, - default=logging.INFO) -parser.add_argument("-e", "--experiment", - help="specify experiment to run", - action="store", - type=str, - nargs='?', - dest="experiment", - default="dev_dqn") -parser.add_argument("-p", "--param_selection", - help="run parameter selection if present", - action="store_true", - dest="param_selection", - default=False) -parser.add_argument("-q", "--quiet", - help="change log to warning level", - action="store_const", - dest="loglevel", - const=logging.WARNING, - default=logging.INFO) -parser.add_argument("-t", "--times", - help="number of times session is run", - action="store", - nargs='?', - type=int, - dest="times", - default=1) -parser.add_argument("-x", "--max_episodes", - help="manually set environment max episodes", - action="store", - nargs='?', - type=int, - dest="max_epis", - default=-1) -args = parser.parse_args([]) if environ.get('CI') else parser.parse_args() - -# Goddam python logger -logger = logging.getLogger(__name__) -handler = logging.StreamHandler(sys.stdout) -handler.setFormatter( - logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) -logger.setLevel(args.loglevel) -logger.addHandler(handler) -logger.propagate = False -environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # mute tf warnings on optimized setup - - def log_self(subject): max_info_len = 300 info = '{}, param: {}'.format( diff --git a/test/test_advanced.py b/test/test_advanced.py index eec9607..7407e56 100644 --- a/test/test_advanced.py +++ b/test/test_advanced.py @@ -51,3 +51,13 @@ def test_breakout_dqn(cls): def test_breakout_double_dqn(cls): data_df = run('breakout_double_dqn') assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_cartpole_ac_argmax(cls): + data_df = run('cartpole_ac_argmax') + assert isinstance(data_df, pd.DataFrame) + + @classmethod + def test_pendulum_ddpg(cls): + data_df = run('pendulum_ddpg') + assert isinstance(data_df, pd.DataFrame)
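
Note (not part of the patch): the two pieces of math this diff leans on most are the soft target-network update that Actor.update and Critic.update run through their tf assign ops, and the TD target assembled in DDPG.train_an_epoch before calling critic.train_tf. A minimal numpy sketch of both, with made-up batch values and q_prime standing in for critic.target_predict, useful for checking the shapes and the terminal-state handling:

    import numpy as np

    def soft_update(weights, target_weights, tau=0.001):
        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
        return [tau * w + (1. - tau) * t_w
                for w, t_w in zip(weights, target_weights)]

    def critic_td_target(rewards, terminals, q_prime, gamma=0.999):
        # y = r + gamma * (1 - done) * Q'(s', mu'(s')), reshaped to (batch, 1)
        y = rewards + gamma * (1 - terminals) * np.reshape(q_prime, (-1,))
        return np.reshape(y, (-1, 1))

    # toy batch of 4; q_prime stands in for critic.target_predict(next_states, mu_prime)
    rewards = np.array([1., 0., 1., 0.])
    terminals = np.array([0., 0., 0., 1.])
    q_prime = np.array([[0.5], [0.2], [0.9], [0.7]])
    print(critic_td_target(rewards, terminals, q_prime))
    # last row is just the reward: terminal transitions do not bootstrap

The soft_update rule above is the same tau-weighted mix the assign ops in Actor.build_model and Critic.build_model implement graph-side; tau=0.001 matches the default used in the pendulum_ddpg spec.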