diff --git a/rl/memory/prioritized_exp_replay.py b/rl/memory/prioritized_exp_replay.py
index 90c2ee0..9d1a239 100644
--- a/rl/memory/prioritized_exp_replay.py
+++ b/rl/memory/prioritized_exp_replay.py
@@ -27,15 +27,16 @@ def __init__(self, env_spec, max_mem_len=10000, e=0.01, alpha=0.6,
         self.prio_tree = SumTree(self.max_mem_len)
         self.head = 0
+        # bump to account for negative reward terms in get_priority;
+        # we cannot just use abs(reward) since its sign matters
         SOLVED_MEAN_REWARD = self.env_spec['problem']['SOLVED_MEAN_REWARD']
-        if SOLVED_MEAN_REWARD > 0:
-            self.min_priority = 0
-        else:
-            self.min_priority = abs(10 * SOLVED_MEAN_REWARD)
+        self.min_priority = abs(10 * SOLVED_MEAN_REWARD)
 
     def get_priority(self, error):
         # add min_priority to prevent root of negative = complex
-        return (self.min_priority + error + self.e) ** self.alpha
+        p = (self.min_priority + error + self.e) ** self.alpha
+        assert not np.isnan(p)
+        return p
 
     def add_exp(self, action, reward, next_state, terminal):
         '''Round robin memory updating'''
diff --git a/rl/spec/box2d_experiment_specs.json b/rl/spec/box2d_experiment_specs.json
index c8b6971..c379ec4 100644
--- a/rl/spec/box2d_experiment_specs.json
+++ b/rl/spec/box2d_experiment_specs.json
@@ -107,14 +107,14 @@
     "PreProcessor": "StackStates",
     "param": {
       "train_per_n_new_exp": 5,
-      "batch_size": 32,
       "lr": 0.005,
       "gamma": 0.99,
       "hidden_layers": [800, 400],
       "hidden_layers_activation": "sigmoid",
       "output_layer_activation": "linear",
       "exploration_anneal_episodes": 150,
-      "epi_change_lr": 200
+      "epi_change_lr": 200,
+      "max_mem_len": 20000
     },
     "param_range": {
       "lr": [0.001, 0.005, 0.01],
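
For context, a minimal standalone sketch (not the repo's code; the values of `SOLVED_MEAN_REWARD`, `e`, `alpha`, and `error` below are hypothetical) of why the unconditional `min_priority` bump matters: with `alpha < 1`, raising a negative base to a fractional power yields a complex number in pure Python and NaN under NumPy, which is what the new `assert not np.isnan(p)` guards against.

```python
import numpy as np

# Illustrative values only; the real ones come from the env spec / agent params.
e, alpha = 0.01, 0.6
SOLVED_MEAN_REWARD = -110.0                  # hypothetical, e.g. a problem solved at negative mean reward
min_priority = abs(10 * SOLVED_MEAN_REWARD)  # 1100.0, always non-negative

error = -50.0                                # errors can be negative when rewards are negative

# Fractional power of a negative base: NaN under NumPy (with a RuntimeWarning).
without_offset = np.float64(error + e) ** alpha

# Shifting by min_priority keeps the base positive, so the priority is well-defined.
with_offset = (min_priority + error + e) ** alpha

print(without_offset, with_offset)  # nan  vs. a positive priority
```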