run_ace.py

import os
import gym
import gym_puddle
import random
import argparse
import numpy as np
from src import utils
from tqdm import tqdm
from pathlib import Path
from joblib import Parallel, delayed
from src.algorithms.ace import BinaryACE
from src.algorithms.etd import BinaryETD
from src.algorithms.tdc import BinaryTDC, BinaryGQ
from src.algorithms.tdrc import BinaryTDRC
from src.algorithms.fhat import BinaryFHat
from src.function_approximation.tile_coder import TileCoder
from src.environments.pw import puddleworld
from evaluate_policies import evaluate_policy
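

# This script runs ACE (Actor-Critic with Emphatic weightings) on pre-generated off-policy experience, checkpointing
# the learned policies and their evaluated performance to memory-mapped .npy files so that runs and parameter
# configurations can be processed in parallel.
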
def run_ace(experience_memmap, policies_memmap, performance_memmap, run_num, config_num, parameters, random_seed, experience_memmap_test, num_test_eval):
    # If this run and configuration has already been done (i.e., previous run timed out), exit early:
    if np.count_nonzero(policies_memmap[config_num]['policies'][run_num]) != 0:
        return

    alpha_a, alpha_w, alpha_v, lambda_c, eta = parameters

    # If this is the first run with a set of parameters, save the parameters:
    if run_num == 0:
        policies_memmap[config_num]['parameters'] = (alpha_a, alpha_w, alpha_v, lambda_c, eta, args.gamma, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)
        performance_memmap[config_num]['parameters'] = (alpha_a, alpha_w, alpha_v, lambda_c, eta, args.gamma, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

    # Create the environment to evaluate the learned policy in:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    actor = BinaryACE(env.action_space.n, tc.total_num_tiles, alpha_a / tc.num_active_features)
    if args.all_actions:
        critic = BinaryGQ(env.action_space.n, tc.total_num_tiles, alpha_w / tc.num_active_features, alpha_v / tc.num_active_features, lambda_c)
    elif args.critic == 'ETD':
        critic = BinaryETD(tc.total_num_tiles, alpha_w / tc.num_active_features, lambda_c)
    else:
        critic = BinaryTDRC(tc.total_num_tiles, alpha_w / tc.num_active_features, lambda_c)
    if args.direct_f:
        # Initialize the function approximator being used to estimate the emphatic weightings:
        fhat = BinaryFHat(tc.total_num_tiles, alpha_v / tc.num_active_features, args.normalize)

    i = eval(args.interest_function)  # Create the interest function to use.
    mu = eval(args.behaviour_policy, {'np': np, 'env': env})  # Create the behaviour policy and give it access to numpy and the env.

    policies = np.zeros(num_policies, dtype=policy_dtype)
    performance = np.zeros((num_policies, args.num_evaluation_runs), dtype=float)
    performance_excursions = np.zeros((num_policies, num_test_eval), dtype=float)
    # Raise floating point errors as exceptions so that diverging runs are caught below and recorded as NaN:
    np.seterr(divide='raise', over='raise', invalid='raise')
    try:
        transitions = experience_memmap[run_num]
        gamma_t = 0.
        f_t = 0.
        rho_tm1 = 1.
        indices_t = tc.encode(transitions[0][0])
        for t, transition in enumerate(transitions):
            # Save and evaluate the learned policy if it's a checkpoint timestep:
            if t % args.checkpoint_interval == 0:
                performance[t // args.checkpoint_interval] = [evaluate_policy(actor, tc, env, rng, args.max_timesteps) for _ in range(args.num_evaluation_runs)]
                perf_excursions = []
                for sample in range(num_test_eval):
                    env.state = experience_memmap_test[run_num][sample][0]
                    perf_excursions.append(evaluate_policy(actor, tc, env, rng, args.max_timesteps, state=experience_memmap_test[run_num][sample][0]))
                performance_excursions[t // args.checkpoint_interval] = perf_excursions
                policies[t // args.checkpoint_interval] = (t, np.copy(actor.theta))
            # Unpack the stored transition:
            s_t, a_t, r_tp1, s_tp1, a_tp1, terminal = transition
            gamma_tp1 = args.gamma if not terminal else 0  # Transition-dependent discounting.
            indices_tp1 = tc.encode(s_tp1)
            i_t = i(s_t, gamma_t)
            i_tp1 = i(s_tp1, gamma_tp1)

            # Compute the importance sampling ratio for the policy:
            pi_t = actor.pi(indices_t)
            mu_t = mu(s_t)
            rho_t = pi_t[a_t] / mu_t[a_t]

            if args.direct_f:
                # Estimate emphatic weightings with the function approximator:
                f_t = fhat.estimate(indices_t)
                # Update the function approximator:
                fhat.learn(indices_tp1, gamma_tp1, indices_t, rho_t, i_tp1)
            else:
                # Estimate emphatic weightings with the follow-on trace:
                f_t = (1 - gamma_t) * i_t + rho_tm1 * gamma_t * f_t if args.normalize else i_t + rho_tm1 * gamma_t * f_t
            # The emphasis interpolates between the interest (eta=0) and the emphatic weighting estimate (eta=1):
            m_t = (1 - eta) * i_t + eta * f_t
            if args.all_actions:
                critic.learn(indices_t, a_t, rho_t, gamma_t, r_tp1, indices_tp1, actor.pi(indices_tp1), gamma_tp1)
                q_t = critic.estimate(indices_t)
                actor.all_actions_learn(indices_t, q_t, m_t)
            elif args.critic == 'ETD':
                delta_t = r_tp1 + gamma_tp1 * critic.estimate(indices_tp1) - critic.estimate(indices_t)
                critic.learn(delta_t, indices_t, gamma_t, i_t, rho_t, f_t)
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)
            else:
                delta_t = r_tp1 + gamma_tp1 * critic.estimate(indices_tp1) - critic.estimate(indices_t)
                critic.learn(delta_t, indices_t, gamma_t, indices_tp1, gamma_tp1, rho_t)
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)

            gamma_t = gamma_tp1
            indices_t = indices_tp1
            rho_tm1 = rho_t
        # Save and evaluate the policy after the final timestep:
        policies[-1] = (t + 1, np.copy(actor.theta))
        performance[-1] = [evaluate_policy(actor, tc, env, rng, args.max_timesteps) for _ in range(args.num_evaluation_runs)]
        perf_excursions = []
        for sample in range(num_test_eval):
            env.state = experience_memmap_test[run_num][sample][0]
            perf_excursions.append(evaluate_policy(actor, tc, env, rng, args.max_timesteps, state=experience_memmap_test[run_num][sample][0]))
        performance_excursions[-1] = perf_excursions

        # Save the learned policies and their performance to the memmap:
        performance_memmap[config_num]['results'][run_num] = performance
        performance_memmap[config_num]['results_excursions'][run_num] = performance_excursions
        policies_memmap[config_num]['policies'][run_num] = policies
    except (FloatingPointError, ValueError) as e:
        # Save NaN to indicate the weights overflowed and exit early:
        performance_memmap[config_num]['results'][run_num] = np.full_like(performance, np.NaN)
        performance_memmap[config_num]['results_excursions'][run_num] = np.full_like(performance_excursions, np.NaN)
        policies_memmap[config_num]['policies'][run_num] = np.full_like(policies, np.NaN)
        return


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='A script to run ACE (Actor-Critic with Emphatic weightings).', formatter_class=argparse.ArgumentDefaultsHelpFormatter, allow_abbrev=False)
    parser.add_argument('--output_dir', type=str, default='experiment', help='The directory to write experiment files to.')
    parser.add_argument('--experience_file', type=str, default='experiment/experience.npy', help='The file to read experience from.')
    parser.add_argument('--experience_file_test', type=str, default='experiment/experience_test.npy', help='The file to read experience from for evaluating the excursions objective.')
    parser.add_argument('--num_cpus', type=int, default=-1, help='The number of cpus to use (-1 for all).')

    # Policy evaluation parameters:
    parser.add_argument('--checkpoint_interval', type=int, default=5000, help='The number of timesteps after which to save the learned policy.')
    parser.add_argument('--num_evaluation_runs', type=int, default=5, help='The number of times to evaluate each policy.')
    parser.add_argument('--max_timesteps', type=int, default=1000, help='The maximum number of timesteps allowed per policy evaluation.')
    parser.add_argument('--random_seed', type=int, default=1944801619, help='The master random seed to use.')

    # Experiment parameters:
    parser.add_argument('--critic', type=str, choices=['ETD', 'TDRC'], default='TDRC', help='Which critic to use.')
    parser.add_argument('--direct_f', type=int, choices=[0, 1], default=0, help='Use a function approximator to estimate the emphatic weightings.')
    parser.add_argument('--all_actions', type=int, choices=[0, 1], default=0, help='Use all-actions updates instead of TD error-based updates.')
    parser.add_argument('--normalize', type=int, choices=[0, 1], default=0, help='Estimate the discounted follow-on distribution instead of the discounted follow-on visit counts.')
    parser.add_argument('--interest_function', type=str, default='lambda s, g=1: 1.', help='Interest function to use. Example: \'lambda s, g=1: 1. if g==0. else 0.\' (episodic interest function).')
    parser.add_argument('--behaviour_policy', type=str, default='lambda s: np.ones(env.action_space.n)/env.action_space.n', help='Policy to use. Default is uniform random. Another example: \'lambda s: np.array([.9, .05, .05]) if s[1] < 0 else np.array([.05, .05, .9])\' (energy pumping policy with 15 percent randomness).')
    parser.add_argument('--environment', type=str, default='MountainCar-v0', help='An OpenAI Gym environment string.')
    parser.add_argument('--gamma', '--discount_rate', type=float, default=.99, help='Discount rate.')
    parser.add_argument('-p', '--parameters', type=float, nargs=5, action='append', metavar=('alpha_a', 'alpha_w', 'alpha_v', 'lambda', 'eta'), help='Parameters to use. Can be specified multiple times to run multiple configurations in parallel.')
    parser.add_argument('--num_tiles_per_dim', type=int, nargs='+', default=[4, 4], help='The number of tiles per dimension to use in the tile coder.')
    parser.add_argument('--num_tilings', type=int, default=8, help='The number of tilings to use in the tile coder.')
    parser.add_argument('--bias_unit', type=int, choices=[0, 1], default=1, help='Whether or not to include a bias unit in the tile coder.')
    args, unknown_args = parser.parse_known_args()
    # Save the command line arguments in a format interpretable by argparse:
    output_dir = Path(args.output_dir)
    utils.save_args_to_file(args, output_dir / Path(parser.prog).with_suffix('.args'))

    # Load the training experience as a memmap to prevent a copy being loaded into memory in each sub-process:
    experience_memmap = np.lib.format.open_memmap(args.experience_file, mode='r')
    num_runs, num_timesteps = experience_memmap.shape

    # Load the test experience (for the excursions objective) as a memmap for the same reason:
    experience_memmap_test = np.lib.format.open_memmap(args.experience_file_test, mode='r')
    num_runs, num_test_eval = experience_memmap_test.shape
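    # Note: num_runs is re-derived here, so the test experience file is expected to contain the same number of runs
    # as the training experience file.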

    # Sample a random seed for each run without replacement so that no two runs share a seed (the birthday problem):
    random.seed(args.random_seed)
    random_seeds = random.sample(range(2**32), num_runs)

    # Create the tile coder to be used for all parameter settings:
    if args.environment == 'pw':
        dummy_env = puddleworld()
    else:
        dummy_env = gym.make(args.environment).unwrapped  # Make a dummy env to get shape info.
    tc = TileCoder(np.array([dummy_env.observation_space.low, dummy_env.observation_space.high]).T, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

    # Create the memmapped array of learned policies that will be populated in parallel:
    parameters_dtype = np.dtype([
        ('alpha_a', float),
        ('alpha_w', float),
        ('alpha_v', float),
        ('lambda', float),
        ('eta', float),
        ('gamma', float),
        ('num_tiles_per_dim', int, (len(args.num_tiles_per_dim),)),
        ('num_tilings', int),
        ('bias_unit', bool)
    ])
    policy_dtype = np.dtype([
        ('timesteps', int),
        ('weights', float, (dummy_env.action_space.n, tc.total_num_tiles))
    ])
    num_policies = num_timesteps // args.checkpoint_interval + 1
    configuration_dtype = np.dtype([
        ('parameters', parameters_dtype),
        ('policies', policy_dtype, (num_runs, num_policies))
    ])
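    # policies.npy holds one entry per parameter configuration: the parameter settings plus a
    # (num_runs, num_policies) array of checkpointed policy weights.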
    policies_memmap_path = str(output_dir / 'policies.npy')
    if os.path.isfile(policies_memmap_path):
        policies_memmap = np.lib.format.open_memmap(policies_memmap_path, mode='r+')
    else:
        policies_memmap = np.lib.format.open_memmap(policies_memmap_path, shape=(len(args.parameters),), dtype=configuration_dtype, mode='w+')

    # Create the memmapped array of performance results for the learned policies:
    performance_dtype = np.dtype([
        ('parameters', parameters_dtype),
        ('results', float, (num_runs, num_policies, args.num_evaluation_runs)),
        ('results_excursions', float, (num_runs, num_policies, num_test_eval))
    ])
    performance_memmap_path = str(output_dir / 'performance.npy')
    if os.path.isfile(performance_memmap_path):
        performance_memmap = np.lib.format.open_memmap(performance_memmap_path, mode='r+')
    else:
        performance_memmap = np.lib.format.open_memmap(performance_memmap_path, shape=(len(args.parameters),), dtype=performance_dtype, mode='w+')
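
    # Note: reopening an existing memmap in 'r+' mode lets a previously interrupted experiment resume; run_ace skips
    # any (configuration, run) pair whose policies entry already contains data.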
    # Run ACE for each configuration in parallel:
    with utils.tqdm_joblib(tqdm(total=num_runs * len(args.parameters), smoothing=0)) as progress_bar:
        Parallel(n_jobs=args.num_cpus, verbose=0)(
            delayed(run_ace)(experience_memmap, policies_memmap, performance_memmap, run_num, config_num, parameters, random_seed, experience_memmap_test, num_test_eval)
            for config_num, parameters in enumerate(args.parameters)
            for run_num, random_seed in enumerate(random_seeds)
        )
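
# Example invocation (illustrative only; the experience files must already have been generated, and the parameter
# values below are arbitrary placeholders given in the order alpha_a alpha_w alpha_v lambda eta):
#   python run_ace.py --output_dir experiment --experience_file experiment/experience.npy \
#       --experience_file_test experiment/experience_test.npy --environment MountainCar-v0 --critic TDRC \
#       -p 0.01 0.1 0.01 0.0 1.0 -p 0.001 0.1 0.01 0.0 1.0
#
# A minimal sketch for inspecting the saved results afterwards (path assumes the default --output_dir):
#   performance = np.lib.format.open_memmap('experiment/performance.npy', mode='r')
#   mean_return = performance[0]['results'].mean(axis=-1)  # shape (num_runs, num_policies), averaged over evaluation runs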