From f8663eaf11803ae9256a03f326d4d58ced9bcca7 Mon Sep 17 00:00:00 2001
From: Elman Mansimov
Date: Sat, 30 Sep 2017 17:21:04 -0400
Subject: [PATCH] fixes acktr_cont issues

---
 baselines/acktr/acktr_cont.py      | 8 +++++---
 baselines/acktr/run_mujoco.py      | 2 +-
 baselines/acktr/value_functions.py | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/baselines/acktr/acktr_cont.py b/baselines/acktr/acktr_cont.py
index f0b0667afe..22cfbb1f46 100644
--- a/baselines/acktr/acktr_cont.py
+++ b/baselines/acktr/acktr_cont.py
@@ -46,7 +46,7 @@ def rollout(env, policy, max_pathlength, animate=False, obfilter=None):
             "action_dist": np.array(ac_dists), "logp" : np.array(logps)}
 
 def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
-    animate=False, callback=None, optimizer="adam", desired_kl=0.002):
+    animate=False, callback=None, desired_kl=0.002):
 
     obfilter = ZFilter(env.observation_space.shape)
 
@@ -117,14 +117,16 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
 
         # Policy update
         do_update(ob_no, action_na, standardized_adv_n)
 
+        min_stepsize = np.float32(1e-8)
+        max_stepsize = np.float32(1e0)
         # Adjust stepsize
         kl = policy.compute_kl(ob_no, oldac_dist)
         if kl > desired_kl * 2:
             logger.log("kl too high")
-            U.eval(tf.assign(stepsize, stepsize / 1.5))
+            U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
         elif kl < desired_kl / 2:
             logger.log("kl too low")
-            U.eval(tf.assign(stepsize, stepsize * 1.5))
+            U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
         else:
             logger.log("kl just right!")
 
diff --git a/baselines/acktr/run_mujoco.py b/baselines/acktr/run_mujoco.py
index d4bfd843e0..70859c936a 100644
--- a/baselines/acktr/run_mujoco.py
+++ b/baselines/acktr/run_mujoco.py
@@ -39,4 +39,4 @@ def train(env_id, num_timesteps, seed):
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
     args = parser.parse_args()
-    train(args.env_id, num_timesteps=1e6, seed=args.seed)
+    train(args.env, num_timesteps=1e6, seed=args.seed)
diff --git a/baselines/acktr/value_functions.py b/baselines/acktr/value_functions.py
index 5151cfcef0..f4fa4a6994 100644
--- a/baselines/acktr/value_functions.py
+++ b/baselines/acktr/value_functions.py
@@ -13,7 +13,7 @@ def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
         wd_dict = {}
         h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
         h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
-        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
+        vpred_n = dense(h2, 1, "hfinal", weight_init=None, bias_init=0, weight_loss_dict=wd_dict)[:,0]
         sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
         wd_loss = tf.get_collection("vf_losses", None)
         loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
@@ -22,7 +22,7 @@ def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
         optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
             clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
             async=1, kfac_update=2, cold_iter=50, \
-            weight_decay_dict=wd_dict, max_grad_norm=None)
+            weight_decay_dict=wd_dict, max_grad_norm=1.0)
         vf_var_list = []
         for var in tf.trainable_variables():
             if "vf" in var.name: