From 8ea4f4a87afa548832ca17e575b351ec5928c1b0 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Sun, 13 Dec 2020 17:28:49 +0100 Subject: [PATCH] Tune HER hyperparams (#58) * Update her hyperparams * Contrib repo is now required * Save hyperparams * Remove reward offset * Update params * Update hyperparams * Add TQC hyperparam opt support * Update requirements * Update docker image * Attempt to fix CI * Fix bug when using HER + DQN/TQC for hyperparam optimization * Fix SQLAlchemy version * Maybe pip will be happy now? * Use latest contrib version * Test if hack is still needed * Remove hack * Cleanup --- .github/workflows/ci.yml | 2 +- .github/workflows/trained_agents.yml | 2 +- CHANGELOG.md | 7 +- hyperparams/a2c.yml | 8 +- hyperparams/ddpg.yml | 16 +-- hyperparams/her.yml | 163 ++++++++++----------------- hyperparams/ppo.yml | 10 +- hyperparams/sac.yml | 20 ++-- hyperparams/td3.yml | 18 +-- hyperparams/tqc.yml | 22 ++-- requirements.txt | 4 +- scripts/build_docker.sh | 2 +- tests/test_hyperparams_opt.py | 2 + utils/hyperparams_opt.py | 63 +++++++++-- utils/utils.py | 11 +- utils/wrappers.py | 118 +------------------ version.txt | 2 +- 17 files changed, 177 insertions(+), 293 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89c6de35a..42dc99c7e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html pip install -r requirements.txt # Use headless version pip install opencv-python-headless diff --git a/.github/workflows/trained_agents.yml b/.github/workflows/trained_agents.yml index da3935c02..792b94606 100644 --- a/.github/workflows/trained_agents.yml +++ b/.github/workflows/trained_agents.yml @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html pip install -r requirements.txt # Use headless version pip install opencv-python-headless diff --git a/CHANGELOG.md b/CHANGELOG.md index 67522364b..8dde0dc28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,19 @@ -## Pre-Release 0.11.0a0 (WIP) +## Pre-Release 0.11.0a2 (WIP) ### Breaking Changes - Removed `LinearNormalActionNoise` - Evaluation is now deterministic by default, except for Atari games +- `sb3_contrib` is now required +- `TimeFeatureWrapper` was moved to the contrib repo ### New Features - Added option to choose which `VecEnv` class to use for multiprocessing +- Added hyperparameter optimization support for `TQC` ### Bug fixes - Improved detection of Atari games - Fix potential bug in plotting script when there is not enough timesteps +- Fixed a bug when using HER + DQN/TQC for hyperparam optimization ### Documentation @@ -21,6 +25,7 @@ - Changed `PPO` atari hyperparameters (removed vf clipping) - Changed `A2C` atari hyperparameters (eps value of the optimizer) - Updated benchmark script +- Updated hyperparameter optim search space (commented gSDE for A2C/PPO) ## Pre-Release 0.10.0 (2020-10-28) diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml index 90f3b586e..e2beecc74 100644 --- a/hyperparams/a2c.yml +++ b/hyperparams/a2c.yml @@ -125,7 +125,7 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 4 n_timesteps: !!float 2e6 @@ -145,7 +145,7 @@ HalfCheetahBulletEnv-v0: policy_kwargs: "dict(log_std_init=-2, ortho_init=False, full_std=True)" Walker2DBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 4 n_timesteps: !!float 2e6 @@ -198,7 +198,7 @@ AntBulletEnv-v0: # Tuned HopperBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 4 n_timesteps: !!float 2e6 @@ -218,7 +218,7 @@ HopperBulletEnv-v0: # Tuned but unstable # Not working without SDE? ReacherBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 4 n_timesteps: !!float 2e6 diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml index dab6e353e..706804859 100644 --- a/hyperparams/ddpg.yml +++ b/hyperparams/ddpg.yml @@ -60,7 +60,7 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -75,7 +75,7 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -90,7 +90,7 @@ AntBulletEnv-v0: # Tuned HopperBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -107,7 +107,7 @@ HopperBulletEnv-v0: # Tuned Walker2DBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -124,7 +124,7 @@ Walker2DBulletEnv-v0: # TO BE tested HumanoidBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 2e6 policy: 'MlpPolicy' gamma: 0.98 @@ -139,7 +139,7 @@ HumanoidBulletEnv-v0: # To be tuned ReacherBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 @@ -154,7 +154,7 @@ ReacherBulletEnv-v0: # To be tuned InvertedDoublePendulumBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -169,7 +169,7 @@ InvertedDoublePendulumBulletEnv-v0: # To be tuned InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 diff --git a/hyperparams/her.yml b/hyperparams/her.yml index 8a20acf98..dac73aacd 100644 --- a/hyperparams/her.yml +++ b/hyperparams/her.yml @@ -4,7 +4,7 @@ NeckGoalEnvRelativeSparse-v2: # env_wrapper: # - utils.wrappers.HistoryWrapper: # horizon: 2 - # - utils.wrappers.TimeFeatureWrapper + # - sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -31,7 +31,7 @@ NeckGoalEnvRelativeDense-v2: env_wrapper: - utils.wrappers.HistoryWrapperObsDict: horizon: 2 - # - utils.wrappers.TimeFeatureWrapper + # - sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -53,6 +53,22 @@ NeckGoalEnvRelativeDense-v2: goal_selection_strategy: 'future' online_sampling: False +FetchPush-v1: + env_wrapper: + - sb3_contrib.common.wrappers.TimeFeatureWrapper + n_timesteps: !!float 1e6 + policy: 'MlpPolicy' + model_class: 'tqc' + n_sampled_goal: 4 + goal_selection_strategy: 'future' + buffer_size: 1000000 + batch_size: 2048 + gamma: 0.95 + learning_rate: !!float 1e-3 + tau: 0.05 + policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])" + online_sampling: True + # DDPG hyperparams #parking-v0: # n_timesteps: !!float 2e5 @@ -70,121 +86,76 @@ NeckGoalEnvRelativeDense-v2: # online_sampling: True # max_episode_length: 100 - -# SAC hyperparams, her paper parking-v0: n_timesteps: !!float 2e5 policy: 'MlpPolicy' - model_class: 'sac' + model_class: 'tqc' n_sampled_goal: 4 goal_selection_strategy: 'future' buffer_size: 1000000 - batch_size: 256 + batch_size: 1024 gamma: 0.95 learning_rate: !!float 1e-3 - # noise_type: 'normal' - # noise_std: 0.2 - policy_kwargs: "dict(net_arch=[256, 256, 256])" - online_sampling: False - # normalize: True + tau: 0.05 + policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])" + online_sampling: True max_episode_length: 100 - -# TD3 hyperparams, her paper -#parking-v0: -# n_timesteps: !!float 2e5 -# policy: 'MlpPolicy' -# model_class: 'td3' -# n_sampled_goal: 4 -# goal_selection_strategy: 'future' -# buffer_size: 1000000 -# batch_size: 256 -# gamma: 0.95 -# learning_rate: !!float 1e-3 -# noise_type: 'normal' -# noise_std: 0.2 -# policy_kwargs: "dict(net_arch=[256, 256, 256])" -# online_sampling: True -# max_episode_length: 100 - + # normalize: True # Mujoco Robotic Env -# DDPG hyperparams -# FetchReach-v1: -# n_timesteps: !!float 20000 -# policy: 'MlpPolicy' -# model_class: 'ddpg' -# n_sampled_goal: 4 -# goal_selection_strategy: 'future' -# buffer_size: 1000000 -# batch_size: 256 -# gamma: 0.95 -# random_exploration: 0.3 -# actor_lr: !!float 1e-3 -# critic_lr: !!float 1e-3 -# noise_type: 'normal' -# noise_std: 0.2 -# normalize_observations: true -# normalize_returns: false -# policy_kwargs: "dict(layers=[256, 256, 256])" -# online_sampling: True -# NOTE: shoube be run with 8 workers: mpirun -n 8 -# FetchPush-v1: -# n_timesteps: !!float 2e6 -# policy: 'MlpPolicy' -# model_class: 'ddpg' -# n_sampled_goal: 4 -# goal_selection_strategy: 'future' -# buffer_size: 200000 -# batch_size: 256 -# gamma: 0.95 -# random_exploration: 0.3 -# actor_lr: !!float 1e-3 -# critic_lr: !!float 1e-3 -# noise_type: 'normal' -# noise_std: 0.2 -# normalize_observations: true -# normalize_returns: false -# policy_kwargs: "dict(layers=[16, 16, 16])" +FetchSlide-v1: + env_wrapper: + - sb3_contrib.common.wrappers.TimeFeatureWrapper + n_timesteps: !!float 1e6 + policy: 'MlpPolicy' + model_class: 'tqc' + n_sampled_goal: 4 + goal_selection_strategy: 'future' + buffer_size: 1000000 + batch_size: 2048 + gamma: 0.95 + learning_rate: !!float 1e-3 + tau: 0.05 + # ent_coef: 0.01 + policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])" + online_sampling: True FetchPush-v1: env_wrapper: - - utils.wrappers.HistoryWrapperObsDict: - horizon: 2 - # - utils.wrappers.TimeFeatureObsDictWrapper - n_timesteps: !!float 3e6 + - sb3_contrib.common.wrappers.TimeFeatureWrapper + n_timesteps: !!float 1e6 policy: 'MlpPolicy' - model_class: 'sac' + model_class: 'tqc' n_sampled_goal: 4 goal_selection_strategy: 'future' buffer_size: 1000000 - ent_coef: 'auto' + batch_size: 2048 gamma: 0.95 - learning_rate: !!float 7e-4 - use_sde: True - gradient_steps: -1 - train_freq: -1 - n_episodes_rollout: 1 - sde_sample_freq: 10 - # noise_type: 'normal' - # noise_std: 0.2 - learning_starts: 1000 + learning_rate: !!float 1e-3 + tau: 0.05 + # ent_coef: 0.01 + policy_kwargs: "dict(n_critics=2, net_arch=[256, 256, 256])" online_sampling: True - normalize: True FetchPickAndPlace-v1: - n_timesteps: !!float 4e6 + env_wrapper: + - sb3_contrib.common.wrappers.TimeFeatureWrapper + # - utils.wrappers.DoneOnSuccessWrapper: + # reward_offset: 0 + # n_successes: 4 + # - stable_baselines3.common.monitor.Monitor + n_timesteps: !!float 1e6 policy: 'MlpPolicy' - model_class: 'sac' + model_class: 'tqc' n_sampled_goal: 4 goal_selection_strategy: 'future' buffer_size: 1000000 - ent_coef: 'auto' - # batch_size: 256 + batch_size: 1024 gamma: 0.95 - # learning_rate: !!float 1e-3 - learning_starts: 1000 - train_freq: 1 + learning_rate: !!float 1e-3 + tau: 0.05 + policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])" online_sampling: True # SAC hyperparams @@ -202,17 +173,3 @@ FetchReach-v1: learning_starts: 1000 online_sampling: True normalize: True - - -# TD3 hyperparams -# FetchReach-v1: -# n_timesteps: !!float 25000 -# policy: 'MlpPolicy' -# model_class: 'td3' -# n_sampled_goal: 4 -# goal_selection_strategy: 'future' -# buffer_size: 1000000 -# batch_size: 256 -# gamma: 0.95 -# learning_rate: 0.001 -# learning_starts: 1000 diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index bc78bf7b9..c85a9825e 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -133,7 +133,7 @@ LunarLanderContinuous-v2: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 16 n_timesteps: !!float 2e6 @@ -158,7 +158,7 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 16 n_timesteps: !!float 2e6 @@ -183,7 +183,7 @@ AntBulletEnv-v0: # Tuned Walker2DBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 16 n_timesteps: !!float 2e6 @@ -208,7 +208,7 @@ Walker2DBulletEnv-v0: # Tuned HopperBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 16 n_timesteps: !!float 2e6 @@ -233,7 +233,7 @@ HopperBulletEnv-v0: # Tuned ReacherBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper normalize: true n_envs: 8 n_timesteps: !!float 1e6 diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index 5e304cc7b..15b86c803 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -3,7 +3,7 @@ NeckEnvRelative-v2: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 - - utils.wrappers.TimeFeatureWrapper: + - sb3_contrib.common.wrappers.TimeFeatureWrapper: test_mode: False # - utils.wrappers.LowPassFilterWrapper: # freq: 2.0 @@ -117,12 +117,12 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: # env_wrapper: - # - utils.wrappers.TimeFeatureWrapper + # - sb3_contrib.common.wrappers.TimeFeatureWrapper # - utils.wrappers.DelayedRewardWrapper: # delay: 10 # - utils.wrappers.HistoryWrapper: # horizon: 10 - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -139,7 +139,7 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -155,7 +155,7 @@ AntBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" HopperBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -171,7 +171,7 @@ HopperBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" Walker2DBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -189,7 +189,7 @@ Walker2DBulletEnv-v0: # Tuned ReacherBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -218,7 +218,7 @@ HumanoidBulletEnv-v0: # Tuned InvertedDoublePendulumBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 5e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -235,7 +235,7 @@ InvertedDoublePendulumBulletEnv-v0: # Tuned InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -313,7 +313,7 @@ donkey-generated-track-v0: max_episode_steps: 500 - utils.wrappers.HistoryWrapper: horizon: 5 - - utils.wrappers.TimeFeatureWrapper + - sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml index 6e42ee5c8..9b5931668 100644 --- a/hyperparams/td3.yml +++ b/hyperparams/td3.yml @@ -60,7 +60,7 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -74,7 +74,7 @@ HalfCheetahBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" AntBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -88,7 +88,7 @@ AntBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" HopperBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -102,7 +102,7 @@ HopperBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" Walker2DBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -118,7 +118,7 @@ Walker2DBulletEnv-v0: # TO BE tested HumanoidBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 2e6 policy: 'MlpPolicy' gamma: 0.98 @@ -133,7 +133,7 @@ HumanoidBulletEnv-v0: # Tuned ReacherBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 @@ -148,7 +148,7 @@ ReacherBulletEnv-v0: # Tuned InvertedDoublePendulumBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -163,7 +163,7 @@ InvertedDoublePendulumBulletEnv-v0: # Tuned InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 @@ -177,7 +177,7 @@ InvertedPendulumSwingupBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" MinitaurBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.99 diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index d2ff5df85..fe64b2bc6 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -19,9 +19,9 @@ Pendulum-v0: policy: 'MlpPolicy' learning_rate: !!float 1e-3 use_sde: True - n_episodes_rollout: 1 - gradient_steps: -1 - train_freq: -1 + n_episodes_rollout: -1 + gradient_steps: 64 + train_freq: 64 policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])" LunarLanderContinuous-v2: @@ -66,7 +66,7 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -83,7 +83,7 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -100,7 +100,7 @@ AntBulletEnv-v0: # Tuned HopperBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -118,7 +118,7 @@ HopperBulletEnv-v0: # Tuned Walker2DBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -135,7 +135,7 @@ Walker2DBulletEnv-v0: ReacherBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -153,7 +153,7 @@ ReacherBulletEnv-v0: # Almost tuned HumanoidBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e7 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -170,7 +170,7 @@ HumanoidBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" InvertedDoublePendulumBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 5e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -186,7 +186,7 @@ InvertedDoublePendulumBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: utils.wrappers.TimeFeatureWrapper + env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 diff --git a/requirements.txt b/requirements.txt index 4ff48dab2..9f6588142 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -stable-baselines3[extra,tests,docs]>=0.10.0 +stable-baselines3[extra,tests,docs]>=0.11.0a2 box2d-py==2.3.8 pybullet gym-minigrid @@ -7,4 +7,4 @@ optuna pytablewriter seaborn pyyaml>=5.1 -sb3-contrib>=0.10.0 +sb3-contrib>=0.11.0a3 diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index 265b27dfa..d4a21d19f 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -3,7 +3,7 @@ PARENT=stablebaselines/stable-baselines3 TAG=stablebaselines/rl-baselines3-zoo -VERSION=0.10.0 +VERSION=0.11.0 if [[ ${USE_GPU} == "True" ]]; then PARENT="${PARENT}:${VERSION}" diff --git a/tests/test_hyperparams_opt.py b/tests/test_hyperparams_opt.py index 9ebf4f94f..6c5f0b016 100644 --- a/tests/test_hyperparams_opt.py +++ b/tests/test_hyperparams_opt.py @@ -26,6 +26,8 @@ def _assert_eq(left, right): experiments["td3-Pendulum-v0"] = ("td3", "Pendulum-v0") # Test for HER experiments["her-parking-v0"] = ("her", "parking-v0") +# Test for TQC +experiments["tqc-Pendulum-v0"] = ("tqc", "Pendulum-v0") @pytest.mark.parametrize("sampler", ["random", "tpe"]) diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py index c1e32868d..ce7984bab 100644 --- a/utils/hyperparams_opt.py +++ b/utils/hyperparams_opt.py @@ -2,7 +2,8 @@ import numpy as np import optuna -from stable_baselines3 import DDPG, SAC, TD3 +from sb3_contrib import TQC +from stable_baselines3 import DDPG, DQN, SAC, TD3 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise from torch import nn as nn @@ -11,7 +12,7 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]: """ - Sampler for PPO2 hyperparams. + Sampler for PPO hyperparams. :param trial: :return: @@ -21,6 +22,7 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]: gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) learning_rate = trial.suggest_loguniform("lr", 1e-5, 1) lr_schedule = "constant" + # Uncomment to enable learning rate schedule # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant']) ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1) clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4]) @@ -29,8 +31,11 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]: max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]) vf_coef = trial.suggest_uniform("vf_coef", 0, 1) net_arch = trial.suggest_categorical("net_arch", ["small", "medium"]) - log_std_init = trial.suggest_uniform("log_std_init", -4, 1) - sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256]) + # Uncomment for gSDE (continuous actions) + # log_std_init = trial.suggest_uniform("log_std_init", -4, 1) + # Uncomment for gSDE (continuous action) + # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256]) + # Orthogonal initialization ortho_init = False # ortho_init = trial.suggest_categorical('ortho_init', [False, True]) # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu']) @@ -43,6 +48,8 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]: if lr_schedule == "linear": learning_rate = linear_schedule(learning_rate) + # Independent networks usually work best + # when not working with images net_arch = { "small": [dict(pi=[64, 64], vf=[64, 64])], "medium": [dict(pi=[256, 256], vf=[256, 256])], @@ -61,9 +68,9 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]: "gae_lambda": gae_lambda, "max_grad_norm": max_grad_norm, "vf_coef": vf_coef, - "sde_sample_freq": sde_sample_freq, + # "sde_sample_freq": sde_sample_freq, "policy_kwargs": dict( - log_std_init=log_std_init, + # log_std_init=log_std_init, net_arch=net_arch, activation_fn=activation_fn, ortho_init=ortho_init, @@ -81,6 +88,7 @@ def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]: gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True]) max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]) + # Toggle PyTorch RMS Prop (different from TF one, cf doc) use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True]) gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]) n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]) @@ -88,7 +96,8 @@ def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]: learning_rate = trial.suggest_loguniform("lr", 1e-5, 1) ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1) vf_coef = trial.suggest_uniform("vf_coef", 0, 1) - log_std_init = trial.suggest_uniform("log_std_init", -4, 1) + # Uncomment for gSDE (continuous actions) + # log_std_init = trial.suggest_uniform("log_std_init", -4, 1) ortho_init = trial.suggest_categorical("ortho_init", [False, True]) net_arch = trial.suggest_categorical("net_arch", ["small", "medium"]) # sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"]) @@ -123,7 +132,7 @@ def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]: "use_rms_prop": use_rms_prop, "vf_coef": vf_coef, "policy_kwargs": dict( - log_std_init=log_std_init, + # log_std_init=log_std_init, net_arch=net_arch, # full_std=full_std, activation_fn=activation_fn, @@ -142,19 +151,21 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]: """ gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) learning_rate = trial.suggest_loguniform("lr", 1e-5, 1) - batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512]) + batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048]) buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)]) learning_starts = trial.suggest_categorical("learning_starts", [0, 1000, 10000, 20000]) # train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 300]) train_freq = trial.suggest_categorical("train_freq", [8, 16, 32, 64, 128, 256, 512]) # Polyak coeff - tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02]) + tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05]) # gradient_steps takes too much time # gradient_steps = trial.suggest_categorical('gradient_steps', [1, 100, 300]) gradient_steps = train_freq # ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.5, 0.1, 0.05, 0.01, 0.0001]) ent_coef = "auto" + # You can comment that out when not using gSDE log_std_init = trial.suggest_uniform("log_std_init", -4, 1) + # NOTE: Add "verybig" to net_arch when tuning HER net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"]) # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU]) @@ -162,6 +173,8 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]: "small": [64, 64], "medium": [256, 256], "big": [400, 300], + # Uncomment for tuning HER + # "verybig": [256, 256, 256], }[net_arch] target_entropy = "auto" @@ -193,7 +206,7 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]: """ gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) learning_rate = trial.suggest_loguniform("lr", 1e-5, 1) - batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512]) + batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048]) buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)]) episodic = trial.suggest_categorical("episodic", [True, False]) @@ -209,6 +222,7 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]: noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None]) noise_std = trial.suggest_uniform("noise_std", 0, 1) + # NOTE: Add "verybig" to net_arch when tuning HER net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"]) # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU]) @@ -216,6 +230,8 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]: "small": [64, 64], "medium": [256, 256], "big": [400, 300], + # Uncomment for tuning HER + # "verybig": [256, 256, 256], }[net_arch] hyperparams = { @@ -250,7 +266,7 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]: """ gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) learning_rate = trial.suggest_loguniform("lr", 1e-5, 1) - batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512]) + batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048]) buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)]) # Polyak coeff tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02]) @@ -268,6 +284,7 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]: noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None]) noise_std = trial.suggest_uniform("noise_std", 0, 1) + # NOTE: Add "verybig" to net_arch when tuning HER (see TD3) net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"]) # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU]) @@ -354,7 +371,9 @@ def sample_her_params(trial: optuna.Trial) -> Dict[str, Any]: model_class_str = { SAC: "sac", DDPG: "ddpg", + DQN: "dqn", TD3: "td3", + TQC: "tqc", }[trial.model_class] hyperparams = HYPERPARAMS_SAMPLER[model_class_str](trial) @@ -368,12 +387,32 @@ def sample_her_params(trial: optuna.Trial) -> Dict[str, Any]: return hyperparams +def sample_tqc_params(trial: optuna.Trial) -> Dict[str, Any]: + """ + Sampler for TQC hyperparams. + + :param trial: + :return: + """ + # TQC is SAC + Distributional RL + hyperparams = sample_sac_params(trial) + + n_quantiles = trial.suggest_int("n_quantiles", 5, 50) + top_quantiles_to_drop_per_net = trial.suggest_int("top_quantiles_to_drop_per_net", 0, n_quantiles - 1) + + hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles}) + hyperparams["top_quantiles_to_drop_per_net"] = top_quantiles_to_drop_per_net + + return hyperparams + + HYPERPARAMS_SAMPLER = { "a2c": sample_a2c_params, "ddpg": sample_ddpg_params, "dqn": sample_dqn_params, "her": sample_her_params, "sac": sample_sac_params, + "tqc": sample_tqc_params, "ppo": sample_ppo_params, "td3": sample_td3_params, } diff --git a/utils/utils.py b/utils/utils.py index 6f633f77f..0fb5c3ce6 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -8,17 +8,13 @@ import stable_baselines3 as sb3 # noqa: F401 import torch as th # noqa: F401 import yaml +from sb3_contrib import TQC from stable_baselines3 import A2C, DDPG, DQN, HER, PPO, SAC, TD3 from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.env_util import make_vec_env from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike # noqa: F401 from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv, VecFrameStack, VecNormalize -try: - from sb3_contrib import TQC # pytype: disable=import-error -except ImportError: - TQC = None - # For custom activation fn from torch import nn as nn # noqa: F401 pylint: disable=unused-import @@ -30,11 +26,10 @@ "her": HER, "sac": SAC, "td3": TD3, + # SB3 Contrib, + "tqc": TQC, } -if TQC is not None: - ALGOS["tqc"] = TQC - def flatten_dict_observations(env: gym.Env) -> gym.Env: assert isinstance(env.observation_space, gym.spaces.Dict) diff --git a/utils/wrappers.py b/utils/wrappers.py index 32480ddd0..4b3aa77eb 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -1,6 +1,7 @@ import gym import numpy as np from matplotlib import pyplot as plt +from sb3_contrib.common.wrappers import TimeFeatureWrapper # noqa: F401 (backward compatibility) from scipy.signal import iirfilter, sosfilt, zpk2sos @@ -10,7 +11,7 @@ class DoneOnSuccessWrapper(gym.Wrapper): Useful for GoalEnv. """ - def __init__(self, env: gym.Env, reward_offset: float = 1.0, n_successes: int = 1): + def __init__(self, env: gym.Env, reward_offset: float = 0.0, n_successes: int = 1): super(DoneOnSuccessWrapper, self).__init__(env) self.reward_offset = reward_offset self.n_successes = n_successes @@ -36,121 +37,6 @@ def compute_reward(self, achieved_goal, desired_goal, info): return reward + self.reward_offset -class TimeFeatureWrapper(gym.Wrapper): - """ - Add remaining time to observation space for fixed length episodes. - See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13. - - :param env: (gym.Env) - :param max_steps: (int) Max number of steps of an episode - if it is not wrapped in a TimeLimit object. - :param test_mode: (bool) In test mode, the time feature is constant, - equal to zero. This allow to check that the agent did not overfit this feature, - learning a deterministic pre-defined sequence of actions. - """ - - def __init__(self, env: gym.Env, max_steps: int = 1000, test_mode: bool = False): - assert isinstance(env.observation_space, gym.spaces.Box) - # Add a time feature to the observation - low, high = env.observation_space.low, env.observation_space.high - low, high = np.concatenate((low, [0])), np.concatenate((high, [1.0])) - env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) - - super(TimeFeatureWrapper, self).__init__(env) - - try: - self._max_steps = env.spec.max_episode_steps - except AttributeError: - self._max_steps = None - - if self._max_steps is None: - self._max_steps = max_steps - - self._current_step = 0 - self._test_mode = test_mode - - def reset(self): - self._current_step = 0 - return self._get_obs(self.env.reset()) - - def step(self, action): - self._current_step += 1 - obs, reward, done, info = self.env.step(action) - return self._get_obs(obs), reward, done, info - - def _get_obs(self, obs): - """ - Concatenate the time feature to the current observation. - - :param obs: (np.ndarray) - :return: (np.ndarray) - """ - # Remaining time is more general - time_feature = 1 - (self._current_step / self._max_steps) - if self._test_mode: - time_feature = 1.0 - # Optionnaly: concatenate [time_feature, time_feature ** 2] - return np.concatenate((obs, [time_feature])) - - -class TimeFeatureObsDictWrapper(gym.Wrapper): - """ - Add remaining time to observation space for fixed length episodes. - See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13. - - :param env: (gym.Env) - :param max_steps: (int) Max number of steps of an episode - if it is not wrapped in a TimeLimit object. - :param test_mode: (bool) In test mode, the time feature is constant, - equal to zero. This allow to check that the agent did not overfit this feature, - learning a deterministic pre-defined sequence of actions. - """ - - def __init__(self, env: gym.Env, max_steps: int = 1000, test_mode: bool = False): - assert isinstance(env.observation_space, gym.spaces.Dict) - # Add a time feature to the observation - obs_space = env.observation_space.spaces["observation"] - low, high = obs_space.low, obs_space.high - low, high = np.concatenate((low, [0])), np.concatenate((high, [1.0])) - env.observation_space.spaces["observation"] = gym.spaces.Box(low=low, high=high, dtype=np.float32) - - super(TimeFeatureObsDictWrapper, self).__init__(env) - - try: - self._max_steps = env.spec.max_episode_steps - except AttributeError: - self._max_steps = None - - if self._max_steps is None: - self._max_steps = max_steps - - self._current_step = 0 - self._test_mode = test_mode - - def reset(self): - self._current_step = 0 - return self._get_obs(self.env.reset()) - - def step(self, action): - self._current_step += 1 - obs, reward, done, info = self.env.step(action) - return self._get_obs(obs), reward, done, info - - def _get_obs(self, obs): - """ - Concatenate the time feature to the current observation. - - :param obs: (np.ndarray) - :return: (np.ndarray) - """ - # Remaining time is more general - time_feature = 1 - (self._current_step / self._max_steps) - if self._test_mode: - time_feature = 1.0 - obs["observation"] = np.concatenate((obs["observation"], [time_feature])) - return obs - - class ActionNoiseWrapper(gym.Wrapper): """ Add gaussian noise to the action (without telling the agent), diff --git a/version.txt b/version.txt index d22e31d20..a09c7eb7a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.11.0a0 +0.11.0a2