From 8ea4f4a87afa548832ca17e575b351ec5928c1b0 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN <antonin.raffin@ensta.org>
Date: Sun, 13 Dec 2020 17:28:49 +0100
Subject: [PATCH] Tune HER hyperparams (#58)

* Update her hyperparams

* Contrib repo is now required

* Save hyperparams

* Remove reward offset

* Update params

* Update hyperparams

* Add TQC hyperparam opt support

* Update requirements

* Update docker image

* Attempt to fix CI

* Fix bug when using HER + DQN/TQC for hyperparam optimization

* Fix SQLAlchemy version

* Maybe pip will be happy now?

* Use latest contrib version

* Test if hack is still needed

* Remove hack

* Cleanup
---
 .github/workflows/ci.yml             |   2 +-
 .github/workflows/trained_agents.yml |   2 +-
 CHANGELOG.md                         |   7 +-
 hyperparams/a2c.yml                  |   8 +-
 hyperparams/ddpg.yml                 |  16 +--
 hyperparams/her.yml                  | 163 ++++++++++-----------------
 hyperparams/ppo.yml                  |  10 +-
 hyperparams/sac.yml                  |  20 ++--
 hyperparams/td3.yml                  |  18 +--
 hyperparams/tqc.yml                  |  22 ++--
 requirements.txt                     |   4 +-
 scripts/build_docker.sh              |   2 +-
 tests/test_hyperparams_opt.py        |   2 +
 utils/hyperparams_opt.py             |  63 +++++++++--
 utils/utils.py                       |  11 +-
 utils/wrappers.py                    | 118 +------------------
 version.txt                          |   2 +-
 17 files changed, 177 insertions(+), 293 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 89c6de35a..42dc99c7e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         # cpu version of pytorch - faster to download
-        pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+        pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
         pip install -r requirements.txt
         # Use headless version
         pip install opencv-python-headless
diff --git a/.github/workflows/trained_agents.yml b/.github/workflows/trained_agents.yml
index da3935c02..792b94606 100644
--- a/.github/workflows/trained_agents.yml
+++ b/.github/workflows/trained_agents.yml
@@ -30,7 +30,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         # cpu version of pytorch - faster to download
-        pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+        pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
         pip install -r requirements.txt
         # Use headless version
         pip install opencv-python-headless
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67522364b..8dde0dc28 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,15 +1,19 @@
-## Pre-Release 0.11.0a0 (WIP)
+## Pre-Release 0.11.0a2 (WIP)
 
 ### Breaking Changes
 - Removed `LinearNormalActionNoise`
 - Evaluation is now deterministic by default, except for Atari games
+- `sb3_contrib` is now required
+- `TimeFeatureWrapper` was moved to the contrib repo
 
 ### New Features
 - Added option to choose which `VecEnv` class to use for multiprocessing
+- Added hyperparameter optimization support for `TQC`
 
 ### Bug fixes
 - Improved detection of Atari games
 - Fix potential bug in plotting script when there is not enough timesteps
+- Fixed a bug when using HER + DQN/TQC for hyperparam optimization
 
 ### Documentation
 
@@ -21,6 +25,7 @@
 - Changed `PPO` atari hyperparameters (removed vf clipping)
 - Changed `A2C` atari hyperparameters (eps value of the optimizer)
 - Updated benchmark script
+- Updated hyperparameter optimization search space (commented out gSDE for A2C/PPO)
 
 ## Pre-Release 0.10.0 (2020-10-28)
 
diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml
index 90f3b586e..e2beecc74 100644
--- a/hyperparams/a2c.yml
+++ b/hyperparams/a2c.yml
@@ -125,7 +125,7 @@ BipedalWalkerHardcore-v3:
 
 # Tuned
 HalfCheetahBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 4
   n_timesteps: !!float 2e6
@@ -145,7 +145,7 @@ HalfCheetahBulletEnv-v0:
   policy_kwargs: "dict(log_std_init=-2, ortho_init=False, full_std=True)"
 
 Walker2DBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 4
   n_timesteps: !!float 2e6
@@ -198,7 +198,7 @@ AntBulletEnv-v0:
 
 # Tuned
 HopperBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 4
   n_timesteps: !!float 2e6
@@ -218,7 +218,7 @@ HopperBulletEnv-v0:
 # Tuned but unstable
 # Not working without SDE?
 ReacherBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 4
   n_timesteps: !!float 2e6
diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml
index dab6e353e..706804859 100644
--- a/hyperparams/ddpg.yml
+++ b/hyperparams/ddpg.yml
@@ -60,7 +60,7 @@ BipedalWalkerHardcore-v3:
 
 # Tuned
 HalfCheetahBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -75,7 +75,7 @@ HalfCheetahBulletEnv-v0:
 
 # Tuned
 AntBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -90,7 +90,7 @@ AntBulletEnv-v0:
 
 # Tuned
 HopperBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -107,7 +107,7 @@ HopperBulletEnv-v0:
 
 # Tuned
 Walker2DBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -124,7 +124,7 @@ Walker2DBulletEnv-v0:
 
 # TO BE tested
 HumanoidBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 2e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -139,7 +139,7 @@ HumanoidBulletEnv-v0:
 
 # To be tuned
 ReacherBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -154,7 +154,7 @@ ReacherBulletEnv-v0:
 
 # To be tuned
 InvertedDoublePendulumBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -169,7 +169,7 @@ InvertedDoublePendulumBulletEnv-v0:
 
 # To be tuned
 InvertedPendulumSwingupBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   gamma: 0.98
diff --git a/hyperparams/her.yml b/hyperparams/her.yml
index 8a20acf98..dac73aacd 100644
--- a/hyperparams/her.yml
+++ b/hyperparams/her.yml
@@ -4,7 +4,7 @@ NeckGoalEnvRelativeSparse-v2:
   # env_wrapper:
   #   - utils.wrappers.HistoryWrapper:
   #       horizon: 2
-  #   - utils.wrappers.TimeFeatureWrapper
+  #   - sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -31,7 +31,7 @@ NeckGoalEnvRelativeDense-v2:
   env_wrapper:
     - utils.wrappers.HistoryWrapperObsDict:
         horizon: 2
-  #   - utils.wrappers.TimeFeatureWrapper
+  #   - sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -53,6 +53,22 @@ NeckGoalEnvRelativeDense-v2:
   goal_selection_strategy: 'future'
   online_sampling: False
 
+FetchPush-v1:
+  env_wrapper:
+    - sb3_contrib.common.wrappers.TimeFeatureWrapper
+  n_timesteps: !!float 1e6
+  policy: 'MlpPolicy'
+  model_class: 'tqc'
+  n_sampled_goal: 4
+  goal_selection_strategy: 'future'
+  buffer_size: 1000000
+  batch_size: 2048
+  gamma: 0.95
+  learning_rate: !!float 1e-3
+  tau: 0.05
+  policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])"
+  online_sampling: True
+
 # DDPG hyperparams
 #parking-v0:
 #  n_timesteps: !!float 2e5
@@ -70,121 +86,76 @@ NeckGoalEnvRelativeDense-v2:
 #  online_sampling: True
 #  max_episode_length: 100
 
-
-# SAC hyperparams, her paper
 parking-v0:
   n_timesteps: !!float 2e5
   policy: 'MlpPolicy'
-  model_class: 'sac'
+  model_class: 'tqc'
   n_sampled_goal: 4
   goal_selection_strategy: 'future'
   buffer_size: 1000000
-  batch_size: 256
+  batch_size: 1024
   gamma: 0.95
   learning_rate: !!float 1e-3
-  # noise_type: 'normal'
-  # noise_std: 0.2
-  policy_kwargs: "dict(net_arch=[256, 256, 256])"
-  online_sampling: False
-  # normalize: True
+  tau: 0.05
+  policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])"
+  online_sampling: True
   max_episode_length: 100
-
-# TD3 hyperparams, her paper
-#parking-v0:
-#  n_timesteps: !!float 2e5
-#  policy: 'MlpPolicy'
-#  model_class: 'td3'
-#  n_sampled_goal: 4
-#  goal_selection_strategy: 'future'
-#  buffer_size: 1000000
-#  batch_size: 256
-#  gamma: 0.95
-#  learning_rate: !!float 1e-3
-#  noise_type: 'normal'
-#  noise_std: 0.2
-#  policy_kwargs: "dict(net_arch=[256, 256, 256])"
-#  online_sampling: True
-#  max_episode_length: 100
-
+  # normalize: True
 
 # Mujoco Robotic Env
-# DDPG hyperparams
-# FetchReach-v1:
-#   n_timesteps: !!float 20000
-#   policy: 'MlpPolicy'
-#   model_class: 'ddpg'
-#   n_sampled_goal: 4
-#   goal_selection_strategy: 'future'
-#   buffer_size: 1000000
-#   batch_size: 256
-#   gamma: 0.95
-#   random_exploration: 0.3
-#   actor_lr: !!float 1e-3
-#   critic_lr: !!float 1e-3
-#   noise_type: 'normal'
-#   noise_std: 0.2
-#   normalize_observations: true
-#   normalize_returns: false
-#   policy_kwargs: "dict(layers=[256, 256, 256])"
-#   online_sampling: True
 
-# NOTE: shoube be run with 8 workers: mpirun -n 8
-# FetchPush-v1:
-#   n_timesteps: !!float 2e6
-#   policy: 'MlpPolicy'
-#   model_class: 'ddpg'
-#   n_sampled_goal: 4
-#   goal_selection_strategy: 'future'
-#   buffer_size: 200000
-#   batch_size: 256
-#   gamma: 0.95
-#   random_exploration: 0.3
-#   actor_lr: !!float 1e-3
-#   critic_lr: !!float 1e-3
-#   noise_type: 'normal'
-#   noise_std: 0.2
-#   normalize_observations: true
-#   normalize_returns: false
-#   policy_kwargs: "dict(layers=[16, 16, 16])"
+FetchSlide-v1:
+  env_wrapper:
+    - sb3_contrib.common.wrappers.TimeFeatureWrapper
+  n_timesteps: !!float 1e6
+  policy: 'MlpPolicy'
+  model_class: 'tqc'
+  n_sampled_goal: 4
+  goal_selection_strategy: 'future'
+  buffer_size: 1000000
+  batch_size: 2048
+  gamma: 0.95
+  learning_rate: !!float 1e-3
+  tau: 0.05
+  # ent_coef: 0.01
+  policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])"
+  online_sampling: True
 
 FetchPush-v1:
   env_wrapper:
-    - utils.wrappers.HistoryWrapperObsDict:
-        horizon: 2
-    # - utils.wrappers.TimeFeatureObsDictWrapper
-  n_timesteps: !!float 3e6
+    - sb3_contrib.common.wrappers.TimeFeatureWrapper
+  n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
-  model_class: 'sac'
+  model_class: 'tqc'
   n_sampled_goal: 4
   goal_selection_strategy: 'future'
   buffer_size: 1000000
-  ent_coef: 'auto'
+  batch_size: 2048
   gamma: 0.95
-  learning_rate: !!float 7e-4
-  use_sde: True
-  gradient_steps: -1
-  train_freq: -1
-  n_episodes_rollout: 1
-  sde_sample_freq: 10
-  # noise_type: 'normal'
-  # noise_std: 0.2
-  learning_starts: 1000
+  learning_rate: !!float 1e-3
+  tau: 0.05
+  # ent_coef: 0.01
+  policy_kwargs: "dict(n_critics=2, net_arch=[256, 256, 256])"
   online_sampling: True
-  normalize: True
 
 FetchPickAndPlace-v1:
-  n_timesteps: !!float 4e6
+  env_wrapper:
+    - sb3_contrib.common.wrappers.TimeFeatureWrapper
+    # - utils.wrappers.DoneOnSuccessWrapper:
+    #     reward_offset: 0
+    #     n_successes: 4
+    # - stable_baselines3.common.monitor.Monitor
+  n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
-  model_class: 'sac'
+  model_class: 'tqc'
   n_sampled_goal: 4
   goal_selection_strategy: 'future'
   buffer_size: 1000000
-  ent_coef: 'auto'
-  # batch_size: 256
+  batch_size: 1024
   gamma: 0.95
-  # learning_rate: !!float 1e-3
-  learning_starts: 1000
-  train_freq: 1
+  learning_rate: !!float 1e-3
+  tau: 0.05
+  policy_kwargs: "dict(n_critics=2, net_arch=[512, 512, 512])"
   online_sampling: True
 
 # SAC hyperparams
@@ -202,17 +173,3 @@ FetchReach-v1:
   learning_starts: 1000
   online_sampling: True
   normalize: True
-
-
-# TD3 hyperparams
-# FetchReach-v1:
-#   n_timesteps: !!float 25000
-#   policy: 'MlpPolicy'
-#   model_class: 'td3'
-#   n_sampled_goal: 4
-#   goal_selection_strategy: 'future'
-#   buffer_size: 1000000
-#   batch_size: 256
-#   gamma: 0.95
-#   learning_rate: 0.001
-#   learning_starts: 1000
diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml
index bc78bf7b9..c85a9825e 100644
--- a/hyperparams/ppo.yml
+++ b/hyperparams/ppo.yml
@@ -133,7 +133,7 @@ LunarLanderContinuous-v2:
 
 # Tuned
 HalfCheetahBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 16
   n_timesteps: !!float 2e6
@@ -158,7 +158,7 @@ HalfCheetahBulletEnv-v0:
 
 # Tuned
 AntBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 16
   n_timesteps: !!float 2e6
@@ -183,7 +183,7 @@ AntBulletEnv-v0:
 
 # Tuned
 Walker2DBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 16
   n_timesteps: !!float 2e6
@@ -208,7 +208,7 @@ Walker2DBulletEnv-v0:
 
 # Tuned
 HopperBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 16
   n_timesteps: !!float 2e6
@@ -233,7 +233,7 @@ HopperBulletEnv-v0:
 
 # Tuned
 ReacherBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   normalize: true
   n_envs: 8
   n_timesteps: !!float 1e6
diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml
index 5e304cc7b..15b86c803 100644
--- a/hyperparams/sac.yml
+++ b/hyperparams/sac.yml
@@ -3,7 +3,7 @@ NeckEnvRelative-v2:
   env_wrapper:
     - utils.wrappers.HistoryWrapper:
         horizon: 2
-    - utils.wrappers.TimeFeatureWrapper:
+    - sb3_contrib.common.wrappers.TimeFeatureWrapper:
         test_mode: False
     # - utils.wrappers.LowPassFilterWrapper:
     #     freq: 2.0
@@ -117,12 +117,12 @@ BipedalWalkerHardcore-v3:
 # Tuned
 HalfCheetahBulletEnv-v0:
   # env_wrapper:
-  #   - utils.wrappers.TimeFeatureWrapper
+  #   - sb3_contrib.common.wrappers.TimeFeatureWrapper
   #   - utils.wrappers.DelayedRewardWrapper:
   #       delay: 10
   #   - utils.wrappers.HistoryWrapper:
   #       horizon: 10
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -139,7 +139,7 @@ HalfCheetahBulletEnv-v0:
 
 # Tuned
 AntBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -155,7 +155,7 @@ AntBulletEnv-v0:
   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"
 
 HopperBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: lin_7.3e-4
@@ -171,7 +171,7 @@ HopperBulletEnv-v0:
   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"
 
 Walker2DBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: lin_7.3e-4
@@ -189,7 +189,7 @@ Walker2DBulletEnv-v0:
 
 # Tuned
 ReacherBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -218,7 +218,7 @@ HumanoidBulletEnv-v0:
 
 # Tuned
 InvertedDoublePendulumBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 5e5
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -235,7 +235,7 @@ InvertedDoublePendulumBulletEnv-v0:
 
 # Tuned
 InvertedPendulumSwingupBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -313,7 +313,7 @@ donkey-generated-track-v0:
         max_episode_steps: 500
     - utils.wrappers.HistoryWrapper:
         horizon: 5
-    - utils.wrappers.TimeFeatureWrapper
+    - sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml
index 6e42ee5c8..9b5931668 100644
--- a/hyperparams/td3.yml
+++ b/hyperparams/td3.yml
@@ -60,7 +60,7 @@ BipedalWalkerHardcore-v3:
 
 # Tuned
 HalfCheetahBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -74,7 +74,7 @@ HalfCheetahBulletEnv-v0:
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 AntBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -88,7 +88,7 @@ AntBulletEnv-v0:
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 HopperBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -102,7 +102,7 @@ HopperBulletEnv-v0:
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 Walker2DBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -118,7 +118,7 @@ Walker2DBulletEnv-v0:
 
 # TO BE tested
 HumanoidBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 2e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -133,7 +133,7 @@ HumanoidBulletEnv-v0:
 
 # Tuned
 ReacherBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -148,7 +148,7 @@ ReacherBulletEnv-v0:
 
 # Tuned
 InvertedDoublePendulumBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -163,7 +163,7 @@ InvertedDoublePendulumBulletEnv-v0:
 
 # Tuned
 InvertedPendulumSwingupBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   gamma: 0.98
@@ -177,7 +177,7 @@ InvertedPendulumSwingupBulletEnv-v0:
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 MinitaurBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   gamma: 0.99
diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml
index d2ff5df85..fe64b2bc6 100644
--- a/hyperparams/tqc.yml
+++ b/hyperparams/tqc.yml
@@ -19,9 +19,9 @@ Pendulum-v0:
   policy: 'MlpPolicy'
   learning_rate: !!float 1e-3
   use_sde: True
-  n_episodes_rollout: 1
-  gradient_steps: -1
-  train_freq: -1
+  n_episodes_rollout: -1
+  gradient_steps: 64
+  train_freq: 64
   policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"
 
 LunarLanderContinuous-v2:
@@ -66,7 +66,7 @@ BipedalWalkerHardcore-v3:
 
 # Tuned
 HalfCheetahBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -83,7 +83,7 @@ HalfCheetahBulletEnv-v0:
 
 # Tuned
 AntBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -100,7 +100,7 @@ AntBulletEnv-v0:
 
 # Tuned
 HopperBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: lin_7.3e-4
@@ -118,7 +118,7 @@ HopperBulletEnv-v0:
 
 # Tuned
 Walker2DBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: lin_7.3e-4
@@ -135,7 +135,7 @@ Walker2DBulletEnv-v0:
 
 
 ReacherBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -153,7 +153,7 @@ ReacherBulletEnv-v0:
 
 # Almost tuned
 HumanoidBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e7
   policy: 'MlpPolicy'
   learning_rate: lin_7.3e-4
@@ -170,7 +170,7 @@ HumanoidBulletEnv-v0:
   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"
 
 InvertedDoublePendulumBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 5e5
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -186,7 +186,7 @@ InvertedDoublePendulumBulletEnv-v0:
   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"
 
 InvertedPendulumSwingupBulletEnv-v0:
-  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 3e5
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
diff --git a/requirements.txt b/requirements.txt
index 4ff48dab2..9f6588142 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-stable-baselines3[extra,tests,docs]>=0.10.0
+stable-baselines3[extra,tests,docs]>=0.11.0a2
 box2d-py==2.3.8
 pybullet
 gym-minigrid
@@ -7,4 +7,4 @@ optuna
 pytablewriter
 seaborn
 pyyaml>=5.1
-sb3-contrib>=0.10.0
+sb3-contrib>=0.11.0a3
diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh
index 265b27dfa..d4a21d19f 100755
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@@ -3,7 +3,7 @@
 PARENT=stablebaselines/stable-baselines3
 
 TAG=stablebaselines/rl-baselines3-zoo
-VERSION=0.10.0
+VERSION=0.11.0
 
 if [[ ${USE_GPU} == "True" ]]; then
   PARENT="${PARENT}:${VERSION}"
diff --git a/tests/test_hyperparams_opt.py b/tests/test_hyperparams_opt.py
index 9ebf4f94f..6c5f0b016 100644
--- a/tests/test_hyperparams_opt.py
+++ b/tests/test_hyperparams_opt.py
@@ -26,6 +26,8 @@ def _assert_eq(left, right):
 experiments["td3-Pendulum-v0"] = ("td3", "Pendulum-v0")
 # Test for HER
 experiments["her-parking-v0"] = ("her", "parking-v0")
+# Test for TQC
+experiments["tqc-Pendulum-v0"] = ("tqc", "Pendulum-v0")
 
 
 @pytest.mark.parametrize("sampler", ["random", "tpe"])
diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py
index c1e32868d..ce7984bab 100644
--- a/utils/hyperparams_opt.py
+++ b/utils/hyperparams_opt.py
@@ -2,7 +2,8 @@
 
 import numpy as np
 import optuna
-from stable_baselines3 import DDPG, SAC, TD3
+from sb3_contrib import TQC
+from stable_baselines3 import DDPG, DQN, SAC, TD3
 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
 from torch import nn as nn
 
@@ -11,7 +12,7 @@
 
 def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
     """
-    Sampler for PPO2 hyperparams.
+    Sampler for PPO hyperparams.
 
     :param trial:
     :return:
@@ -21,6 +22,7 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
     gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
     learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
     lr_schedule = "constant"
+    # Uncomment to enable learning rate schedule
     # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
     ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
     clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
@@ -29,8 +31,11 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
     max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
     vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
     net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
-    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
-    sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
+    # Uncomment for gSDE (continuous actions)
+    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
+    # Uncomment for gSDE (continuous actions)
+    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
+    # Orthogonal initialization
     ortho_init = False
     # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
     # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
@@ -43,6 +48,8 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
     if lr_schedule == "linear":
         learning_rate = linear_schedule(learning_rate)
 
+    # Independent networks usually work best
+    # when not working with images
     net_arch = {
         "small": [dict(pi=[64, 64], vf=[64, 64])],
         "medium": [dict(pi=[256, 256], vf=[256, 256])],
@@ -61,9 +68,9 @@ def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
         "gae_lambda": gae_lambda,
         "max_grad_norm": max_grad_norm,
         "vf_coef": vf_coef,
-        "sde_sample_freq": sde_sample_freq,
+        # "sde_sample_freq": sde_sample_freq,
         "policy_kwargs": dict(
-            log_std_init=log_std_init,
+            # log_std_init=log_std_init,
             net_arch=net_arch,
             activation_fn=activation_fn,
             ortho_init=ortho_init,
@@ -81,6 +88,7 @@ def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
     gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
     normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
     max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+    # Toggle PyTorch RMSprop (differs from the TensorFlow one, cf. doc)
     use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
     gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
     n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
@@ -88,7 +96,8 @@ def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
     learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
     ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
     vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
-    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
+    # Uncomment for gSDE (continuous actions)
+    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
     ortho_init = trial.suggest_categorical("ortho_init", [False, True])
     net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
     # sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
@@ -123,7 +132,7 @@ def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
         "use_rms_prop": use_rms_prop,
         "vf_coef": vf_coef,
         "policy_kwargs": dict(
-            log_std_init=log_std_init,
+            # log_std_init=log_std_init,
             net_arch=net_arch,
             # full_std=full_std,
             activation_fn=activation_fn,
@@ -142,19 +151,21 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]:
     """
     gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
     learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
-    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512])
+    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048])
     buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
     learning_starts = trial.suggest_categorical("learning_starts", [0, 1000, 10000, 20000])
     # train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 300])
     train_freq = trial.suggest_categorical("train_freq", [8, 16, 32, 64, 128, 256, 512])
     # Polyak coeff
-    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02])
+    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05])
     # gradient_steps takes too much time
     # gradient_steps = trial.suggest_categorical('gradient_steps', [1, 100, 300])
     gradient_steps = train_freq
     # ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.5, 0.1, 0.05, 0.01, 0.0001])
     ent_coef = "auto"
+    # You can comment this out when not using gSDE
     log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
+    # NOTE: Add "verybig" to net_arch when tuning HER
     net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
     # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
 
@@ -162,6 +173,8 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]:
         "small": [64, 64],
         "medium": [256, 256],
         "big": [400, 300],
+        # Uncomment for tuning HER
+        # "verybig": [256, 256, 256],
     }[net_arch]
 
     target_entropy = "auto"
@@ -193,7 +206,7 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
     """
     gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
     learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
-    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512])
+    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
     buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
 
     episodic = trial.suggest_categorical("episodic", [True, False])
@@ -209,6 +222,7 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
     noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
     noise_std = trial.suggest_uniform("noise_std", 0, 1)
 
+    # NOTE: Add "verybig" to net_arch when tuning HER
     net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
     # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
 
@@ -216,6 +230,8 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
         "small": [64, 64],
         "medium": [256, 256],
         "big": [400, 300],
+        # Uncomment for tuning HER
+        # "verybig": [256, 256, 256],
     }[net_arch]
 
     hyperparams = {
@@ -250,7 +266,7 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
     """
     gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
     learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
-    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512])
+    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
     buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
     # Polyak coeff
     tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02])
@@ -268,6 +284,7 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
     noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
     noise_std = trial.suggest_uniform("noise_std", 0, 1)
 
+    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
     net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
     # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
 
@@ -354,7 +371,9 @@ def sample_her_params(trial: optuna.Trial) -> Dict[str, Any]:
     model_class_str = {
         SAC: "sac",
         DDPG: "ddpg",
+        DQN: "dqn",
         TD3: "td3",
+        TQC: "tqc",
     }[trial.model_class]
 
     hyperparams = HYPERPARAMS_SAMPLER[model_class_str](trial)
@@ -368,12 +387,32 @@ def sample_her_params(trial: optuna.Trial) -> Dict[str, Any]:
     return hyperparams
 
 
+def sample_tqc_params(trial: optuna.Trial) -> Dict[str, Any]:
+    """
+    Sampler for TQC hyperparams.
+
+    :param trial: Optuna trial used to sample the SAC and the TQC-specific values
+    :return: SAC hyperparams plus ``n_quantiles`` (in ``policy_kwargs``) and ``top_quantiles_to_drop_per_net``
+    """
+    # TQC is SAC + Distributional RL
+    hyperparams = sample_sac_params(trial)
+
+    n_quantiles = trial.suggest_int("n_quantiles", 5, 50)
+    top_quantiles_to_drop_per_net = trial.suggest_int("top_quantiles_to_drop_per_net", 0, n_quantiles - 1)
+
+    hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles})
+    hyperparams["top_quantiles_to_drop_per_net"] = top_quantiles_to_drop_per_net
+
+    return hyperparams
+
+
 HYPERPARAMS_SAMPLER = {
     "a2c": sample_a2c_params,
     "ddpg": sample_ddpg_params,
     "dqn": sample_dqn_params,
     "her": sample_her_params,
     "sac": sample_sac_params,
+    "tqc": sample_tqc_params,
     "ppo": sample_ppo_params,
     "td3": sample_td3_params,
 }
diff --git a/utils/utils.py b/utils/utils.py
index 6f633f77f..0fb5c3ce6 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -8,17 +8,13 @@
 import stable_baselines3 as sb3  # noqa: F401
 import torch as th  # noqa: F401
 import yaml
+from sb3_contrib import TQC
 from stable_baselines3 import A2C, DDPG, DQN, HER, PPO, SAC, TD3
 from stable_baselines3.common.callbacks import BaseCallback
 from stable_baselines3.common.env_util import make_vec_env
 from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike  # noqa: F401
 from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv, VecFrameStack, VecNormalize
 
-try:
-    from sb3_contrib import TQC  # pytype: disable=import-error
-except ImportError:
-    TQC = None
-
 # For custom activation fn
 from torch import nn as nn  # noqa: F401 pylint: disable=unused-import
 
@@ -30,11 +26,10 @@
     "her": HER,
     "sac": SAC,
     "td3": TD3,
+    # SB3 Contrib
+    "tqc": TQC,
 }
 
-if TQC is not None:
-    ALGOS["tqc"] = TQC
-
 
 def flatten_dict_observations(env: gym.Env) -> gym.Env:
     assert isinstance(env.observation_space, gym.spaces.Dict)
diff --git a/utils/wrappers.py b/utils/wrappers.py
index 32480ddd0..4b3aa77eb 100644
--- a/utils/wrappers.py
+++ b/utils/wrappers.py
@@ -1,6 +1,7 @@
 import gym
 import numpy as np
 from matplotlib import pyplot as plt
+from sb3_contrib.common.wrappers import TimeFeatureWrapper  # noqa: F401 (backward compatibility)
 from scipy.signal import iirfilter, sosfilt, zpk2sos
 
 
@@ -10,7 +11,7 @@ class DoneOnSuccessWrapper(gym.Wrapper):
     Useful for GoalEnv.
     """
 
-    def __init__(self, env: gym.Env, reward_offset: float = 1.0, n_successes: int = 1):
+    def __init__(self, env: gym.Env, reward_offset: float = 0.0, n_successes: int = 1):
         super(DoneOnSuccessWrapper, self).__init__(env)
         self.reward_offset = reward_offset
         self.n_successes = n_successes
@@ -36,121 +37,6 @@ def compute_reward(self, achieved_goal, desired_goal, info):
         return reward + self.reward_offset
 
 
-class TimeFeatureWrapper(gym.Wrapper):
-    """
-    Add remaining time to observation space for fixed length episodes.
-    See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13.
-
-    :param env: (gym.Env)
-    :param max_steps: (int) Max number of steps of an episode
-        if it is not wrapped in a TimeLimit object.
-    :param test_mode: (bool) In test mode, the time feature is constant,
-        equal to zero. This allow to check that the agent did not overfit this feature,
-        learning a deterministic pre-defined sequence of actions.
-    """
-
-    def __init__(self, env: gym.Env, max_steps: int = 1000, test_mode: bool = False):
-        assert isinstance(env.observation_space, gym.spaces.Box)
-        # Add a time feature to the observation
-        low, high = env.observation_space.low, env.observation_space.high
-        low, high = np.concatenate((low, [0])), np.concatenate((high, [1.0]))
-        env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
-
-        super(TimeFeatureWrapper, self).__init__(env)
-
-        try:
-            self._max_steps = env.spec.max_episode_steps
-        except AttributeError:
-            self._max_steps = None
-
-        if self._max_steps is None:
-            self._max_steps = max_steps
-
-        self._current_step = 0
-        self._test_mode = test_mode
-
-    def reset(self):
-        self._current_step = 0
-        return self._get_obs(self.env.reset())
-
-    def step(self, action):
-        self._current_step += 1
-        obs, reward, done, info = self.env.step(action)
-        return self._get_obs(obs), reward, done, info
-
-    def _get_obs(self, obs):
-        """
-        Concatenate the time feature to the current observation.
-
-        :param obs: (np.ndarray)
-        :return: (np.ndarray)
-        """
-        # Remaining time is more general
-        time_feature = 1 - (self._current_step / self._max_steps)
-        if self._test_mode:
-            time_feature = 1.0
-        # Optionnaly: concatenate [time_feature, time_feature ** 2]
-        return np.concatenate((obs, [time_feature]))
-
-
-class TimeFeatureObsDictWrapper(gym.Wrapper):
-    """
-    Add remaining time to observation space for fixed length episodes.
-    See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13.
-
-    :param env: (gym.Env)
-    :param max_steps: (int) Max number of steps of an episode
-        if it is not wrapped in a TimeLimit object.
-    :param test_mode: (bool) In test mode, the time feature is constant,
-        equal to zero. This allow to check that the agent did not overfit this feature,
-        learning a deterministic pre-defined sequence of actions.
-    """
-
-    def __init__(self, env: gym.Env, max_steps: int = 1000, test_mode: bool = False):
-        assert isinstance(env.observation_space, gym.spaces.Dict)
-        # Add a time feature to the observation
-        obs_space = env.observation_space.spaces["observation"]
-        low, high = obs_space.low, obs_space.high
-        low, high = np.concatenate((low, [0])), np.concatenate((high, [1.0]))
-        env.observation_space.spaces["observation"] = gym.spaces.Box(low=low, high=high, dtype=np.float32)
-
-        super(TimeFeatureObsDictWrapper, self).__init__(env)
-
-        try:
-            self._max_steps = env.spec.max_episode_steps
-        except AttributeError:
-            self._max_steps = None
-
-        if self._max_steps is None:
-            self._max_steps = max_steps
-
-        self._current_step = 0
-        self._test_mode = test_mode
-
-    def reset(self):
-        self._current_step = 0
-        return self._get_obs(self.env.reset())
-
-    def step(self, action):
-        self._current_step += 1
-        obs, reward, done, info = self.env.step(action)
-        return self._get_obs(obs), reward, done, info
-
-    def _get_obs(self, obs):
-        """
-        Concatenate the time feature to the current observation.
-
-        :param obs: (np.ndarray)
-        :return: (np.ndarray)
-        """
-        # Remaining time is more general
-        time_feature = 1 - (self._current_step / self._max_steps)
-        if self._test_mode:
-            time_feature = 1.0
-        obs["observation"] = np.concatenate((obs["observation"], [time_feature]))
-        return obs
-
-
 class ActionNoiseWrapper(gym.Wrapper):
     """
     Add gaussian noise to the action (without telling the agent),
diff --git a/version.txt b/version.txt
index d22e31d20..a09c7eb7a 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.11.0a0
+0.11.0a2