From e41e415e54eea06ec44971270d3e9c2bcafcbc4d Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 10 Jun 2020 18:00:45 +0200 Subject: [PATCH 1/4] Fix variable being passed with gradients --- stable_baselines3/common/distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/common/distributions.py b/stable_baselines3/common/distributions.py index f9bb16c9c..951f1633f 100644 --- a/stable_baselines3/common/distributions.py +++ b/stable_baselines3/common/distributions.py @@ -525,7 +525,7 @@ def proba_distribution(self, mean_actions: th.Tensor, """ # Stop gradient if we don't want to influence the features self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() - variance = th.mm(latent_sde ** 2, self.get_std(log_std) ** 2) + variance = th.mm(self._latent_sde ** 2, self.get_std(log_std) ** 2) self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon)) return self From c96005fbaf0313dd9fc6b250ec5817e879b4defb Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 10 Jun 2020 18:10:27 +0200 Subject: [PATCH 2/4] Update changelog --- docs/misc/changelog.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index c0e19c44d..f942a1ae6 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -11,15 +11,15 @@ Breaking Changes: ^^^^^^^^^^^^^^^^^ - ``render()`` method of ``VecEnvs`` now only accept one argument: ``mode`` - Created new file common/torch_layers.py, similar to SB refactoring - + - Contains all PyTorch network layer definitions and feature extractors: ``MlpExtractor``, ``create_mlp``, ``NatureCNN`` - Renamed ``BaseRLModel`` to ``BaseAlgorithm`` (along with offpolicy and onpolicy variants) - Moved on-policy and off-policy base algorithms to ``common/on_policy_algorithm.py`` and ``common/off_policy_algorithm.py``, respectively. 
-- Moved ``PPOPolicy`` to ``ActorCriticPolicy`` in common/policies.py +- Moved ``PPOPolicy`` to ``ActorCriticPolicy`` in common/policies.py - Moved ``PPO`` (algorithm class) into ``OnPolicyAlgorithm`` (``common/on_policy_algorithm.py``), to be shared with A2C -- Moved following functions from ``BaseAlgorithm``: - +- Moved following functions from ``BaseAlgorithm``: + - ``_load_from_file`` to ``load_from_zip_file`` (save_util.py) - ``_save_to_file_zip`` to ``save_to_zip_file`` (save_util.py) - ``safe_mean`` to ``safe_mean`` (utils.py) @@ -28,7 +28,7 @@ Breaking Changes: - Moved static function ``_is_vectorized_observation`` from common/policies.py to common/utils.py under name ``is_vectorized_observation``. - Removed ``{save,load}_running_average`` functions of ``VecNormalize`` in favor of ``load/save``. - Removed ``use_gae`` parameter from ``RolloutBuffer.compute_returns_and_advantage``. - + New Features: ^^^^^^^^^^^^^ @@ -38,6 +38,7 @@ Bug Fixes: - Fixed ``seed()`` method for ``SubprocVecEnv`` - Fixed loading on GPU for testing when using gSDE and ``deterministic=False`` - Fixed ``register_policy`` to allow re-registering same policy for same sub-class (i.e. assign same value to same key). 
+- Fixed a bug where the gradient was still passed to the features when using ``gSDE`` with ``PPO``/``A2C`` and ``learn_features=False``; this does not affect ``SAC`` Deprecations: ^^^^^^^^^^^^^ @@ -67,7 +68,7 @@ Breaking Changes: ^^^^^^^^^^^^^^^^^ - Remove State-Dependent Exploration (SDE) support for ``TD3`` - Methods were renamed in the logger: - + - ``logkv`` -> ``record``, ``writekvs`` -> ``write``, ``writeseq`` -> ``write_sequence``, - ``logkvs`` -> ``record_dict``, ``dumpkvs`` -> ``dump``, - ``getkvs`` -> ``get_log_dict``, ``logkv_mean`` -> ``record_mean``, From 5a5d42c20791325b587874f2c3487fa29c5e5d12 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 10 Jun 2020 18:14:22 +0200 Subject: [PATCH 3/4] Bump version --- docs/misc/changelog.rst | 4 +++- stable_baselines3/version.txt | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index f942a1ae6..4f3db8a00 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -4,9 +4,11 @@ Changelog ========== -Pre-Release 0.7.0a1 (WIP) +Pre-Release 0.7.0 (2020-06-10) ------------------------------ +**Hotfix for PPO/A2C + gSDE, internal refactoring and bug fixes** + Breaking Changes: ^^^^^^^^^^^^^^^^^ - ``render()`` method of ``VecEnvs`` now only accept one argument: ``mode`` diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt index cde2c3fbb..faef31a43 100644 --- a/stable_baselines3/version.txt +++ b/stable_baselines3/version.txt @@ -1 +1 @@ -0.7.0a1 +0.7.0 From 3bdfdb3a7c1aeedf71bd2e3c00ec16a769911b45 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 10 Jun 2020 18:40:17 +0200 Subject: [PATCH 4/4] Fixes #54 --- tests/test_vec_normalize.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 076596737..124c5e535 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -1,5 +1,3 @@ -import os - import gym import pytest import numpy as np @@ -138,7 
+136,7 @@ def test_sync_vec_normalize(): assert unwrap_vec_normalize(env) is None - env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) + env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100., clip_reward=100.) assert isinstance(unwrap_vec_normalize(env), VecNormalize) @@ -147,9 +145,13 @@ def test_sync_vec_normalize(): assert isinstance(unwrap_vec_normalize(env), VecNormalize) eval_env = DummyVecEnv([make_env]) - eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) + eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, + clip_obs=100., clip_reward=100.) eval_env = VecFrameStack(eval_env, 1) + env.seed(0) + env.action_space.seed(0) + env.reset() # Initialize running mean latest_reward = None