From 7bafdb3a675f4d6c033d6453454969ca5ef2f4c0 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 Feb 2020 11:34:29 +0100 Subject: [PATCH 1/9] Add `get_vec_normalize_env()` --- docs/misc/changelog.rst | 1 + tests/test_vec_normalize.py | 2 ++ torchy_baselines/common/base_class.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 30f5a60d0..0bb3910de 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -19,6 +19,7 @@ New Features: - Add support for Callback (cf https://github.com/hill-a/stable-baselines/pull/644) - Add methods for saving and loading replay buffer - Add `extend()` method to the buffers +- Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists Bug Fixes: ^^^^^^^^^^ diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index ab0f7f154..3c21f69aa 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -123,6 +123,8 @@ def test_offpolicy_normalization(model_class): model = model_class('MlpPolicy', env, verbose=1) model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500) + # Check getter + assert isinstance(model.get_vec_normalize_env(), VecNormalize) def test_sync_vec_normalize(): diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 68c9025d3..d4e681028 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -14,7 +14,7 @@ from torchy_baselines.common import logger from torchy_baselines.common.policies import BasePolicy, get_policy_from_name from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate -from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize +from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback @@ -212,10 +212,18 @@ def get_env(self) -> Optional[VecEnv]: """ Returns the current environment (can be None if not defined). - :return: The current environment + :return: (Optional[VecEnv]) The current environment """ return self.env + def get_vec_normalize_env(self) -> Optional[VecNormalize]: + """ + Return the `VecNormalize` wrapper of the training env + if it exists. + :return: Optional[VecNormalize] The `VecNormalize` env. 
+ """ + return self._vec_normalize_env + @staticmethod def check_env(env, observation_space: gym.spaces.Space, action_space: gym.spaces.Space) -> bool: """ From 9caea35a11289e456ae2fe79cae135cfc00c7ab1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 Feb 2020 14:31:15 +0100 Subject: [PATCH 2/9] Add results plotter --- .coveragerc | 2 + docs/misc/changelog.rst | 1 + torchy_baselines/common/results_plotter.py | 126 +++++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 torchy_baselines/common/results_plotter.py diff --git a/.coveragerc b/.coveragerc index a8fc2af79..511f20d8b 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,6 +3,8 @@ branch = False omit = tests/* setup.py + # Require graphical interface + torchy_baselines/common/results_plotter.py [report] exclude_lines = diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 0bb3910de..b083e3983 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -20,6 +20,7 @@ New Features: - Add methods for saving and loading replay buffer - Add `extend()` method to the buffers - Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists +- Add `¶results_plotter` from Stable Baselines Bug Fixes: ^^^^^^^^^^ diff --git a/torchy_baselines/common/results_plotter.py b/torchy_baselines/common/results_plotter.py new file mode 100644 index 000000000..a5464dadb --- /dev/null +++ b/torchy_baselines/common/results_plotter.py @@ -0,0 +1,126 @@ +from typing import Tuple, Callable, List, Optional + +import numpy as np +import pandas as pd +import matplotlib +import matplotlib.pyplot as plt + +from torchy_baselines.common.monitor import load_results + +# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode +plt.rcParams['svg.fonttype'] = 'none' + +X_TIMESTEPS = 'timesteps' +X_EPISODES = 'episodes' +X_WALLTIME = 'walltime_hrs' +POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] +EPISODES_WINDOW = 100 +COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', + 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', + 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] + + +def rolling_window(array: np.ndarray, window: int) -> np.ndarray: + """ + Apply a rolling window to a np.ndarray + + :param array: (np.ndarray) the input Array + :param window: (int) length of the rolling window + :return: (np.ndarray) rolling window on the input array + """ + shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) + strides = array.strides + (array.strides[-1],) + return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) + + +def window_func(var_1: np.ndarray, var_2: np.ndarray, + window: int, func: Callable) -> Tuple[np.ndarray, np.ndarray]: + """ + Apply a function to the rolling window of 2 arrays + + :param var_1: (np.ndarray) variable 1 + :param var_2: (np.ndarray) variable 2 + :param window: (int) length of the rolling window + :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) + :return: (Tuple[np.ndarray, np.ndarray]) the rolling output with applied function + """ + var_2_window = rolling_window(var_2, window) + function_on_var2 = func(var_2_window, axis=-1) + return var_1[window - 1:], function_on_var2 + + +def ts2xy(timesteps: pd.DataFrame, x_axis: str) -> Tuple[np.ndarray, np.ndarray]: + """ + Decompose a timesteps variable to x ans ys + + :param timesteps: (pd.DataFrame) the input data + 
:param x_axis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :return: (Tuple[np.ndarray, np.ndarray]) the x and y output + """ + if x_axis == X_TIMESTEPS: + x_var = np.cumsum(timesteps.l.values) + y_var = timesteps.r.values + elif x_axis == X_EPISODES: + x_var = np.arange(len(timesteps)) + y_var = timesteps.r.values + elif x_axis == X_WALLTIME: + # Convert to hours + x_var = timesteps.t.values / 3600. + y_var = timesteps.r.values + else: + raise NotImplementedError + return x_var, y_var + + +def plot_curves(xy_list: List[Tuple[np.ndarray, np.ndarray]], + x_axis: str, title: str, figsize: Tuple[int, int] = (8, 2)) -> None: + """ + plot the curves + + :param xy_list: (List[Tuple[np.ndarray, np.ndarray]]) the x and y coordinates to plot + :param x_axis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param title: (str) the title of the plot + :param figsize: (Tuple[int, int]) Size of the figure (width, height) + """ + + plt.figure(figsize=figsize) + max_x = max(xy[0][-1] for xy in xy_list) + min_x = 0 + for (i, (x, y)) in enumerate(xy_list): + color = COLORS[i] + plt.scatter(x, y, s=2) + # Do not plot the smoothed curve at all if the timeseries is shorter than window size. + if x.shape[0] >= EPISODES_WINDOW: + # Compute and plot rolling mean with window of size EPISODE_WINDOW + x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) + plt.plot(x, y_mean, color=color) + plt.xlim(min_x, max_x) + plt.title(title) + plt.xlabel(x_axis) + plt.ylabel("Episode Rewards") + plt.tight_layout() + + +def plot_results(dirs: List[str], num_timesteps: Optional[int], + x_axis: str, task_name: str, figsize: Tuple[int, int] = (8, 2)) -> None: + """ + plot the results + + :param dirs: ([str]) the save location of the results to plot + :param num_timesteps: (int or None) only plot the points below this value + :param x_axis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param task_name: (str) the title of the task to plot + :param figsize: (Tuple[int, int]) Size of the figure (width, height) + """ + + timesteps_list = [] + for folder in dirs: + timesteps = load_results(folder) + if num_timesteps is not None: + timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] + timesteps_list.append(timesteps) + xy_list = [ts2xy(timesteps_item, x_axis) for timesteps_item in timesteps_list] + plot_curves(xy_list, x_axis, task_name, figsize) From f1a4fa2d3fae520e1308929d04f78e6d7b6223cb Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 Feb 2020 15:25:05 +0100 Subject: [PATCH 3/9] Improve predict method --- docs/misc/changelog.rst | 5 +- setup.py | 6 +- tests/test_save_load.py | 6 +- torchy_baselines/__init__.py | 2 +- torchy_baselines/common/base_class.py | 105 +++++++++++++++++++++++--- torchy_baselines/common/policies.py | 27 +++++-- torchy_baselines/ppo/policies.py | 6 +- torchy_baselines/ppo/ppo.py | 22 ------ torchy_baselines/sac/policies.py | 6 +- torchy_baselines/sac/sac.py | 19 ----- torchy_baselines/td3/policies.py | 3 + torchy_baselines/td3/td3.py | 19 ----- 12 files changed, 135 insertions(+), 91 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index b083e3983..02bf897b5 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,7 +3,7 @@ Changelog ========== -Pre-Release 0.2.0a1 (WIP) +Pre-Release 0.2.0a2 (WIP) 
------------------------------ Breaking Changes: @@ -20,7 +20,8 @@ New Features: - Add methods for saving and loading replay buffer - Add `extend()` method to the buffers - Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists -- Add `¶results_plotter` from Stable Baselines +- Add `results_plotter` from Stable Baselines +- Improve `predict()` method to handle different type of observations (single, vectorized, ...) Bug Fixes: ^^^^^^^^^^ diff --git a/setup.py b/setup.py index b9598fc98..92389eaa0 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,9 @@ 'torch>=1.2.0', 'cloudpickle', # For reading logs - 'pandas' + 'pandas', + # Plotting learning curves + 'matplotlib' ], extras_require={ 'tests': [ @@ -45,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.0a1", + version="0.2.0a2", ) # python setup.py sdist diff --git a/tests/test_save_load.py b/tests/test_save_load.py index edd326ace..45c0ac5fc 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -34,7 +34,7 @@ def test_save_load(model_class): env.reset() observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)]) - observations = np.squeeze(observations) + observations = observations.reshape(10, -1) # Get dictionary of current parameters params = deepcopy(model.policy.state_dict()) @@ -53,7 +53,7 @@ def test_save_load(model_class): params = new_params # get selected actions - selected_actions = [model.predict(observation, deterministic=True) for observation in observations] + selected_actions = model.predict(observations, deterministic=True) # Check model.save("test_save.zip") @@ -68,7 +68,7 @@ def test_save_load(model_class): assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load." # check if model still selects the same actions - new_selected_actions = [model.predict(observation, deterministic=True) for observation in observations] + new_selected_actions = model.predict(observations, deterministic=True) assert np.allclose(selected_actions, new_selected_actions, 1e-4) # check if learn still works diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index e250cc26d..5e22a8dd0 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.0a1" +__version__ = "0.2.0a2" diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index d4e681028..2348d0e10 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -306,21 +306,104 @@ def learn(self, total_timesteps: int, """ raise NotImplementedError() - @abstractmethod + @staticmethod + def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool: + """ + For every observation type, detects and validates the shape, + then returns whether or not the observation is vectorized. 
+ + :param observation: (np.ndarray) the input observation to validate + :param observation_space: (gym.spaces) the observation space + :return: (bool) whether the given observation is vectorized or not + """ + if isinstance(observation_space, gym.spaces.Box): + if observation.shape == observation_space.shape: + return False + elif observation.shape[1:] == observation_space.shape: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + + "Box environment, please use {} ".format(observation_space.shape) + + "or (n_env, {}) for the observation shape." + .format(", ".join(map(str, observation_space.shape)))) + elif isinstance(observation_space, gym.spaces.Discrete): + if observation.shape == (): # A numpy array of a number, has shape empty tuple '()' + return False + elif len(observation.shape) == 1: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + + "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") + elif isinstance(observation_space, gym.spaces.MultiDiscrete): + if observation.shape == (len(observation_space.nvec),): + return False + elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): + return True + else: + raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + + "environment, please use ({},) or ".format(len(observation_space.nvec)) + + "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) + elif isinstance(observation_space, gym.spaces.MultiBinary): + if observation.shape == (observation_space.n,): + return False + elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + + "environment, please use ({},) or ".format(observation_space.n) + + "(n_env, {}) for the observation shape.".format(observation_space.n)) + else: + raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." + .format(observation_space)) + def predict(self, observation: np.ndarray, state: Optional[np.ndarray] = None, mask: Optional[np.ndarray] = None, deterministic: bool = False) -> np.ndarray: """ - Get the model's action from an observation + Get the model's action(s) from an observation + + :param observation: (np.ndarray) the input observation + :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies) + :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies) + :param deterministic: (bool) Whether or not to return deterministic actions. 
+ :return: (np.ndarray) the model's action and the next state (used in recurrent policies) + """ + # if state is None: + # state = self.initial_state + # if mask is None: + # mask = [False for _ in range(self.n_envs)] + observation = np.array(observation) + vectorized_env = self._is_vectorized_observation(observation, self.observation_space) + + observation = observation.reshape((-1,) + self.observation_space.shape) + # Convert to float pytorch + # TODO: replace with preprocessing + observation = th.as_tensor(observation).float().to(self.device) + with th.no_grad(): + actions = self.policy.predict(observation, deterministic=deterministic) + # Convert to numpy + actions = actions.cpu().numpy() + + # Rescale to proper domain when using squashing + # TODO: should not be used for a Gaussian distribution? + if isinstance(self.action_space, gym.spaces.Box): + actions = self.unscale_action(actions) + + clipped_actions = actions + # Clip the actions to avoid out of bound error when using gaussian distribution + if isinstance(self.action_space, gym.spaces.Box): + clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) + + if not vectorized_env: + if state is not None: + raise ValueError("Error: The environment must be vectorized when using recurrent policies.") + clipped_actions = clipped_actions[0] + + # TODO: switch to stable baselines API + # return clipped_actions, state + return clipped_actions - :param observation: the input observation - :param state: The last states (can be None, used in recurrent policies) - :param mask: The last masks (can be None, used in recurrent policies) - :param deterministic: Whether or not to return deterministic actions. - :return: the model's action and the next state (used in recurrent policies) - """ - raise NotImplementedError() @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs): @@ -806,7 +889,9 @@ def collect_rollouts(self, # Warmup phase unscaled_action = np.array([self.action_space.sample()]) else: - unscaled_action = self.predict(obs, deterministic=not self.use_sde) + # Note: we assume that the policy uses tanh to scale the action + # We use non-deterministic action in the case of SAC, for TD3, it does not matter + unscaled_action = self.predict(obs, deterministic=False) # Rescale the action from [low, high] to [-1, 1] scaled_action = self.scale_action(unscaled_action) diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index ea7f755ad..1e0064dca 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -1,25 +1,30 @@ +from typing import Union + from itertools import zip_longest +import gym import torch as th import torch.nn as nn +import numpy as np class BasePolicy(nn.Module): """ The base policy object - :param observation_space: (Gym Space) The observation space of the environment - :param action_space: (Gym Space) The action space of the environment + :param observation_space: (gym.spaces.Space) The observation space of the environment + :param action_space: (gym.spaces.Space) The action space of the environment """ - def __init__(self, observation_space, action_space, device='cpu'): + def __init__(self, observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, device: Union[th.device, str] = 'cpu'): super(BasePolicy, self).__init__() self.observation_space = observation_space self.action_space = action_space self.device = device @staticmethod - def init_weights(module, gain=1): + def init_weights(module: 
nn.Module, gain: float = 1): if type(module) == nn.Linear: nn.init.orthogonal_(module.weight, gain=gain) module.bias.data.fill_(0.0) @@ -27,7 +32,13 @@ def init_weights(module, gain=1): def forward(self, *_args, **kwargs): raise NotImplementedError() - def save(self, path): + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + """ + Get the action according to the policy for a given observation. + """ + raise NotImplementedError() + + def save(self, path: str) -> None: """ Save model to a given location. @@ -35,7 +46,7 @@ def save(self, path): """ th.save(self.state_dict(), path) - def load(self, path): + def load(self, path: str) -> None: """ Load saved model from path. @@ -43,7 +54,7 @@ def load(self, path): """ self.load_state_dict(th.load(path)) - def load_from_vector(self, vector): + def load_from_vector(self, vector: np.ndarray): """ Load parameters from a 1D vector. @@ -51,7 +62,7 @@ def load_from_vector(self, vector): """ th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(self.device), self.parameters()) - def parameters_to_vector(self): + def parameters_to_vector(self) -> np.ndarray: """ Convert the parameters to a 1D vector. diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 4421d5045..1e3492660 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -162,10 +162,10 @@ def _get_action_dist_from_latent(self, latent_pi, latent_sde=None, deterministic return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde, deterministic=deterministic) - def actor_forward(self, obs, deterministic=False): - latent_pi, _, latent_sde = self._get_latent(obs) + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + latent_pi, _, latent_sde = self._get_latent(observation) action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) - return action.detach().cpu().numpy() + return action def evaluate_actions(self, obs, action, deterministic=False): """ diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index b17f4f8ce..2ce168b1f 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -129,28 +129,6 @@ def _setup_model(self): if self.clip_range_vf is not None: self.clip_range_vf = get_schedule_fn(self.clip_range_vf) - def select_action(self, observation, deterministic=False): - # Normally not needed - observation = np.array(observation) - with th.no_grad(): - observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.policy.actor_forward(observation, deterministic=deterministic) - - def predict(self, observation, state=None, mask=None, deterministic=False): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. 
- :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - clipped_actions = self.select_action(observation, deterministic=deterministic) - if isinstance(self.action_space, gym.spaces.Box): - clipped_actions = np.clip(clipped_actions, self.action_space.low, self.action_space.high) - return clipped_actions - def collect_rollouts(self, env: VecEnv, callback: BaseCallback, diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 3fe11c50e..3fbea937a 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -129,11 +129,11 @@ def get_action_dist_params(self, obs): def forward(self, obs, deterministic=False): mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) if self.use_sde: - # Note the action is squashed + # Note: the action is squashed action, _ = self.action_dist.proba_distribution(mean_actions, log_std, latent_sde, deterministic=deterministic) else: - # Note the action is squashed + # Note: the action is squashed action, _ = self.action_dist.proba_distribution(mean_actions, log_std, deterministic=deterministic) return action @@ -246,6 +246,8 @@ def make_critic(self): def forward(self, obs): return self.actor(obs) + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + return self.actor.forward(observation, deterministic) MlpPolicy = SACPolicy diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 9617ec78f..f0930ee21 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -148,25 +148,6 @@ def _create_aliases(self): self.critic = self.policy.critic self.critic_target = self.policy.critic_target - def select_action(self, observation): - # Normally not needed - observation = np.array(observation) - with th.no_grad(): - observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.actor(observation).cpu().data.numpy() - - def predict(self, observation, state=None, mask=None, deterministic=True): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. 
- :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - return self.unscale_action(self.select_action(observation)) - def train(self, gradient_steps: int, batch_size: int = 64): # Update optimizers learning rate optimizers = [self.actor.optimizer, self.critic.optimizer] diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index 8bc5f60a4..fa199521e 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -277,6 +277,9 @@ def make_critic(self): def forward(self, obs, deterministic=True): return self.actor(obs, deterministic=deterministic) + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + return self.forward(observation, deterministic) + MlpPolicy = TD3Policy diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 05f1b4fa7..1ba594775 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -114,25 +114,6 @@ def _create_aliases(self): self.critic_target = self.policy.critic_target self.vf_net = self.policy.vf_net - def select_action(self, observation, deterministic=True): - # Normally not needed - observation = np.array(observation) - with th.no_grad(): - observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.actor(observation, deterministic=deterministic).cpu().numpy() - - def predict(self, observation, state=None, mask=None, deterministic=True): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. - :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - return self.unscale_action(self.select_action(observation, deterministic=deterministic)) - def train_critic(self, gradient_steps: int = 1, batch_size: int = 100, replay_data: Optional[ReplayBufferSamples] = None, From aa8b4eb22a8157917f89ba1352186409311028fe Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 13 Feb 2020 13:46:22 +0100 Subject: [PATCH 4/9] Reformat and type the distributions --- torchy_baselines/common/base_class.py | 9 +- torchy_baselines/common/distributions.py | 114 ++++++++++++--------- torchy_baselines/common/results_plotter.py | 4 +- torchy_baselines/ppo/policies.py | 2 + 4 files changed, 75 insertions(+), 54 deletions(-) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 2348d0e10..c0951cbbf 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -225,13 +225,14 @@ def get_vec_normalize_env(self) -> Optional[VecNormalize]: return self._vec_normalize_env @staticmethod - def check_env(env, observation_space: gym.spaces.Space, action_space: gym.spaces.Space) -> bool: + def check_env(env: GymEnv, observation_space: gym.spaces.Space, action_space: gym.spaces.Space) -> bool: """ Checks the validity of the environment and returns if it is consistent. 
Checked parameters: - observation_space - action_space + :param env: (GymEnv) :param observation_space: (gym.spaces.Space) :param action_space: (gym.spaces.Space) :return: (bool) True if environment seems to be coherent @@ -404,7 +405,6 @@ def predict(self, observation: np.ndarray, # return clipped_actions, state return clipped_actions - @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs): """ @@ -774,6 +774,7 @@ class OffPolicyRLModel(BaseRLModel): :param use_sde_at_warmup: (bool) Whether to use SDE instead of uniform sampling during the warm up phase (before learning starts) """ + def __init__(self, policy: Type[BasePolicy], env: Union[GymEnv, str], @@ -790,8 +791,8 @@ def __init__(self, use_sde_at_warmup: bool = False): super(OffPolicyRLModel, self).__init__(policy, env, policy_base, policy_kwargs, verbose, - device, support_multi_env, create_eval_env, monitor_wrapper, - seed, use_sde, sde_sample_freq) + device, support_multi_env, create_eval_env, monitor_wrapper, + seed, use_sde, sde_sample_freq) # For SDE only self.rollout_data = None self.on_policy_exploration = False diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 451535c35..a7f219ac8 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -1,5 +1,6 @@ -from typing import Optional +from typing import Optional, Tuple, Dict, Any +import gym import torch as th import torch.nn as nn from torch.distributions import Normal, Categorical @@ -45,14 +46,15 @@ class DiagGaussianDistribution(Distribution): :param action_dim: (int) Number of continuous actions """ - def __init__(self, action_dim): + def __init__(self, action_dim: int): super(DiagGaussianDistribution, self).__init__() self.distribution = None self.action_dim = action_dim self.mean_actions = None self.log_std = None - def proba_distribution_net(self, latent_dim, log_std_init=0.0): + def proba_distribution_net(self, latent_dim: int, + log_std_init: float = 0.0) -> Tuple[nn.Module, nn.Parameter]: """ Create the layers and parameter that represent the distribution: one output will be the mean of the gaussian, the other parameter will be the @@ -64,10 +66,12 @@ def proba_distribution_net(self, latent_dim, log_std_init=0.0): """ mean_actions = nn.Linear(latent_dim, self.action_dim) # TODO: allow action dependent std - log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init) + log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init, requires_grad=True) return mean_actions, log_std - def proba_distribution(self, mean_actions, log_std, deterministic=False): + def proba_distribution(self, mean_actions: th.Tensor, + log_std: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, 'DiagGaussianDistribution']: """ Create and sample for the distribution given its parameters (mean, std) @@ -84,29 +88,29 @@ def proba_distribution(self, mean_actions, log_std, deterministic=False): action = self.sample() return action, self - def mode(self): + def mode(self) -> th.Tensor: return self.distribution.mean - def sample(self): + def sample(self) -> th.Tensor: return self.distribution.rsample() - def entropy(self): + def entropy(self) -> th.Tensor: return self.distribution.entropy() - def log_prob_from_params(self, mean_actions, log_std): + def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: """ Compute the log probabilty of taking an action given the distribution parameters. 
:param mean_actions: (th.Tensor) :param log_std: (th.Tensor) - :return: (th.Tensor, th.Tensor) + :return: (Tuple[th.Tensor, th.Tensor]) """ action, _ = self.proba_distribution(mean_actions, log_std) log_prob = self.log_prob(action) return action, log_prob - def log_prob(self, action): + def log_prob(self, action: th.Tensor) -> th.Tensor: """ Get the log probabilty of an action given a distribution. Note that you must call `proba_distribution()` method @@ -132,7 +136,7 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution): :param epsilon: (float) small value to avoid NaN due to numerical imprecision. """ - def __init__(self, action_dim, epsilon=1e-6): + def __init__(self, action_dim: int, epsilon: float = 1e-6): super(SquashedDiagGaussianDistribution, self).__init__(action_dim) # Avoid NaN (prevents division by zero or log of zero) self.epsilon = epsilon @@ -143,26 +147,26 @@ def proba_distribution(self, mean_actions, log_std, deterministic=False): deterministic) return action, self - def mode(self): + def mode(self) -> th.Tensor: self.gaussian_action = self.distribution.mean # Squash the output return th.tanh(self.gaussian_action) - def entropy(self): + def entropy(self) -> Optional[th.Tensor]: # No analytical form, # entropy needs to be estimated using -log_prob.mean() return None - def sample(self): + def sample(self) -> th.Tensor: self.gaussian_action = self.distribution.rsample() return th.tanh(self.gaussian_action) - def log_prob_from_params(self, mean_actions, log_std): + def log_prob_from_params(self, mean_actions, log_std) -> Tuple[th.Tensor, th.Tensor]: action, _ = self.proba_distribution(mean_actions, log_std) log_prob = self.log_prob(action, self.gaussian_action) return action, log_prob - def log_prob(self, action, gaussian_action=None): + def log_prob(self, action: th.Tensor, gaussian_action: Optional[th.Tensor] = None) -> th.Tensor: # Inverse tanh # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x)) # We use numpy to avoid numerical instability @@ -185,12 +189,12 @@ class CategoricalDistribution(Distribution): :param action_dim: (int) Number of discrete actions """ - def __init__(self, action_dim): + def __init__(self, action_dim: int): super(CategoricalDistribution, self).__init__() self.distribution = None self.action_dim = action_dim - def proba_distribution_net(self, latent_dim): + def proba_distribution_net(self, latent_dim: int) -> nn.Module: """ Create the layer that represents the distribution: it will be the logits of the Categorical distribution. 
@@ -202,7 +206,8 @@ def proba_distribution_net(self, latent_dim): action_logits = nn.Linear(latent_dim, self.action_dim) return action_logits - def proba_distribution(self, action_logits, deterministic=False): + def proba_distribution(self, action_logits: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, 'CategoricalDistribution']: self.distribution = Categorical(logits=action_logits) if deterministic: action = self.mode() @@ -210,21 +215,21 @@ def proba_distribution(self, action_logits, deterministic=False): action = self.sample() return action, self - def mode(self): + def mode(self) -> th.Tensor: return th.argmax(self.distribution.probs, dim=1) - def sample(self): + def sample(self) -> th.Tensor: return self.distribution.sample() - def entropy(self): + def entropy(self) -> th.Tensor: return self.distribution.entropy() - def log_prob_from_params(self, action_logits): + def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: action, _ = self.proba_distribution(action_logits) log_prob = self.log_prob(action) return action, log_prob - def log_prob(self, action): + def log_prob(self, action: th.Tensor) -> th.Tensor: log_prob = self.distribution.log_prob(action) return log_prob @@ -249,8 +254,12 @@ class StateDependentNoiseDistribution(Distribution): :param epsilon: (float) small value to avoid NaN due to numerical imprecision. """ - def __init__(self, action_dim, full_std=True, use_expln=False, - squash_output=False, learn_features=False, epsilon=1e-6): + def __init__(self, action_dim: int, + full_std: bool = True, + use_expln: bool = False, + squash_output: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6): super(StateDependentNoiseDistribution, self).__init__() self.distribution = None self.action_dim = action_dim @@ -269,7 +278,7 @@ def __init__(self, action_dim, full_std=True, use_expln=False, else: self.bijector = None - def get_std(self, log_std): + def get_std(self, log_std: th.Tensor) -> th.Tensor: """ Get the standard deviation from the learned parameter (log of it by default). This ensures that the std is positive. @@ -294,7 +303,7 @@ def get_std(self, log_std): # Reduce the number of parameters: return th.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std - def sample_weights(self, log_std, batch_size=1): + def sample_weights(self, log_std: th.Tensor, batch_size: int = 1) -> None: """ Sample weights for the noise exploration matrix, using a centered Gaussian distribution. 
@@ -307,7 +316,8 @@ def sample_weights(self, log_std, batch_size=1): self.exploration_mat = self.weights_dist.rsample() self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - def proba_distribution_net(self, latent_dim, log_std_init=-2.0, latent_sde_dim=None): + def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, + latent_sde_dim: Optional[th.Tensor] = None) -> Tuple[nn.Module, nn.Parameter]: """ Create the layers and parameter that represent the distribution: one output will be the deterministic action, the other parameter will be the @@ -327,12 +337,15 @@ def proba_distribution_net(self, latent_dim, log_std_init=-2.0, latent_sde_dim=N # Reduce the number of parameters if needed log_std = th.ones(self.latent_sde_dim, self.action_dim) if self.full_std else th.ones(self.latent_sde_dim, 1) # Transform it to a parameter so it can be optimized - log_std = nn.Parameter(log_std * log_std_init) + log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) # Sample an exploration matrix self.sample_weights(log_std) return mean_actions_net, log_std - def proba_distribution(self, mean_actions, log_std, latent_sde, deterministic=False): + def proba_distribution(self, mean_actions: th.Tensor, + log_std: th.Tensor, + latent_sde: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, 'StateDependentNoiseDistribution']: """ Create and sample for the distribution given its parameters (mean, std) @@ -340,7 +353,7 @@ def proba_distribution(self, mean_actions, log_std, latent_sde, deterministic=Fa :param log_std: (th.Tensor) :param latent_sde: (th.Tensor) :param deterministic: (bool) - :return: (th.Tensor) + :return: (Tuple[th.Tensor, Distribution]) """ # Stop gradient if we don't want to influence the features latent_sde = latent_sde if self.learn_features else latent_sde.detach() @@ -353,13 +366,13 @@ def proba_distribution(self, mean_actions, log_std, latent_sde, deterministic=Fa action = self.sample(latent_sde) return action, self - def mode(self): + def mode(self) -> th.Tensor: action = self.distribution.mean if self.bijector is not None: return self.bijector.forward(action) return action - def get_noise(self, latent_sde): + def get_noise(self, latent_sde: th.Tensor) -> th.Tensor: latent_sde = latent_sde if self.learn_features else latent_sde.detach() # Default case: only one exploration matrix if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): @@ -371,26 +384,28 @@ def get_noise(self, latent_sde): noise = th.bmm(latent_sde, self.exploration_matrices) return noise.squeeze(1) - def sample(self, latent_sde): + def sample(self, latent_sde: th.Tensor) -> th.Tensor: noise = self.get_noise(latent_sde) action = self.distribution.mean + noise if self.bijector is not None: return self.bijector.forward(action) return action - def entropy(self): + def entropy(self) -> Optional[th.Tensor]: # No analytical form, # entropy needs to be estimated using -log_prob.mean() if self.bijector is not None: return None return self.distribution.entropy() - def log_prob_from_params(self, mean_actions, log_std, latent_sde): + def log_prob_from_params(self, mean_actions: th.Tensor, + log_std: th.Tensor, + latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: action, _ = self.proba_distribution(mean_actions, log_std, latent_sde) log_prob = self.log_prob(action) return action, log_prob - def log_prob(self, action): + def log_prob(self, action: th.Tensor) -> th.Tensor: if self.bijector is not None: gaussian_action = self.bijector.inverse(action) else: 
@@ -418,16 +433,16 @@ class TanhBijector(object): :param epsilon: (float) small value to avoid NaN due to numerical imprecision. """ - def __init__(self, epsilon=1e-6): + def __init__(self, epsilon: float = 1e-6): super(TanhBijector, self).__init__() self.epsilon = epsilon @staticmethod - def forward(x): + def forward(x: th.Tensor) -> th.Tensor: return th.tanh(x) @staticmethod - def atanh(x): + def atanh(x: th.Tensor) -> th.Tensor: """ Inverse of Tanh @@ -437,7 +452,7 @@ def atanh(x): return 0.5 * (x.log1p() - (-x).log1p()) @staticmethod - def inverse(y): + def inverse(y: th.Tensor) -> th.Tensor: """ Inverse tanh. @@ -448,19 +463,21 @@ def inverse(y): # Clip the action to avoid NaN return TanhBijector.atanh(y.clamp(min=-1. + eps, max=1. - eps)) - def log_prob_correction(self, x): + def log_prob_correction(self, x: th.Tensor) -> th.Tensor: # Squash correction (from original SAC implementation) return th.log(1.0 - th.tanh(x) ** 2 + self.epsilon) -def make_proba_distribution(action_space, use_sde=False, dist_kwargs=None): +def make_proba_distribution(action_space: gym.spaces.Space, + use_sde: bool = False, + dist_kwargs: Optional[Dict[str, Any]] = None) -> Distribution: """ Return an instance of Distribution for the correct type of action space - :param action_space: (Gym Space) the input action space + :param action_space: (gym.spaces.Space) the input action space :param use_sde: (bool) Force the use of StateDependentNoiseDistribution instead of DiagGaussianDistribution - :param dist_kwargs: (dict) Keyword arguments to pass to the probabilty distribution + :param dist_kwargs: (Optional[Dict[str, Any]]) Keyword arguments to pass to the probabilty distribution :return: (Distribution) the approriate Distribution object """ if dist_kwargs is None: @@ -478,5 +495,6 @@ def make_proba_distribution(action_space, use_sde=False, dist_kwargs=None): # elif isinstance(action_space, spaces.MultiBinary): # return BernoulliDistribution(action_space.n, **dist_kwargs) else: - raise NotImplementedError(f"Error: probability distribution, not implemented for action space of type {type(action_space)}." + raise NotImplementedError("Error: probability distribution, not implemented for action space" + f"of type {type(action_space)}." 
" Must be of type Gym Spaces: Box, Discrete, MultiDiscrete or MultiBinary.") diff --git a/torchy_baselines/common/results_plotter.py b/torchy_baselines/common/results_plotter.py index a5464dadb..6dfe0e077 100644 --- a/torchy_baselines/common/results_plotter.py +++ b/torchy_baselines/common/results_plotter.py @@ -2,12 +2,12 @@ import numpy as np import pandas as pd -import matplotlib +# import matplotlib +# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode import matplotlib.pyplot as plt from torchy_baselines.common.monitor import load_results -# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode plt.rcParams['svg.fonttype'] = 'none' X_TIMESTEPS = 'timesteps' diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 1e3492660..d303a6672 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -161,6 +161,8 @@ def _get_action_dist_from_latent(self, latent_pi, latent_sde=None, deterministic elif isinstance(self.action_dist, StateDependentNoiseDistribution): return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde, deterministic=deterministic) + else: + raise ValueError('Invalid action distribution') def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: latent_pi, _, latent_sde = self._get_latent(observation) From a2b1bf06d36bbf2dd9101e7910abb4d4d5d1090e Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 11:12:07 +0100 Subject: [PATCH 5/9] Add `squash_output` attribute to policy --- torchy_baselines/common/base_class.py | 11 +++-- torchy_baselines/common/callbacks.py | 8 ++-- torchy_baselines/common/policies.py | 59 +++++++++++++++++---------- torchy_baselines/ppo/policies.py | 2 +- torchy_baselines/sac/policies.py | 2 +- torchy_baselines/td3/policies.py | 8 ++-- 6 files changed, 53 insertions(+), 37 deletions(-) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index c0951cbbf..eabc7f1ae 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -86,7 +86,7 @@ def __init__(self, self.num_timesteps = 0 self.eval_env = None self.seed = seed - self.action_noise = None # type: ActionNoise + self.action_noise = None # type: Optional[ActionNoise] self.start_time = None self.policy = None self.learning_rate = None @@ -97,8 +97,8 @@ def __init__(self, # this is used to update the learning rate self._current_progress = 1 # Buffers for logging - self.ep_info_buffer = None # type: deque - self.ep_success_buffer = None # type: deque + self.ep_info_buffer = None # type: Optional[deque] + self.ep_success_buffer = None # type: Optional[deque] # Create and wrap the env if needed if env is not None: @@ -387,13 +387,12 @@ def predict(self, observation: np.ndarray, actions = actions.cpu().numpy() # Rescale to proper domain when using squashing - # TODO: should not be used for a Gaussian distribution? 
- if isinstance(self.action_space, gym.spaces.Box): + if isinstance(self.action_space, gym.spaces.Box) and self.policy.squash_output: actions = self.unscale_action(actions) clipped_actions = actions # Clip the actions to avoid out of bound error when using gaussian distribution - if isinstance(self.action_space, gym.spaces.Box): + if isinstance(self.action_space, gym.spaces.Box) and not self.policy.squash_output: clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) if not vectorized_env: diff --git a/torchy_baselines/common/callbacks.py b/torchy_baselines/common/callbacks.py index 8c0d108e2..392716c96 100644 --- a/torchy_baselines/common/callbacks.py +++ b/torchy_baselines/common/callbacks.py @@ -22,14 +22,14 @@ class BaseCallback(ABC): """ def __init__(self, verbose: int = 0): super(BaseCallback, self).__init__() - self.model = None # type: BaseRLModel + self.model = None # type: Optional[BaseRLModel] self.training_env = None # type: Union[gym.Env, VecEnv, None] self.n_calls = 0 # type: int self.num_timesteps = 0 # type: int self.verbose = verbose - self.locals = None # type: Dict[str, Any] - self.globals = None # type: Dict[str, Any] - self.logger = None # type: Logger + self.locals = None # type: Optional[Dict[str, Any]] + self.globals = None # type: Optional[Dict[str, Any]] + self.logger = None # type: Optional[Logger] # Sometimes, for event callback, it is useful # to have access to the parent object self.parent = None # type: Optional[BaseCallback] diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index 1e0064dca..abab7d845 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Type, Dict, List, Tuple from itertools import zip_longest @@ -14,14 +14,24 @@ class BasePolicy(nn.Module): :param observation_space: (gym.spaces.Space) The observation space of the environment :param action_space: (gym.spaces.Space) The action space of the environment + :param device: (Union[th.device, str]) Device on which the code should run. + :param squash_output: (bool) For continuous actions, whether the output is squashed + or not using a `tanh()` function. """ - def __init__(self, observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, device: Union[th.device, str] = 'cpu'): + action_space: gym.spaces.Space, + device: Union[th.device, str] = 'cpu', + squash_output: bool = False): super(BasePolicy, self).__init__() self.observation_space = observation_space self.action_space = action_space self.device = device + self._squash_output = squash_output + + @property + def squash_output(self) -> bool: + """ (bool) Getter for squash_output.""" + return self._squash_output @staticmethod def init_weights(module: nn.Module, gain: float = 1): @@ -71,21 +81,25 @@ def parameters_to_vector(self) -> np.ndarray: return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() -def create_mlp(input_dim, output_dim, net_arch, - activation_fn=nn.ReLU, squash_out=False): +def create_mlp(input_dim: int, + output_dim: int, + net_arch: List[int], + activation_fn: nn.Module = nn.ReLU, + squash_output: bool = False) -> List[nn.Module]: """ Create a multi layer perceptron (MLP), which is a collection of fully-connected layers each followed by an activation function. 
:param input_dim: (int) Dimension of the input vector :param output_dim: (int) - :param net_arch: ([int]) Architecture of the neural net + :param net_arch: (List[int]) Architecture of the neural net It represents the number of units per layer. The length of this list is the number of layers. - :param activation_fn: (th.nn.Module) The activation function + :param activation_fn: (nn.Module) The activation function to use after each layer. - :param squash_out: (bool) Whether to squash the output using a Tanh + :param squash_output: (bool) Whether to squash the output using a Tanh activation function + :return: (List[nn.Module]) """ if len(net_arch) > 0: @@ -99,12 +113,14 @@ def create_mlp(input_dim, output_dim, net_arch, if output_dim > 0: modules.append(nn.Linear(net_arch[-1], output_dim)) - if squash_out: + if squash_output: modules.append(nn.Tanh()) return modules -def create_sde_feature_extractor(features_dim, sde_net_arch, activation_fn): +def create_sde_feature_extractor(features_dim: int, + sde_net_arch: List[int], + activation_fn: nn.Module) -> Tuple[nn.Sequential, int]: """ Create the neural network that will be used to extract features for the SDE. @@ -117,7 +133,7 @@ def create_sde_feature_extractor(features_dim, sde_net_arch, activation_fn): # Special case: when using states as features (i.e. sde_net_arch is an empty list) # don't use any activation function sde_activation = activation_fn if len(sde_net_arch) > 0 else None - latent_sde_net = create_mlp(features_dim, -1, sde_net_arch, activation_fn=sde_activation, squash_out=False) + latent_sde_net = create_mlp(features_dim, -1, sde_net_arch, activation_fn=sde_activation, squash_output=False) latent_sde_dim = sde_net_arch[-1] if len(sde_net_arch) > 0 else features_dim sde_feature_extractor = nn.Sequential(*latent_sde_net) return sde_feature_extractor, latent_sde_dim @@ -131,7 +147,7 @@ class BaseNetwork(nn.Module): def __init__(self): super(BaseNetwork, self).__init__() - def load_from_vector(self, vector): + def load_from_vector(self, vector: np.ndarray): """ Load parameters from a 1D vector. @@ -140,7 +156,7 @@ def load_from_vector(self, vector): device = next(self.parameters()).device th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(device), self.parameters()) - def parameters_to_vector(self): + def parameters_to_vector(self) -> np.ndarray: """ Convert the parameters to a 1D vector. 
@@ -149,16 +165,16 @@ def parameters_to_vector(self): return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() -_policy_registry = dict() +_policy_registry = dict() # type: Dict[Type[BasePolicy], Dict[str, Type[BasePolicy]]] -def get_policy_from_name(base_policy_type, name): +def get_policy_from_name(base_policy_type: Type[BasePolicy], name: str) -> Type[BasePolicy]: """ - returns the registed policy from the base type and name + Returns the registered policy from the base type and name - :param base_policy_type: (BasePolicy) the base policy object + :param base_policy_type: (Type[BasePolicy]) the base policy class :param name: (str) the policy name - :return: (base_policy_type) the policy + :return: (Type[BasePolicy]) the policy """ if base_policy_type not in _policy_registry: raise ValueError(f"Error: the policy type {base_policy_type} is not registered!") @@ -168,12 +184,13 @@ def get_policy_from_name(base_policy_type, name): return _policy_registry[base_policy_type][name] -def register_policy(name, policy): +def register_policy(name: str, policy: Type[BasePolicy]) -> None: """ - returns the registed policy from the base type and name + Register a policy, so it can be called using its name. + e.g. SAC('MlpPolicy', ...) instead of SAC(MlpPolicy, ...) :param name: (str) the policy name - :param policy: (subclass of BasePolicy) the policy + :param policy: (Type[BasePolicy]) the policy class """ sub_class = None # For building the doc diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index d303a6672..3e47375d5 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -41,7 +41,7 @@ def __init__(self, observation_space, action_space, ortho_init=True, use_sde=False, log_std_init=0.0, full_std=True, sde_net_arch=None, use_expln=False, squash_output=False): - super(PPOPolicy, self).__init__(observation_space, action_space, device) + super(PPOPolicy, self).__init__(observation_space, action_space, device, squash_output=squash_output) self.obs_dim = self.observation_space.shape[0] # Default network architecture, from stable-baselines diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 3fbea937a..4f9673840 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -200,7 +200,7 @@ def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.ReLU, use_sde=False, log_std_init=-3, sde_net_arch=None, use_expln=False): - super(SACPolicy, self).__init__(observation_space, action_space, device) + super(SACPolicy, self).__init__(observation_space, action_space, device, squash_output=True) if net_arch is None: net_arch = [256, 256] diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index fa199521e..f75cdea68 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -52,7 +52,7 @@ def __init__(self, self.sde_feature_extractor = None if use_sde: - latent_pi_net = create_mlp(obs_dim, -1, net_arch, activation_fn, squash_out=False) + latent_pi_net = create_mlp(obs_dim, -1, net_arch, activation_fn, squash_output=False) self.latent_pi = nn.Sequential(*latent_pi_net) latent_sde_dim = net_arch[-1] learn_features = sde_net_arch is not None @@ -74,7 +74,7 @@ def __init__(self, self.sde_optimizer = th.optim.Adam([self.log_std], lr=lr_sde) self.reset_noise() else: - actor_net = create_mlp(obs_dim, action_dim, net_arch, activation_fn, squash_out=True) + actor_net 
= create_mlp(obs_dim, action_dim, net_arch, activation_fn, squash_output=True) self.mu = nn.Sequential(*actor_net) def get_std(self) -> torch.Tensor: @@ -134,7 +134,7 @@ def forward(self, obs: torch.Tensor, deterministic: bool = True) -> torch.Tensor if self.clip_noise is not None: noise = th.clamp(noise, -self.clip_noise, self.clip_noise) # TODO: Replace with squashing -> need to account for that in the sde update - # -> set squash_out=True in the action_dist? + # -> set squash_output=True in the action_dist? # NOTE: the clipping is done in the rollout for now return self.mu(latent_pi) + noise # action, _ = self._get_action_dist_from_latent(latent_pi) @@ -215,7 +215,7 @@ def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.ReLU, use_sde=False, log_std_init=-3, clip_noise=None, lr_sde=3e-4, sde_net_arch=None, use_expln=False): - super(TD3Policy, self).__init__(observation_space, action_space, device) + super(TD3Policy, self).__init__(observation_space, action_space, device, squash_output=True) # Default network architecture, from the original paper if net_arch is None: From 8b559d71ab7eca97790be7263ab3c827004d430b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 13:42:16 +0100 Subject: [PATCH 6/9] Remove deprecated monitor format and improve tests --- docs/misc/changelog.rst | 1 + tests/test_monitor.py | 46 ++++++++++++++++++---------- torchy_baselines/common/monitor.py | 48 ++++++++++-------------------- 3 files changed, 47 insertions(+), 48 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 02bf897b5..ea784b619 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -12,6 +12,7 @@ Breaking Changes: - Return type of `evaluation.evaluate_policy()` has been changed - Refactored the replay buffer to avoid transformation between PyTorch and NumPy - Created `OffPolicyRLModel` base class +- Remove deprecated JSON format for `Monitor` New Features: ^^^^^^^^^^^^^ diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 141b2c0d4..d783f2554 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -8,33 +8,47 @@ from torchy_baselines.common.monitor import Monitor, get_monitor_files, load_results -def test_monitor(): +def test_monitor(tmp_path): """ test the monitor wrapper """ env = gym.make("CartPole-v1") env.seed(0) - monitor_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()) + monitor_file = os.path.join(str(tmp_path), "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())) monitor_env = Monitor(env, monitor_file) monitor_env.reset() - for _ in range(1000): - _, _, done, _ = monitor_env.step(0) + total_steps = 1000 + ep_rewards = [] + ep_lengths = [] + ep_len, ep_reward = 0, 0 + for _ in range(total_steps): + _, reward, done, _ = monitor_env.step(0) + ep_len += 1 + ep_reward += reward if done: + ep_rewards.append(ep_reward) + ep_lengths.append(ep_len) monitor_env.reset() - - file_handler = open(monitor_file, 'rt') - - first_line = file_handler.readline() - assert first_line.startswith('#') - metadata = json.loads(first_line[1:]) - assert metadata['env_id'] == "CartPole-v1" - assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata" - - last_logline = pandas.read_csv(file_handler, index_col=None) - assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" - file_handler.close() + ep_len, ep_reward = 0, 0 + + monitor_env.close() + assert monitor_env.get_total_steps() == 
total_steps + assert sum(ep_lengths) == sum(monitor_env.get_episode_lengths()) + assert sum(monitor_env.get_episode_rewards()) == sum(ep_rewards) + _ = monitor_env.get_episode_times() + + with open(monitor_file, 'rt') as file_handler: + first_line = file_handler.readline() + assert first_line.startswith('#') + metadata = json.loads(first_line[1:]) + assert metadata['env_id'] == "CartPole-v1" + assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata" + + last_logline = pandas.read_csv(file_handler, index_col=None) + assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" os.remove(monitor_file) + def test_monitor_load_results(tmp_path): """ test load_results on log files produced by the monitor wrapper diff --git a/torchy_baselines/common/monitor.py b/torchy_baselines/common/monitor.py index 88bea456a..3d84b9ba9 100644 --- a/torchy_baselines/common/monitor.py +++ b/torchy_baselines/common/monitor.py @@ -14,22 +14,21 @@ class Monitor(gym.Wrapper): EXT = "monitor.csv" - file_handler = None def __init__(self, env: gym.Env, filename: Optional[str] = None, allow_early_resets: bool = True, - reset_keywords=(), - info_keywords=()): + reset_keywords: Tuple[str, ...] = (), + info_keywords: Tuple[str, ...] = ()): """ A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. :param env: (gym.Env) The environment :param filename: (Optional[str]) the location to save a log file, can be None for no log :param allow_early_resets: (bool) allows the reset of the environment before it is done - :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset - :param info_keywords: (tuple) extra information to log, from the information return of environment.step + :param reset_keywords: (Tuple[str, ...]) extra keywords for the reset call, if extra parameters are needed at reset + :param info_keywords: (Tuple[str, ...]) extra information to log, from the information return of environment.step """ super(Monitor, self).__init__(env=env) self.t_start = time.time() @@ -93,12 +92,12 @@ def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, Dict[Any, A if done: self.needs_reset = True ep_rew = sum(self.rewards) - eplen = len(self.rewards) - ep_info = {"r": round(ep_rew, 6), "l": eplen, "t": round(time.time() - self.t_start, 6)} + ep_len = len(self.rewards) + ep_info = {"r": round(ep_rew, 6), "l": ep_len, "t": round(time.time() - self.t_start, 6)} for key in self.info_keywords: ep_info[key] = info[key] self.episode_rewards.append(ep_rew) - self.episode_lengths.append(eplen) + self.episode_lengths.append(ep_len) self.episode_times.append(time.time() - self.t_start) ep_info.update(self.current_reset_info) if self.logger: @@ -168,41 +167,26 @@ def get_monitor_files(path: str) -> List[str]: def load_results(path: str) -> pandas.DataFrame: """ - Load all Monitor logs from a given directory path matching ``*monitor.csv`` and ``*monitor.json`` + Load all Monitor logs from a given directory path matching ``*monitor.csv`` :param path: (str) the directory path containing the log file(s) :return: (pandas.DataFrame) the logged data """ - # get both csv and (old) json files - monitor_files = (glob(os.path.join(path, "*monitor.json")) + get_monitor_files(path)) - if not monitor_files: + monitor_files = get_monitor_files(path) + if len(monitor_files) == 0: raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, path)) - 
data_frames = [] - headers = [] + data_frames, headers = [], [] for file_name in monitor_files: with open(file_name, 'rt') as file_handler: - if file_name.endswith('csv'): - first_line = file_handler.readline() - assert first_line[0] == '#' - header = json.loads(first_line[1:]) - data_frame = pandas.read_csv(file_handler, index_col=None) - headers.append(header) - elif file_name.endswith('json'): # Deprecated json format - episodes = [] - lines = file_handler.readlines() - header = json.loads(lines[0]) - headers.append(header) - for line in lines[1:]: - episode = json.loads(line) - episodes.append(episode) - data_frame = pandas.DataFrame(episodes) - else: - assert 0, 'unreachable' + first_line = file_handler.readline() + assert first_line[0] == '#' + header = json.loads(first_line[1:]) + data_frame = pandas.read_csv(file_handler, index_col=None) + headers.append(header) data_frame['t'] += header['t_start'] data_frames.append(data_frame) data_frame = pandas.concat(data_frames) data_frame.sort_values('t', inplace=True) data_frame.reset_index(inplace=True) data_frame['t'] -= min(header['t_start'] for header in headers) - # data_frame.headers = headers # HACK to preserve backwards compatibility return data_frame From e31b139c470c21041ebfe841480e47d18ee89c55 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 14:03:41 +0100 Subject: [PATCH 7/9] Add test for predict method --- tests/test_predict.py | 55 +++++++++++++++++++++++++++ tests/test_run.py | 12 ------ tests/test_save_load.py | 1 + torchy_baselines/common/base_class.py | 37 +++++++++--------- 4 files changed, 75 insertions(+), 30 deletions(-) create mode 100644 tests/test_predict.py diff --git a/tests/test_predict.py b/tests/test_predict.py new file mode 100644 index 000000000..e75deec06 --- /dev/null +++ b/tests/test_predict.py @@ -0,0 +1,55 @@ +import gym +import pytest + +from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3 +from torchy_baselines.common.vec_env import DummyVecEnv + +MODEL_LIST = [ + CEMRL, + PPO, + A2C, + TD3, + SAC, +] + +@pytest.mark.parametrize("model_class", MODEL_LIST) +def test_auto_wrap(model_class): + # test auto wrapping of env into a VecEnv + env = gym.make('Pendulum-v0') + eval_env = gym.make('Pendulum-v0') + model = model_class('MlpPolicy', env) + model.learn(100, eval_env=eval_env) + + +@pytest.mark.parametrize("model_class", MODEL_LIST) +def test_predict(model_class): + # test detection of different shapes by the predict method + model = model_class('MlpPolicy', 'Pendulum-v0') + env = gym.make('Pendulum-v0') + vec_env = DummyVecEnv([lambda: gym.make('Pendulum-v0'), lambda: gym.make('Pendulum-v0')]) + + obs = env.reset() + action = model.predict(obs) + assert action.shape == env.action_space.shape + assert env.action_space.contains(action) + + vec_env_obs = vec_env.reset() + action = model.predict(vec_env_obs) + assert action.shape[0] == vec_env_obs.shape[0] + + +@pytest.mark.parametrize("model_class", [A2C, PPO]) +def test_predict_discrete(model_class): + # test detection of different shapes by the predict method + model = model_class('MlpPolicy', 'CartPole-v1') + env = gym.make('CartPole-v1') + vec_env = DummyVecEnv([lambda: gym.make('CartPole-v1'), lambda: gym.make('CartPole-v1')]) + + obs = env.reset() + action = model.predict(obs) + assert action.shape == () + assert env.action_space.contains(action) + + vec_env_obs = vec_env.reset() + action = model.predict(vec_env_obs) + assert action.shape[0] == vec_env_obs.shape[0] diff --git a/tests/test_run.py b/tests/test_run.py index 
1d206c9b8..fdfcff6da 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -14,18 +14,12 @@ def test_td3(action_noise): model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") def test_cemrl(): model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1, learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") @pytest.mark.parametrize("model_class", [A2C, PPO]) @@ -33,9 +27,6 @@ def test_cemrl(): def test_onpolicy(model_class, env_id): model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") @pytest.mark.parametrize("ent_coef", ['auto', 0.01]) @@ -44,6 +35,3 @@ def test_sac(ent_coef): learning_starts=100, verbose=1, create_eval_env=True, ent_coef=ent_coef, action_noise=NormalActionNoise(np.zeros(1), np.zeros(1))) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 45c0ac5fc..9d73ddf74 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -134,6 +134,7 @@ def test_exclude_include_saved_params(model_class): # clear file from os os.remove("test_save.zip") + @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_save_load_replay_buffer(model_class): log_folder = 'logs' diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index eabc7f1ae..15da3bb6b 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -335,24 +335,25 @@ def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.s else: raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") - elif isinstance(observation_space, gym.spaces.MultiDiscrete): - if observation.shape == (len(observation_space.nvec),): - return False - elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): - return True - else: - raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + - "environment, please use ({},) or ".format(len(observation_space.nvec)) + - "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) - elif isinstance(observation_space, gym.spaces.MultiBinary): - if observation.shape == (observation_space.n,): - return False - elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + - "environment, please use ({},) or ".format(observation_space.n) + - "(n_env, {}) for the observation shape.".format(observation_space.n)) + # TODO: add support for MultiDiscrete and MultiBinary action spaces + # elif isinstance(observation_space, gym.spaces.MultiDiscrete): + # if observation.shape == (len(observation_space.nvec),): + # 
return False + # elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): + # return True + # else: + # raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + + # "environment, please use ({},) or ".format(len(observation_space.nvec)) + + # "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) + # elif isinstance(observation_space, gym.spaces.MultiBinary): + # if observation.shape == (observation_space.n,): + # return False + # elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: + # return True + # else: + # raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + + # "environment, please use ({},) or ".format(observation_space.n) + + # "(n_env, {}) for the observation shape.".format(observation_space.n)) else: raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." .format(observation_space)) From 4392759057958520e504bca407dc249dd5db8f33 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 14:15:55 +0100 Subject: [PATCH 8/9] Comment unused code --- tests/test_predict.py | 29 ++++++++------------------- torchy_baselines/common/base_class.py | 18 ++++++++--------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/tests/test_predict.py b/tests/test_predict.py index e75deec06..6f2245ce8 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -22,32 +22,19 @@ def test_auto_wrap(model_class): @pytest.mark.parametrize("model_class", MODEL_LIST) -def test_predict(model_class): - # test detection of different shapes by the predict method - model = model_class('MlpPolicy', 'Pendulum-v0') - env = gym.make('Pendulum-v0') - vec_env = DummyVecEnv([lambda: gym.make('Pendulum-v0'), lambda: gym.make('Pendulum-v0')]) - - obs = env.reset() - action = model.predict(obs) - assert action.shape == env.action_space.shape - assert env.action_space.contains(action) +@pytest.mark.parametrize("env_id", ['Pendulum-v0', 'CartPole-v1']) +def test_predict(model_class, env_id): + if env_id == 'CartPole-v1' and model_class not in [PPO, A2C]: + return - vec_env_obs = vec_env.reset() - action = model.predict(vec_env_obs) - assert action.shape[0] == vec_env_obs.shape[0] - - -@pytest.mark.parametrize("model_class", [A2C, PPO]) -def test_predict_discrete(model_class): # test detection of different shapes by the predict method - model = model_class('MlpPolicy', 'CartPole-v1') - env = gym.make('CartPole-v1') - vec_env = DummyVecEnv([lambda: gym.make('CartPole-v1'), lambda: gym.make('CartPole-v1')]) + model = model_class('MlpPolicy', env_id) + env = gym.make(env_id) + vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)]) obs = env.reset() action = model.predict(obs) - assert action.shape == () + assert action.shape == env.action_space.shape assert env.action_space.contains(action) vec_env_obs = vec_env.reset() diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 15da3bb6b..6b3fb6a52 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -327,15 +327,15 @@ def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.s "Box environment, please use {} ".format(observation_space.shape) + "or (n_env, {}) for the observation shape." 
.format(", ".join(map(str, observation_space.shape))))
-        elif isinstance(observation_space, gym.spaces.Discrete):
-            if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
-                return False
-            elif len(observation.shape) == 1:
-                return True
-            else:
-                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
-                                 "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
-        # TODO: add support for MultiDiscrete and MultiBinary action spaces
+        # TODO: add support for Discrete, MultiDiscrete and MultiBinary observation spaces
+        # elif isinstance(observation_space, gym.spaces.Discrete):
+        #     if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
+        #         return False
+        #     elif len(observation.shape) == 1:
+        #         return True
+        #     else:
+        #         raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
+        #                          "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
         # elif isinstance(observation_space, gym.spaces.MultiDiscrete):
         #     if observation.shape == (len(observation_space.nvec),):
         #         return False

From af46aa19d1bfc5b99604710fc9062d5ad32129a0 Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Fri, 14 Feb 2020 14:33:41 +0100
Subject: [PATCH 9/9] Add copyright notice

---
 NOTICE | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 NOTICE

diff --git a/NOTICE b/NOTICE
new file mode 100644
index 000000000..9fc6700ee
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,27 @@
+Large portions of the code of Torchy-Baselines (in `common/`) were ported from Stable-Baselines, a fork of OpenAI Baselines,
+both licensed under the MIT License:
+
+before the fork (June 2018):
+Copyright (c) 2017 OpenAI (http://openai.com)
+
+after the fork (June 2018):
+Copyright (c) 2018-2019 Stable-Baselines Team
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
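
Usage sketch for the behaviour covered by the patches above (the log directory, file name and step counts are illustrative, not taken from the patches): with the JSON format removed, `Monitor` only writes `*.monitor.csv` files, `load_results()` aggregates every such file in a directory into a single pandas DataFrame with `r`/`l`/`t` columns, and `predict()` mirrors the shape of its input, returning a single action for a single observation and one action per environment for a vectorized one.

    import os
    import uuid

    import gym

    from torchy_baselines import A2C
    from torchy_baselines.common.monitor import Monitor, load_results
    from torchy_baselines.common.vec_env import DummyVecEnv

    log_dir = "/tmp/torchy_baselines_monitor_example"  # illustrative path
    os.makedirs(log_dir, exist_ok=True)

    # Monitor writes one `*.monitor.csv` file per wrapped environment (CSV format only)
    monitor_file = os.path.join(log_dir, "{}.monitor.csv".format(uuid.uuid4()))
    env = Monitor(gym.make("CartPole-v1"), monitor_file)

    model = A2C("MlpPolicy", env)
    model.learn(total_timesteps=1000)

    # Aggregate all monitor files found in the directory into one DataFrame
    results = load_results(log_dir)
    print(results[["r", "l", "t"]].tail())

    # A single observation yields a single action accepted by the action space
    obs = env.reset()
    action = model.predict(obs)
    assert env.action_space.contains(action)

    # A vectorized observation yields one action per environment
    vec_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    vec_actions = model.predict(vec_env.reset())
    assert vec_actions.shape[0] == 1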