From 7bafdb3a675f4d6c033d6453454969ca5ef2f4c0 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 Feb 2020 11:34:29 +0100 Subject: [PATCH 1/9] Add `get_vec_normalize_env()` --- docs/misc/changelog.rst | 1 + tests/test_vec_normalize.py | 2 ++ torchy_baselines/common/base_class.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 30f5a60d0..0bb3910de 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -19,6 +19,7 @@ New Features: - Add support for Callback (cf https://github.com/hill-a/stable-baselines/pull/644) - Add methods for saving and loading replay buffer - Add `extend()` method to the buffers +- Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists Bug Fixes: ^^^^^^^^^^ diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index ab0f7f154..3c21f69aa 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -123,6 +123,8 @@ def test_offpolicy_normalization(model_class): model = model_class('MlpPolicy', env, verbose=1) model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500) + # Check getter + assert isinstance(model.get_vec_normalize_env(), VecNormalize) def test_sync_vec_normalize(): diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 68c9025d3..d4e681028 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -14,7 +14,7 @@ from torchy_baselines.common import logger from torchy_baselines.common.policies import BasePolicy, get_policy_from_name from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate -from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize +from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback @@ -212,10 +212,18 @@ def get_env(self) -> Optional[VecEnv]: """ Returns the current environment (can be None if not defined). - :return: The current environment + :return: (Optional[VecEnv]) The current environment """ return self.env + def get_vec_normalize_env(self) -> Optional[VecNormalize]: + """ + Return the `VecNormalize` wrapper of the training env + if it exists. + :return: Optional[VecNormalize] The `VecNormalize` env. 
+ """ + return self._vec_normalize_env + @staticmethod def check_env(env, observation_space: gym.spaces.Space, action_space: gym.spaces.Space) -> bool: """ From 9caea35a11289e456ae2fe79cae135cfc00c7ab1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 Feb 2020 14:31:15 +0100 Subject: [PATCH 2/9] Add results plotter --- .coveragerc | 2 + docs/misc/changelog.rst | 1 + torchy_baselines/common/results_plotter.py | 126 +++++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 torchy_baselines/common/results_plotter.py diff --git a/.coveragerc b/.coveragerc index a8fc2af79..511f20d8b 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,6 +3,8 @@ branch = False omit = tests/* setup.py + # Require graphical interface + torchy_baselines/common/results_plotter.py [report] exclude_lines = diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 0bb3910de..b083e3983 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -20,6 +20,7 @@ New Features: - Add methods for saving and loading replay buffer - Add `extend()` method to the buffers - Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists +- Add `¶results_plotter` from Stable Baselines Bug Fixes: ^^^^^^^^^^ diff --git a/torchy_baselines/common/results_plotter.py b/torchy_baselines/common/results_plotter.py new file mode 100644 index 000000000..a5464dadb --- /dev/null +++ b/torchy_baselines/common/results_plotter.py @@ -0,0 +1,126 @@ +from typing import Tuple, Callable, List, Optional + +import numpy as np +import pandas as pd +import matplotlib +import matplotlib.pyplot as plt + +from torchy_baselines.common.monitor import load_results + +# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode +plt.rcParams['svg.fonttype'] = 'none' + +X_TIMESTEPS = 'timesteps' +X_EPISODES = 'episodes' +X_WALLTIME = 'walltime_hrs' +POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] +EPISODES_WINDOW = 100 +COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', + 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', + 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] + + +def rolling_window(array: np.ndarray, window: int) -> np.ndarray: + """ + Apply a rolling window to a np.ndarray + + :param array: (np.ndarray) the input Array + :param window: (int) length of the rolling window + :return: (np.ndarray) rolling window on the input array + """ + shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) + strides = array.strides + (array.strides[-1],) + return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) + + +def window_func(var_1: np.ndarray, var_2: np.ndarray, + window: int, func: Callable) -> Tuple[np.ndarray, np.ndarray]: + """ + Apply a function to the rolling window of 2 arrays + + :param var_1: (np.ndarray) variable 1 + :param var_2: (np.ndarray) variable 2 + :param window: (int) length of the rolling window + :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) + :return: (Tuple[np.ndarray, np.ndarray]) the rolling output with applied function + """ + var_2_window = rolling_window(var_2, window) + function_on_var2 = func(var_2_window, axis=-1) + return var_1[window - 1:], function_on_var2 + + +def ts2xy(timesteps: pd.DataFrame, x_axis: str) -> Tuple[np.ndarray, np.ndarray]: + """ + Decompose a timesteps variable to x ans ys + + :param timesteps: (pd.DataFrame) the input data + 
:param x_axis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :return: (Tuple[np.ndarray, np.ndarray]) the x and y output + """ + if x_axis == X_TIMESTEPS: + x_var = np.cumsum(timesteps.l.values) + y_var = timesteps.r.values + elif x_axis == X_EPISODES: + x_var = np.arange(len(timesteps)) + y_var = timesteps.r.values + elif x_axis == X_WALLTIME: + # Convert to hours + x_var = timesteps.t.values / 3600. + y_var = timesteps.r.values + else: + raise NotImplementedError + return x_var, y_var + + +def plot_curves(xy_list: List[Tuple[np.ndarray, np.ndarray]], + x_axis: str, title: str, figsize: Tuple[int, int] = (8, 2)) -> None: + """ + plot the curves + + :param xy_list: (List[Tuple[np.ndarray, np.ndarray]]) the x and y coordinates to plot + :param x_axis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param title: (str) the title of the plot + :param figsize: (Tuple[int, int]) Size of the figure (width, height) + """ + + plt.figure(figsize=figsize) + max_x = max(xy[0][-1] for xy in xy_list) + min_x = 0 + for (i, (x, y)) in enumerate(xy_list): + color = COLORS[i] + plt.scatter(x, y, s=2) + # Do not plot the smoothed curve at all if the timeseries is shorter than window size. + if x.shape[0] >= EPISODES_WINDOW: + # Compute and plot rolling mean with window of size EPISODE_WINDOW + x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) + plt.plot(x, y_mean, color=color) + plt.xlim(min_x, max_x) + plt.title(title) + plt.xlabel(x_axis) + plt.ylabel("Episode Rewards") + plt.tight_layout() + + +def plot_results(dirs: List[str], num_timesteps: Optional[int], + x_axis: str, task_name: str, figsize: Tuple[int, int] = (8, 2)) -> None: + """ + plot the results + + :param dirs: ([str]) the save location of the results to plot + :param num_timesteps: (int or None) only plot the points below this value + :param x_axis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param task_name: (str) the title of the task to plot + :param figsize: (Tuple[int, int]) Size of the figure (width, height) + """ + + timesteps_list = [] + for folder in dirs: + timesteps = load_results(folder) + if num_timesteps is not None: + timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] + timesteps_list.append(timesteps) + xy_list = [ts2xy(timesteps_item, x_axis) for timesteps_item in timesteps_list] + plot_curves(xy_list, x_axis, task_name, figsize) From f1a4fa2d3fae520e1308929d04f78e6d7b6223cb Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 Feb 2020 15:25:05 +0100 Subject: [PATCH 3/9] Improve predict method --- docs/misc/changelog.rst | 5 +- setup.py | 6 +- tests/test_save_load.py | 6 +- torchy_baselines/__init__.py | 2 +- torchy_baselines/common/base_class.py | 105 +++++++++++++++++++++++--- torchy_baselines/common/policies.py | 27 +++++-- torchy_baselines/ppo/policies.py | 6 +- torchy_baselines/ppo/ppo.py | 22 ------ torchy_baselines/sac/policies.py | 6 +- torchy_baselines/sac/sac.py | 19 ----- torchy_baselines/td3/policies.py | 3 + torchy_baselines/td3/td3.py | 19 ----- 12 files changed, 135 insertions(+), 91 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index b083e3983..02bf897b5 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,7 +3,7 @@ Changelog ========== -Pre-Release 0.2.0a1 (WIP) +Pre-Release 0.2.0a2 (WIP) 
------------------------------ Breaking Changes: @@ -20,7 +20,8 @@ New Features: - Add methods for saving and loading replay buffer - Add `extend()` method to the buffers - Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists -- Add `¶results_plotter` from Stable Baselines +- Add `results_plotter` from Stable Baselines +- Improve `predict()` method to handle different type of observations (single, vectorized, ...) Bug Fixes: ^^^^^^^^^^ diff --git a/setup.py b/setup.py index b9598fc98..92389eaa0 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,9 @@ 'torch>=1.2.0', 'cloudpickle', # For reading logs - 'pandas' + 'pandas', + # Plotting learning curves + 'matplotlib' ], extras_require={ 'tests': [ @@ -45,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.0a1", + version="0.2.0a2", ) # python setup.py sdist diff --git a/tests/test_save_load.py b/tests/test_save_load.py index edd326ace..45c0ac5fc 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -34,7 +34,7 @@ def test_save_load(model_class): env.reset() observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)]) - observations = np.squeeze(observations) + observations = observations.reshape(10, -1) # Get dictionary of current parameters params = deepcopy(model.policy.state_dict()) @@ -53,7 +53,7 @@ def test_save_load(model_class): params = new_params # get selected actions - selected_actions = [model.predict(observation, deterministic=True) for observation in observations] + selected_actions = model.predict(observations, deterministic=True) # Check model.save("test_save.zip") @@ -68,7 +68,7 @@ def test_save_load(model_class): assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load." # check if model still selects the same actions - new_selected_actions = [model.predict(observation, deterministic=True) for observation in observations] + new_selected_actions = model.predict(observations, deterministic=True) assert np.allclose(selected_actions, new_selected_actions, 1e-4) # check if learn still works diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index e250cc26d..5e22a8dd0 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.0a1" +__version__ = "0.2.0a2" diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index d4e681028..2348d0e10 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -306,21 +306,104 @@ def learn(self, total_timesteps: int, """ raise NotImplementedError() - @abstractmethod + @staticmethod + def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.spaces.Space) -> bool: + """ + For every observation type, detects and validates the shape, + then returns whether or not the observation is vectorized. 
+ + :param observation: (np.ndarray) the input observation to validate + :param observation_space: (gym.spaces) the observation space + :return: (bool) whether the given observation is vectorized or not + """ + if isinstance(observation_space, gym.spaces.Box): + if observation.shape == observation_space.shape: + return False + elif observation.shape[1:] == observation_space.shape: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + + "Box environment, please use {} ".format(observation_space.shape) + + "or (n_env, {}) for the observation shape." + .format(", ".join(map(str, observation_space.shape)))) + elif isinstance(observation_space, gym.spaces.Discrete): + if observation.shape == (): # A numpy array of a number, has shape empty tuple '()' + return False + elif len(observation.shape) == 1: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + + "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") + elif isinstance(observation_space, gym.spaces.MultiDiscrete): + if observation.shape == (len(observation_space.nvec),): + return False + elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): + return True + else: + raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + + "environment, please use ({},) or ".format(len(observation_space.nvec)) + + "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) + elif isinstance(observation_space, gym.spaces.MultiBinary): + if observation.shape == (observation_space.n,): + return False + elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: + return True + else: + raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + + "environment, please use ({},) or ".format(observation_space.n) + + "(n_env, {}) for the observation shape.".format(observation_space.n)) + else: + raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." + .format(observation_space)) + def predict(self, observation: np.ndarray, state: Optional[np.ndarray] = None, mask: Optional[np.ndarray] = None, deterministic: bool = False) -> np.ndarray: """ - Get the model's action from an observation + Get the model's action(s) from an observation + + :param observation: (np.ndarray) the input observation + :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies) + :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies) + :param deterministic: (bool) Whether or not to return deterministic actions. 
+ :return: (np.ndarray) the model's action and the next state (used in recurrent policies) + """ + # if state is None: + # state = self.initial_state + # if mask is None: + # mask = [False for _ in range(self.n_envs)] + observation = np.array(observation) + vectorized_env = self._is_vectorized_observation(observation, self.observation_space) + + observation = observation.reshape((-1,) + self.observation_space.shape) + # Convert to float pytorch + # TODO: replace with preprocessing + observation = th.as_tensor(observation).float().to(self.device) + with th.no_grad(): + actions = self.policy.predict(observation, deterministic=deterministic) + # Convert to numpy + actions = actions.cpu().numpy() + + # Rescale to proper domain when using squashing + # TODO: should not be used for a Gaussian distribution? + if isinstance(self.action_space, gym.spaces.Box): + actions = self.unscale_action(actions) + + clipped_actions = actions + # Clip the actions to avoid out of bound error when using gaussian distribution + if isinstance(self.action_space, gym.spaces.Box): + clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) + + if not vectorized_env: + if state is not None: + raise ValueError("Error: The environment must be vectorized when using recurrent policies.") + clipped_actions = clipped_actions[0] + + # TODO: switch to stable baselines API + # return clipped_actions, state + return clipped_actions - :param observation: the input observation - :param state: The last states (can be None, used in recurrent policies) - :param mask: The last masks (can be None, used in recurrent policies) - :param deterministic: Whether or not to return deterministic actions. - :return: the model's action and the next state (used in recurrent policies) - """ - raise NotImplementedError() @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs): @@ -806,7 +889,9 @@ def collect_rollouts(self, # Warmup phase unscaled_action = np.array([self.action_space.sample()]) else: - unscaled_action = self.predict(obs, deterministic=not self.use_sde) + # Note: we assume that the policy uses tanh to scale the action + # We use non-deterministic action in the case of SAC, for TD3, it does not matter + unscaled_action = self.predict(obs, deterministic=False) # Rescale the action from [low, high] to [-1, 1] scaled_action = self.scale_action(unscaled_action) diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index ea7f755ad..1e0064dca 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -1,25 +1,30 @@ +from typing import Union + from itertools import zip_longest +import gym import torch as th import torch.nn as nn +import numpy as np class BasePolicy(nn.Module): """ The base policy object - :param observation_space: (Gym Space) The observation space of the environment - :param action_space: (Gym Space) The action space of the environment + :param observation_space: (gym.spaces.Space) The observation space of the environment + :param action_space: (gym.spaces.Space) The action space of the environment """ - def __init__(self, observation_space, action_space, device='cpu'): + def __init__(self, observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, device: Union[th.device, str] = 'cpu'): super(BasePolicy, self).__init__() self.observation_space = observation_space self.action_space = action_space self.device = device @staticmethod - def init_weights(module, gain=1): + def init_weights(module: 
nn.Module, gain: float = 1): if type(module) == nn.Linear: nn.init.orthogonal_(module.weight, gain=gain) module.bias.data.fill_(0.0) @@ -27,7 +32,13 @@ def init_weights(module, gain=1): def forward(self, *_args, **kwargs): raise NotImplementedError() - def save(self, path): + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + """ + Get the action according to the policy for a given observation. + """ + raise NotImplementedError() + + def save(self, path: str) -> None: """ Save model to a given location. @@ -35,7 +46,7 @@ def save(self, path): """ th.save(self.state_dict(), path) - def load(self, path): + def load(self, path: str) -> None: """ Load saved model from path. @@ -43,7 +54,7 @@ def load(self, path): """ self.load_state_dict(th.load(path)) - def load_from_vector(self, vector): + def load_from_vector(self, vector: np.ndarray): """ Load parameters from a 1D vector. @@ -51,7 +62,7 @@ def load_from_vector(self, vector): """ th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(self.device), self.parameters()) - def parameters_to_vector(self): + def parameters_to_vector(self) -> np.ndarray: """ Convert the parameters to a 1D vector. diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 4421d5045..1e3492660 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -162,10 +162,10 @@ def _get_action_dist_from_latent(self, latent_pi, latent_sde=None, deterministic return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde, deterministic=deterministic) - def actor_forward(self, obs, deterministic=False): - latent_pi, _, latent_sde = self._get_latent(obs) + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + latent_pi, _, latent_sde = self._get_latent(observation) action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) - return action.detach().cpu().numpy() + return action def evaluate_actions(self, obs, action, deterministic=False): """ diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index b17f4f8ce..2ce168b1f 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -129,28 +129,6 @@ def _setup_model(self): if self.clip_range_vf is not None: self.clip_range_vf = get_schedule_fn(self.clip_range_vf) - def select_action(self, observation, deterministic=False): - # Normally not needed - observation = np.array(observation) - with th.no_grad(): - observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.policy.actor_forward(observation, deterministic=deterministic) - - def predict(self, observation, state=None, mask=None, deterministic=False): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. 
- :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - clipped_actions = self.select_action(observation, deterministic=deterministic) - if isinstance(self.action_space, gym.spaces.Box): - clipped_actions = np.clip(clipped_actions, self.action_space.low, self.action_space.high) - return clipped_actions - def collect_rollouts(self, env: VecEnv, callback: BaseCallback, diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 3fe11c50e..3fbea937a 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -129,11 +129,11 @@ def get_action_dist_params(self, obs): def forward(self, obs, deterministic=False): mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) if self.use_sde: - # Note the action is squashed + # Note: the action is squashed action, _ = self.action_dist.proba_distribution(mean_actions, log_std, latent_sde, deterministic=deterministic) else: - # Note the action is squashed + # Note: the action is squashed action, _ = self.action_dist.proba_distribution(mean_actions, log_std, deterministic=deterministic) return action @@ -246,6 +246,8 @@ def make_critic(self): def forward(self, obs): return self.actor(obs) + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + return self.actor.forward(observation, deterministic) MlpPolicy = SACPolicy diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 9617ec78f..f0930ee21 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -148,25 +148,6 @@ def _create_aliases(self): self.critic = self.policy.critic self.critic_target = self.policy.critic_target - def select_action(self, observation): - # Normally not needed - observation = np.array(observation) - with th.no_grad(): - observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.actor(observation).cpu().data.numpy() - - def predict(self, observation, state=None, mask=None, deterministic=True): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. 
- :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - return self.unscale_action(self.select_action(observation)) - def train(self, gradient_steps: int, batch_size: int = 64): # Update optimizers learning rate optimizers = [self.actor.optimizer, self.critic.optimizer] diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index 8bc5f60a4..fa199521e 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -277,6 +277,9 @@ def make_critic(self): def forward(self, obs, deterministic=True): return self.actor(obs, deterministic=deterministic) + def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + return self.forward(observation, deterministic) + MlpPolicy = TD3Policy diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 05f1b4fa7..1ba594775 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -114,25 +114,6 @@ def _create_aliases(self): self.critic_target = self.policy.critic_target self.vf_net = self.policy.vf_net - def select_action(self, observation, deterministic=True): - # Normally not needed - observation = np.array(observation) - with th.no_grad(): - observation = th.FloatTensor(observation.reshape(1, -1)).to(self.device) - return self.actor(observation, deterministic=deterministic).cpu().numpy() - - def predict(self, observation, state=None, mask=None, deterministic=True): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. - :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - return self.unscale_action(self.select_action(observation, deterministic=deterministic)) - def train_critic(self, gradient_steps: int = 1, batch_size: int = 100, replay_data: Optional[ReplayBufferSamples] = None, From aa8b4eb22a8157917f89ba1352186409311028fe Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 13 Feb 2020 13:46:22 +0100 Subject: [PATCH 4/9] Reformat and type the distributions --- torchy_baselines/common/base_class.py | 9 +- torchy_baselines/common/distributions.py | 114 ++++++++++++--------- torchy_baselines/common/results_plotter.py | 4 +- torchy_baselines/ppo/policies.py | 2 + 4 files changed, 75 insertions(+), 54 deletions(-) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 2348d0e10..c0951cbbf 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -225,13 +225,14 @@ def get_vec_normalize_env(self) -> Optional[VecNormalize]: return self._vec_normalize_env @staticmethod - def check_env(env, observation_space: gym.spaces.Space, action_space: gym.spaces.Space) -> bool: + def check_env(env: GymEnv, observation_space: gym.spaces.Space, action_space: gym.spaces.Space) -> bool: """ Checks the validity of the environment and returns if it is consistent. 
Checked parameters: - observation_space - action_space + :param env: (GymEnv) :param observation_space: (gym.spaces.Space) :param action_space: (gym.spaces.Space) :return: (bool) True if environment seems to be coherent @@ -404,7 +405,6 @@ def predict(self, observation: np.ndarray, # return clipped_actions, state return clipped_actions - @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs): """ @@ -774,6 +774,7 @@ class OffPolicyRLModel(BaseRLModel): :param use_sde_at_warmup: (bool) Whether to use SDE instead of uniform sampling during the warm up phase (before learning starts) """ + def __init__(self, policy: Type[BasePolicy], env: Union[GymEnv, str], @@ -790,8 +791,8 @@ def __init__(self, use_sde_at_warmup: bool = False): super(OffPolicyRLModel, self).__init__(policy, env, policy_base, policy_kwargs, verbose, - device, support_multi_env, create_eval_env, monitor_wrapper, - seed, use_sde, sde_sample_freq) + device, support_multi_env, create_eval_env, monitor_wrapper, + seed, use_sde, sde_sample_freq) # For SDE only self.rollout_data = None self.on_policy_exploration = False diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index 451535c35..a7f219ac8 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -1,5 +1,6 @@ -from typing import Optional +from typing import Optional, Tuple, Dict, Any +import gym import torch as th import torch.nn as nn from torch.distributions import Normal, Categorical @@ -45,14 +46,15 @@ class DiagGaussianDistribution(Distribution): :param action_dim: (int) Number of continuous actions """ - def __init__(self, action_dim): + def __init__(self, action_dim: int): super(DiagGaussianDistribution, self).__init__() self.distribution = None self.action_dim = action_dim self.mean_actions = None self.log_std = None - def proba_distribution_net(self, latent_dim, log_std_init=0.0): + def proba_distribution_net(self, latent_dim: int, + log_std_init: float = 0.0) -> Tuple[nn.Module, nn.Parameter]: """ Create the layers and parameter that represent the distribution: one output will be the mean of the gaussian, the other parameter will be the @@ -64,10 +66,12 @@ def proba_distribution_net(self, latent_dim, log_std_init=0.0): """ mean_actions = nn.Linear(latent_dim, self.action_dim) # TODO: allow action dependent std - log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init) + log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init, requires_grad=True) return mean_actions, log_std - def proba_distribution(self, mean_actions, log_std, deterministic=False): + def proba_distribution(self, mean_actions: th.Tensor, + log_std: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, 'DiagGaussianDistribution']: """ Create and sample for the distribution given its parameters (mean, std) @@ -84,29 +88,29 @@ def proba_distribution(self, mean_actions, log_std, deterministic=False): action = self.sample() return action, self - def mode(self): + def mode(self) -> th.Tensor: return self.distribution.mean - def sample(self): + def sample(self) -> th.Tensor: return self.distribution.rsample() - def entropy(self): + def entropy(self) -> th.Tensor: return self.distribution.entropy() - def log_prob_from_params(self, mean_actions, log_std): + def log_prob_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: """ Compute the log probabilty of taking an action given the distribution parameters. 
:param mean_actions: (th.Tensor) :param log_std: (th.Tensor) - :return: (th.Tensor, th.Tensor) + :return: (Tuple[th.Tensor, th.Tensor]) """ action, _ = self.proba_distribution(mean_actions, log_std) log_prob = self.log_prob(action) return action, log_prob - def log_prob(self, action): + def log_prob(self, action: th.Tensor) -> th.Tensor: """ Get the log probabilty of an action given a distribution. Note that you must call `proba_distribution()` method @@ -132,7 +136,7 @@ class SquashedDiagGaussianDistribution(DiagGaussianDistribution): :param epsilon: (float) small value to avoid NaN due to numerical imprecision. """ - def __init__(self, action_dim, epsilon=1e-6): + def __init__(self, action_dim: int, epsilon: float = 1e-6): super(SquashedDiagGaussianDistribution, self).__init__(action_dim) # Avoid NaN (prevents division by zero or log of zero) self.epsilon = epsilon @@ -143,26 +147,26 @@ def proba_distribution(self, mean_actions, log_std, deterministic=False): deterministic) return action, self - def mode(self): + def mode(self) -> th.Tensor: self.gaussian_action = self.distribution.mean # Squash the output return th.tanh(self.gaussian_action) - def entropy(self): + def entropy(self) -> Optional[th.Tensor]: # No analytical form, # entropy needs to be estimated using -log_prob.mean() return None - def sample(self): + def sample(self) -> th.Tensor: self.gaussian_action = self.distribution.rsample() return th.tanh(self.gaussian_action) - def log_prob_from_params(self, mean_actions, log_std): + def log_prob_from_params(self, mean_actions, log_std) -> Tuple[th.Tensor, th.Tensor]: action, _ = self.proba_distribution(mean_actions, log_std) log_prob = self.log_prob(action, self.gaussian_action) return action, log_prob - def log_prob(self, action, gaussian_action=None): + def log_prob(self, action: th.Tensor, gaussian_action: Optional[th.Tensor] = None) -> th.Tensor: # Inverse tanh # Naive implementation (not stable): 0.5 * torch.log((1 + x) / (1 - x)) # We use numpy to avoid numerical instability @@ -185,12 +189,12 @@ class CategoricalDistribution(Distribution): :param action_dim: (int) Number of discrete actions """ - def __init__(self, action_dim): + def __init__(self, action_dim: int): super(CategoricalDistribution, self).__init__() self.distribution = None self.action_dim = action_dim - def proba_distribution_net(self, latent_dim): + def proba_distribution_net(self, latent_dim: int) -> nn.Module: """ Create the layer that represents the distribution: it will be the logits of the Categorical distribution. 
@@ -202,7 +206,8 @@ def proba_distribution_net(self, latent_dim): action_logits = nn.Linear(latent_dim, self.action_dim) return action_logits - def proba_distribution(self, action_logits, deterministic=False): + def proba_distribution(self, action_logits: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, 'CategoricalDistribution']: self.distribution = Categorical(logits=action_logits) if deterministic: action = self.mode() @@ -210,21 +215,21 @@ def proba_distribution(self, action_logits, deterministic=False): action = self.sample() return action, self - def mode(self): + def mode(self) -> th.Tensor: return th.argmax(self.distribution.probs, dim=1) - def sample(self): + def sample(self) -> th.Tensor: return self.distribution.sample() - def entropy(self): + def entropy(self) -> th.Tensor: return self.distribution.entropy() - def log_prob_from_params(self, action_logits): + def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: action, _ = self.proba_distribution(action_logits) log_prob = self.log_prob(action) return action, log_prob - def log_prob(self, action): + def log_prob(self, action: th.Tensor) -> th.Tensor: log_prob = self.distribution.log_prob(action) return log_prob @@ -249,8 +254,12 @@ class StateDependentNoiseDistribution(Distribution): :param epsilon: (float) small value to avoid NaN due to numerical imprecision. """ - def __init__(self, action_dim, full_std=True, use_expln=False, - squash_output=False, learn_features=False, epsilon=1e-6): + def __init__(self, action_dim: int, + full_std: bool = True, + use_expln: bool = False, + squash_output: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6): super(StateDependentNoiseDistribution, self).__init__() self.distribution = None self.action_dim = action_dim @@ -269,7 +278,7 @@ def __init__(self, action_dim, full_std=True, use_expln=False, else: self.bijector = None - def get_std(self, log_std): + def get_std(self, log_std: th.Tensor) -> th.Tensor: """ Get the standard deviation from the learned parameter (log of it by default). This ensures that the std is positive. @@ -294,7 +303,7 @@ def get_std(self, log_std): # Reduce the number of parameters: return th.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std - def sample_weights(self, log_std, batch_size=1): + def sample_weights(self, log_std: th.Tensor, batch_size: int = 1) -> None: """ Sample weights for the noise exploration matrix, using a centered Gaussian distribution. 
@@ -307,7 +316,8 @@ def sample_weights(self, log_std, batch_size=1): self.exploration_mat = self.weights_dist.rsample() self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - def proba_distribution_net(self, latent_dim, log_std_init=-2.0, latent_sde_dim=None): + def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, + latent_sde_dim: Optional[th.Tensor] = None) -> Tuple[nn.Module, nn.Parameter]: """ Create the layers and parameter that represent the distribution: one output will be the deterministic action, the other parameter will be the @@ -327,12 +337,15 @@ def proba_distribution_net(self, latent_dim, log_std_init=-2.0, latent_sde_dim=N # Reduce the number of parameters if needed log_std = th.ones(self.latent_sde_dim, self.action_dim) if self.full_std else th.ones(self.latent_sde_dim, 1) # Transform it to a parameter so it can be optimized - log_std = nn.Parameter(log_std * log_std_init) + log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) # Sample an exploration matrix self.sample_weights(log_std) return mean_actions_net, log_std - def proba_distribution(self, mean_actions, log_std, latent_sde, deterministic=False): + def proba_distribution(self, mean_actions: th.Tensor, + log_std: th.Tensor, + latent_sde: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, 'StateDependentNoiseDistribution']: """ Create and sample for the distribution given its parameters (mean, std) @@ -340,7 +353,7 @@ def proba_distribution(self, mean_actions, log_std, latent_sde, deterministic=Fa :param log_std: (th.Tensor) :param latent_sde: (th.Tensor) :param deterministic: (bool) - :return: (th.Tensor) + :return: (Tuple[th.Tensor, Distribution]) """ # Stop gradient if we don't want to influence the features latent_sde = latent_sde if self.learn_features else latent_sde.detach() @@ -353,13 +366,13 @@ def proba_distribution(self, mean_actions, log_std, latent_sde, deterministic=Fa action = self.sample(latent_sde) return action, self - def mode(self): + def mode(self) -> th.Tensor: action = self.distribution.mean if self.bijector is not None: return self.bijector.forward(action) return action - def get_noise(self, latent_sde): + def get_noise(self, latent_sde: th.Tensor) -> th.Tensor: latent_sde = latent_sde if self.learn_features else latent_sde.detach() # Default case: only one exploration matrix if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): @@ -371,26 +384,28 @@ def get_noise(self, latent_sde): noise = th.bmm(latent_sde, self.exploration_matrices) return noise.squeeze(1) - def sample(self, latent_sde): + def sample(self, latent_sde: th.Tensor) -> th.Tensor: noise = self.get_noise(latent_sde) action = self.distribution.mean + noise if self.bijector is not None: return self.bijector.forward(action) return action - def entropy(self): + def entropy(self) -> Optional[th.Tensor]: # No analytical form, # entropy needs to be estimated using -log_prob.mean() if self.bijector is not None: return None return self.distribution.entropy() - def log_prob_from_params(self, mean_actions, log_std, latent_sde): + def log_prob_from_params(self, mean_actions: th.Tensor, + log_std: th.Tensor, + latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: action, _ = self.proba_distribution(mean_actions, log_std, latent_sde) log_prob = self.log_prob(action) return action, log_prob - def log_prob(self, action): + def log_prob(self, action: th.Tensor) -> th.Tensor: if self.bijector is not None: gaussian_action = self.bijector.inverse(action) else: 
@@ -418,16 +433,16 @@ class TanhBijector(object): :param epsilon: (float) small value to avoid NaN due to numerical imprecision. """ - def __init__(self, epsilon=1e-6): + def __init__(self, epsilon: float = 1e-6): super(TanhBijector, self).__init__() self.epsilon = epsilon @staticmethod - def forward(x): + def forward(x: th.Tensor) -> th.Tensor: return th.tanh(x) @staticmethod - def atanh(x): + def atanh(x: th.Tensor) -> th.Tensor: """ Inverse of Tanh @@ -437,7 +452,7 @@ def atanh(x): return 0.5 * (x.log1p() - (-x).log1p()) @staticmethod - def inverse(y): + def inverse(y: th.Tensor) -> th.Tensor: """ Inverse tanh. @@ -448,19 +463,21 @@ def inverse(y): # Clip the action to avoid NaN return TanhBijector.atanh(y.clamp(min=-1. + eps, max=1. - eps)) - def log_prob_correction(self, x): + def log_prob_correction(self, x: th.Tensor) -> th.Tensor: # Squash correction (from original SAC implementation) return th.log(1.0 - th.tanh(x) ** 2 + self.epsilon) -def make_proba_distribution(action_space, use_sde=False, dist_kwargs=None): +def make_proba_distribution(action_space: gym.spaces.Space, + use_sde: bool = False, + dist_kwargs: Optional[Dict[str, Any]] = None) -> Distribution: """ Return an instance of Distribution for the correct type of action space - :param action_space: (Gym Space) the input action space + :param action_space: (gym.spaces.Space) the input action space :param use_sde: (bool) Force the use of StateDependentNoiseDistribution instead of DiagGaussianDistribution - :param dist_kwargs: (dict) Keyword arguments to pass to the probabilty distribution + :param dist_kwargs: (Optional[Dict[str, Any]]) Keyword arguments to pass to the probabilty distribution :return: (Distribution) the approriate Distribution object """ if dist_kwargs is None: @@ -478,5 +495,6 @@ def make_proba_distribution(action_space, use_sde=False, dist_kwargs=None): # elif isinstance(action_space, spaces.MultiBinary): # return BernoulliDistribution(action_space.n, **dist_kwargs) else: - raise NotImplementedError(f"Error: probability distribution, not implemented for action space of type {type(action_space)}." + raise NotImplementedError("Error: probability distribution, not implemented for action space" + f"of type {type(action_space)}." 
" Must be of type Gym Spaces: Box, Discrete, MultiDiscrete or MultiBinary.") diff --git a/torchy_baselines/common/results_plotter.py b/torchy_baselines/common/results_plotter.py index a5464dadb..6dfe0e077 100644 --- a/torchy_baselines/common/results_plotter.py +++ b/torchy_baselines/common/results_plotter.py @@ -2,12 +2,12 @@ import numpy as np import pandas as pd -import matplotlib +# import matplotlib +# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode import matplotlib.pyplot as plt from torchy_baselines.common.monitor import load_results -# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode plt.rcParams['svg.fonttype'] = 'none' X_TIMESTEPS = 'timesteps' diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 1e3492660..d303a6672 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -161,6 +161,8 @@ def _get_action_dist_from_latent(self, latent_pi, latent_sde=None, deterministic elif isinstance(self.action_dist, StateDependentNoiseDistribution): return self.action_dist.proba_distribution(mean_actions, self.log_std, latent_sde, deterministic=deterministic) + else: + raise ValueError('Invalid action distribution') def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: latent_pi, _, latent_sde = self._get_latent(observation) From a2b1bf06d36bbf2dd9101e7910abb4d4d5d1090e Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 11:12:07 +0100 Subject: [PATCH 5/9] Add `squash_output` attribute to policy --- torchy_baselines/common/base_class.py | 11 +++-- torchy_baselines/common/callbacks.py | 8 ++-- torchy_baselines/common/policies.py | 59 +++++++++++++++++---------- torchy_baselines/ppo/policies.py | 2 +- torchy_baselines/sac/policies.py | 2 +- torchy_baselines/td3/policies.py | 8 ++-- 6 files changed, 53 insertions(+), 37 deletions(-) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index c0951cbbf..eabc7f1ae 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -86,7 +86,7 @@ def __init__(self, self.num_timesteps = 0 self.eval_env = None self.seed = seed - self.action_noise = None # type: ActionNoise + self.action_noise = None # type: Optional[ActionNoise] self.start_time = None self.policy = None self.learning_rate = None @@ -97,8 +97,8 @@ def __init__(self, # this is used to update the learning rate self._current_progress = 1 # Buffers for logging - self.ep_info_buffer = None # type: deque - self.ep_success_buffer = None # type: deque + self.ep_info_buffer = None # type: Optional[deque] + self.ep_success_buffer = None # type: Optional[deque] # Create and wrap the env if needed if env is not None: @@ -387,13 +387,12 @@ def predict(self, observation: np.ndarray, actions = actions.cpu().numpy() # Rescale to proper domain when using squashing - # TODO: should not be used for a Gaussian distribution? 
- if isinstance(self.action_space, gym.spaces.Box): + if isinstance(self.action_space, gym.spaces.Box) and self.policy.squash_output: actions = self.unscale_action(actions) clipped_actions = actions # Clip the actions to avoid out of bound error when using gaussian distribution - if isinstance(self.action_space, gym.spaces.Box): + if isinstance(self.action_space, gym.spaces.Box) and not self.policy.squash_output: clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) if not vectorized_env: diff --git a/torchy_baselines/common/callbacks.py b/torchy_baselines/common/callbacks.py index 8c0d108e2..392716c96 100644 --- a/torchy_baselines/common/callbacks.py +++ b/torchy_baselines/common/callbacks.py @@ -22,14 +22,14 @@ class BaseCallback(ABC): """ def __init__(self, verbose: int = 0): super(BaseCallback, self).__init__() - self.model = None # type: BaseRLModel + self.model = None # type: Optional[BaseRLModel] self.training_env = None # type: Union[gym.Env, VecEnv, None] self.n_calls = 0 # type: int self.num_timesteps = 0 # type: int self.verbose = verbose - self.locals = None # type: Dict[str, Any] - self.globals = None # type: Dict[str, Any] - self.logger = None # type: Logger + self.locals = None # type: Optional[Dict[str, Any]] + self.globals = None # type: Optional[Dict[str, Any]] + self.logger = None # type: Optional[Logger] # Sometimes, for event callback, it is useful # to have access to the parent object self.parent = None # type: Optional[BaseCallback] diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index 1e0064dca..abab7d845 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Type, Dict, List, Tuple from itertools import zip_longest @@ -14,14 +14,24 @@ class BasePolicy(nn.Module): :param observation_space: (gym.spaces.Space) The observation space of the environment :param action_space: (gym.spaces.Space) The action space of the environment + :param device: (Union[th.device, str]) Device on which the code should run. + :param squash_output: (bool) For continuous actions, whether the output is squashed + or not using a `tanh()` function. """ - def __init__(self, observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, device: Union[th.device, str] = 'cpu'): + action_space: gym.spaces.Space, + device: Union[th.device, str] = 'cpu', + squash_output: bool = False): super(BasePolicy, self).__init__() self.observation_space = observation_space self.action_space = action_space self.device = device + self._squash_output = squash_output + + @property + def squash_output(self) -> bool: + """ (bool) Getter for squash_output.""" + return self._squash_output @staticmethod def init_weights(module: nn.Module, gain: float = 1): @@ -71,21 +81,25 @@ def parameters_to_vector(self) -> np.ndarray: return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() -def create_mlp(input_dim, output_dim, net_arch, - activation_fn=nn.ReLU, squash_out=False): +def create_mlp(input_dim: int, + output_dim: int, + net_arch: List[int], + activation_fn: nn.Module = nn.ReLU, + squash_output: bool = False) -> List[nn.Module]: """ Create a multi layer perceptron (MLP), which is a collection of fully-connected layers each followed by an activation function. 
:param input_dim: (int) Dimension of the input vector :param output_dim: (int) - :param net_arch: ([int]) Architecture of the neural net + :param net_arch: (List[int]) Architecture of the neural net It represents the number of units per layer. The length of this list is the number of layers. - :param activation_fn: (th.nn.Module) The activation function + :param activation_fn: (nn.Module) The activation function to use after each layer. - :param squash_out: (bool) Whether to squash the output using a Tanh + :param squash_output: (bool) Whether to squash the output using a Tanh activation function + :return: (List[nn.Module]) """ if len(net_arch) > 0: @@ -99,12 +113,14 @@ def create_mlp(input_dim, output_dim, net_arch, if output_dim > 0: modules.append(nn.Linear(net_arch[-1], output_dim)) - if squash_out: + if squash_output: modules.append(nn.Tanh()) return modules -def create_sde_feature_extractor(features_dim, sde_net_arch, activation_fn): +def create_sde_feature_extractor(features_dim: int, + sde_net_arch: List[int], + activation_fn: nn.Module) -> Tuple[nn.Sequential, int]: """ Create the neural network that will be used to extract features for the SDE. @@ -117,7 +133,7 @@ def create_sde_feature_extractor(features_dim, sde_net_arch, activation_fn): # Special case: when using states as features (i.e. sde_net_arch is an empty list) # don't use any activation function sde_activation = activation_fn if len(sde_net_arch) > 0 else None - latent_sde_net = create_mlp(features_dim, -1, sde_net_arch, activation_fn=sde_activation, squash_out=False) + latent_sde_net = create_mlp(features_dim, -1, sde_net_arch, activation_fn=sde_activation, squash_output=False) latent_sde_dim = sde_net_arch[-1] if len(sde_net_arch) > 0 else features_dim sde_feature_extractor = nn.Sequential(*latent_sde_net) return sde_feature_extractor, latent_sde_dim @@ -131,7 +147,7 @@ class BaseNetwork(nn.Module): def __init__(self): super(BaseNetwork, self).__init__() - def load_from_vector(self, vector): + def load_from_vector(self, vector: np.ndarray): """ Load parameters from a 1D vector. @@ -140,7 +156,7 @@ def load_from_vector(self, vector): device = next(self.parameters()).device th.nn.utils.vector_to_parameters(th.FloatTensor(vector).to(device), self.parameters()) - def parameters_to_vector(self): + def parameters_to_vector(self) -> np.ndarray: """ Convert the parameters to a 1D vector. 
@@ -149,16 +165,16 @@ def parameters_to_vector(self): return th.nn.utils.parameters_to_vector(self.parameters()).detach().cpu().numpy() -_policy_registry = dict() +_policy_registry = dict() # type: Dict[Type[BasePolicy], Dict[str, Type[BasePolicy]]] -def get_policy_from_name(base_policy_type, name): +def get_policy_from_name(base_policy_type: Type[BasePolicy], name: str) -> Type[BasePolicy]: """ - returns the registed policy from the base type and name + Returns the registered policy from the base type and name - :param base_policy_type: (BasePolicy) the base policy object + :param base_policy_type: (Type[BasePolicy]) the base policy class :param name: (str) the policy name - :return: (base_policy_type) the policy + :return: (Type[BasePolicy]) the policy """ if base_policy_type not in _policy_registry: raise ValueError(f"Error: the policy type {base_policy_type} is not registered!") @@ -168,12 +184,13 @@ def get_policy_from_name(base_policy_type, name): return _policy_registry[base_policy_type][name] -def register_policy(name, policy): +def register_policy(name: str, policy: Type[BasePolicy]) -> None: """ - returns the registed policy from the base type and name + Register a policy, so it can be called using its name. + e.g. SAC('MlpPolicy', ...) instead of SAC(MlpPolicy, ...) :param name: (str) the policy name - :param policy: (subclass of BasePolicy) the policy + :param policy: (Type[BasePolicy]) the policy class """ sub_class = None # For building the doc diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index d303a6672..3e47375d5 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -41,7 +41,7 @@ def __init__(self, observation_space, action_space, ortho_init=True, use_sde=False, log_std_init=0.0, full_std=True, sde_net_arch=None, use_expln=False, squash_output=False): - super(PPOPolicy, self).__init__(observation_space, action_space, device) + super(PPOPolicy, self).__init__(observation_space, action_space, device, squash_output=squash_output) self.obs_dim = self.observation_space.shape[0] # Default network architecture, from stable-baselines diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 3fbea937a..4f9673840 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -200,7 +200,7 @@ def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.ReLU, use_sde=False, log_std_init=-3, sde_net_arch=None, use_expln=False): - super(SACPolicy, self).__init__(observation_space, action_space, device) + super(SACPolicy, self).__init__(observation_space, action_space, device, squash_output=True) if net_arch is None: net_arch = [256, 256] diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index fa199521e..f75cdea68 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -52,7 +52,7 @@ def __init__(self, self.sde_feature_extractor = None if use_sde: - latent_pi_net = create_mlp(obs_dim, -1, net_arch, activation_fn, squash_out=False) + latent_pi_net = create_mlp(obs_dim, -1, net_arch, activation_fn, squash_output=False) self.latent_pi = nn.Sequential(*latent_pi_net) latent_sde_dim = net_arch[-1] learn_features = sde_net_arch is not None @@ -74,7 +74,7 @@ def __init__(self, self.sde_optimizer = th.optim.Adam([self.log_std], lr=lr_sde) self.reset_noise() else: - actor_net = create_mlp(obs_dim, action_dim, net_arch, activation_fn, squash_out=True) + actor_net 
= create_mlp(obs_dim, action_dim, net_arch, activation_fn, squash_output=True) self.mu = nn.Sequential(*actor_net) def get_std(self) -> torch.Tensor: @@ -134,7 +134,7 @@ def forward(self, obs: torch.Tensor, deterministic: bool = True) -> torch.Tensor if self.clip_noise is not None: noise = th.clamp(noise, -self.clip_noise, self.clip_noise) # TODO: Replace with squashing -> need to account for that in the sde update - # -> set squash_out=True in the action_dist? + # -> set squash_output=True in the action_dist? # NOTE: the clipping is done in the rollout for now return self.mu(latent_pi) + noise # action, _ = self._get_action_dist_from_latent(latent_pi) @@ -215,7 +215,7 @@ def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.ReLU, use_sde=False, log_std_init=-3, clip_noise=None, lr_sde=3e-4, sde_net_arch=None, use_expln=False): - super(TD3Policy, self).__init__(observation_space, action_space, device) + super(TD3Policy, self).__init__(observation_space, action_space, device, squash_output=True) # Default network architecture, from the original paper if net_arch is None: From 8b559d71ab7eca97790be7263ab3c827004d430b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 13:42:16 +0100 Subject: [PATCH 6/9] Remove deprecated monitor format and improve tests --- docs/misc/changelog.rst | 1 + tests/test_monitor.py | 46 ++++++++++++++++++---------- torchy_baselines/common/monitor.py | 48 ++++++++++-------------------- 3 files changed, 47 insertions(+), 48 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 02bf897b5..ea784b619 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -12,6 +12,7 @@ Breaking Changes: - Return type of `evaluation.evaluate_policy()` has been changed - Refactored the replay buffer to avoid transformation between PyTorch and NumPy - Created `OffPolicyRLModel` base class +- Remove deprecated JSON format for `Monitor` New Features: ^^^^^^^^^^^^^ diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 141b2c0d4..d783f2554 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -8,33 +8,47 @@ from torchy_baselines.common.monitor import Monitor, get_monitor_files, load_results -def test_monitor(): +def test_monitor(tmp_path): """ test the monitor wrapper """ env = gym.make("CartPole-v1") env.seed(0) - monitor_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()) + monitor_file = os.path.join(str(tmp_path), "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())) monitor_env = Monitor(env, monitor_file) monitor_env.reset() - for _ in range(1000): - _, _, done, _ = monitor_env.step(0) + total_steps = 1000 + ep_rewards = [] + ep_lengths = [] + ep_len, ep_reward = 0, 0 + for _ in range(total_steps): + _, reward, done, _ = monitor_env.step(0) + ep_len += 1 + ep_reward += reward if done: + ep_rewards.append(ep_reward) + ep_lengths.append(ep_len) monitor_env.reset() - - file_handler = open(monitor_file, 'rt') - - first_line = file_handler.readline() - assert first_line.startswith('#') - metadata = json.loads(first_line[1:]) - assert metadata['env_id'] == "CartPole-v1" - assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata" - - last_logline = pandas.read_csv(file_handler, index_col=None) - assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" - file_handler.close() + ep_len, ep_reward = 0, 0 + + monitor_env.close() + assert monitor_env.get_total_steps() == 
total_steps + assert sum(ep_lengths) == sum(monitor_env.get_episode_lengths()) + assert sum(monitor_env.get_episode_rewards()) == sum(ep_rewards) + _ = monitor_env.get_episode_times() + + with open(monitor_file, 'rt') as file_handler: + first_line = file_handler.readline() + assert first_line.startswith('#') + metadata = json.loads(first_line[1:]) + assert metadata['env_id'] == "CartPole-v1" + assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata" + + last_logline = pandas.read_csv(file_handler, index_col=None) + assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" os.remove(monitor_file) + def test_monitor_load_results(tmp_path): """ test load_results on log files produced by the monitor wrapper diff --git a/torchy_baselines/common/monitor.py b/torchy_baselines/common/monitor.py index 88bea456a..3d84b9ba9 100644 --- a/torchy_baselines/common/monitor.py +++ b/torchy_baselines/common/monitor.py @@ -14,22 +14,21 @@ class Monitor(gym.Wrapper): EXT = "monitor.csv" - file_handler = None def __init__(self, env: gym.Env, filename: Optional[str] = None, allow_early_resets: bool = True, - reset_keywords=(), - info_keywords=()): + reset_keywords: Tuple[str, ...] = (), + info_keywords: Tuple[str, ...] = ()): """ A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. :param env: (gym.Env) The environment :param filename: (Optional[str]) the location to save a log file, can be None for no log :param allow_early_resets: (bool) allows the reset of the environment before it is done - :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset - :param info_keywords: (tuple) extra information to log, from the information return of environment.step + :param reset_keywords: (Tuple[str, ...]) extra keywords for the reset call, if extra parameters are needed at reset + :param info_keywords: (Tuple[str, ...]) extra information to log, from the information return of environment.step """ super(Monitor, self).__init__(env=env) self.t_start = time.time() @@ -93,12 +92,12 @@ def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, Dict[Any, A if done: self.needs_reset = True ep_rew = sum(self.rewards) - eplen = len(self.rewards) - ep_info = {"r": round(ep_rew, 6), "l": eplen, "t": round(time.time() - self.t_start, 6)} + ep_len = len(self.rewards) + ep_info = {"r": round(ep_rew, 6), "l": ep_len, "t": round(time.time() - self.t_start, 6)} for key in self.info_keywords: ep_info[key] = info[key] self.episode_rewards.append(ep_rew) - self.episode_lengths.append(eplen) + self.episode_lengths.append(ep_len) self.episode_times.append(time.time() - self.t_start) ep_info.update(self.current_reset_info) if self.logger: @@ -168,41 +167,26 @@ def get_monitor_files(path: str) -> List[str]: def load_results(path: str) -> pandas.DataFrame: """ - Load all Monitor logs from a given directory path matching ``*monitor.csv`` and ``*monitor.json`` + Load all Monitor logs from a given directory path matching ``*monitor.csv`` :param path: (str) the directory path containing the log file(s) :return: (pandas.DataFrame) the logged data """ - # get both csv and (old) json files - monitor_files = (glob(os.path.join(path, "*monitor.json")) + get_monitor_files(path)) - if not monitor_files: + monitor_files = get_monitor_files(path) + if len(monitor_files) == 0: raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, path)) - 
data_frames = [] - headers = [] + data_frames, headers = [], [] for file_name in monitor_files: with open(file_name, 'rt') as file_handler: - if file_name.endswith('csv'): - first_line = file_handler.readline() - assert first_line[0] == '#' - header = json.loads(first_line[1:]) - data_frame = pandas.read_csv(file_handler, index_col=None) - headers.append(header) - elif file_name.endswith('json'): # Deprecated json format - episodes = [] - lines = file_handler.readlines() - header = json.loads(lines[0]) - headers.append(header) - for line in lines[1:]: - episode = json.loads(line) - episodes.append(episode) - data_frame = pandas.DataFrame(episodes) - else: - assert 0, 'unreachable' + first_line = file_handler.readline() + assert first_line[0] == '#' + header = json.loads(first_line[1:]) + data_frame = pandas.read_csv(file_handler, index_col=None) + headers.append(header) data_frame['t'] += header['t_start'] data_frames.append(data_frame) data_frame = pandas.concat(data_frames) data_frame.sort_values('t', inplace=True) data_frame.reset_index(inplace=True) data_frame['t'] -= min(header['t_start'] for header in headers) - # data_frame.headers = headers # HACK to preserve backwards compatibility return data_frame From e31b139c470c21041ebfe841480e47d18ee89c55 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 14:03:41 +0100 Subject: [PATCH 7/9] Add test for predict method --- tests/test_predict.py | 55 +++++++++++++++++++++++++++ tests/test_run.py | 12 ------ tests/test_save_load.py | 1 + torchy_baselines/common/base_class.py | 37 +++++++++--------- 4 files changed, 75 insertions(+), 30 deletions(-) create mode 100644 tests/test_predict.py diff --git a/tests/test_predict.py b/tests/test_predict.py new file mode 100644 index 000000000..e75deec06 --- /dev/null +++ b/tests/test_predict.py @@ -0,0 +1,55 @@ +import gym +import pytest + +from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3 +from torchy_baselines.common.vec_env import DummyVecEnv + +MODEL_LIST = [ + CEMRL, + PPO, + A2C, + TD3, + SAC, +] + +@pytest.mark.parametrize("model_class", MODEL_LIST) +def test_auto_wrap(model_class): + # test auto wrapping of env into a VecEnv + env = gym.make('Pendulum-v0') + eval_env = gym.make('Pendulum-v0') + model = model_class('MlpPolicy', env) + model.learn(100, eval_env=eval_env) + + +@pytest.mark.parametrize("model_class", MODEL_LIST) +def test_predict(model_class): + # test detection of different shapes by the predict method + model = model_class('MlpPolicy', 'Pendulum-v0') + env = gym.make('Pendulum-v0') + vec_env = DummyVecEnv([lambda: gym.make('Pendulum-v0'), lambda: gym.make('Pendulum-v0')]) + + obs = env.reset() + action = model.predict(obs) + assert action.shape == env.action_space.shape + assert env.action_space.contains(action) + + vec_env_obs = vec_env.reset() + action = model.predict(vec_env_obs) + assert action.shape[0] == vec_env_obs.shape[0] + + +@pytest.mark.parametrize("model_class", [A2C, PPO]) +def test_predict_discrete(model_class): + # test detection of different shapes by the predict method + model = model_class('MlpPolicy', 'CartPole-v1') + env = gym.make('CartPole-v1') + vec_env = DummyVecEnv([lambda: gym.make('CartPole-v1'), lambda: gym.make('CartPole-v1')]) + + obs = env.reset() + action = model.predict(obs) + assert action.shape == () + assert env.action_space.contains(action) + + vec_env_obs = vec_env.reset() + action = model.predict(vec_env_obs) + assert action.shape[0] == vec_env_obs.shape[0] diff --git a/tests/test_run.py b/tests/test_run.py index 
1d206c9b8..fdfcff6da 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -14,18 +14,12 @@ def test_td3(action_noise): model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") def test_cemrl(): model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1, learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") @pytest.mark.parametrize("model_class", [A2C, PPO]) @@ -33,9 +27,6 @@ def test_cemrl(): def test_onpolicy(model_class, env_id): model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") @pytest.mark.parametrize("ent_coef", ['auto', 0.01]) @@ -44,6 +35,3 @@ def test_sac(ent_coef): learning_starts=100, verbose=1, create_eval_env=True, ent_coef=ent_coef, action_noise=NormalActionNoise(np.zeros(1), np.zeros(1))) model.learn(total_timesteps=1000, eval_freq=500) - model.save("test_save") - model.load("test_save") - os.remove("test_save.zip") diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 45c0ac5fc..9d73ddf74 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -134,6 +134,7 @@ def test_exclude_include_saved_params(model_class): # clear file from os os.remove("test_save.zip") + @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_save_load_replay_buffer(model_class): log_folder = 'logs' diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index eabc7f1ae..15da3bb6b 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -335,24 +335,25 @@ def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.s else: raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") - elif isinstance(observation_space, gym.spaces.MultiDiscrete): - if observation.shape == (len(observation_space.nvec),): - return False - elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): - return True - else: - raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + - "environment, please use ({},) or ".format(len(observation_space.nvec)) + - "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) - elif isinstance(observation_space, gym.spaces.MultiBinary): - if observation.shape == (observation_space.n,): - return False - elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + - "environment, please use ({},) or ".format(observation_space.n) + - "(n_env, {}) for the observation shape.".format(observation_space.n)) + # TODO: add support for MultiDiscrete and MultiBinary action spaces + # elif isinstance(observation_space, gym.spaces.MultiDiscrete): + # if observation.shape == (len(observation_space.nvec),): + # 
return False + # elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): + # return True + # else: + # raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + + # "environment, please use ({},) or ".format(len(observation_space.nvec)) + + # "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) + # elif isinstance(observation_space, gym.spaces.MultiBinary): + # if observation.shape == (observation_space.n,): + # return False + # elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: + # return True + # else: + # raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + + # "environment, please use ({},) or ".format(observation_space.n) + + # "(n_env, {}) for the observation shape.".format(observation_space.n)) else: raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." .format(observation_space)) From 4392759057958520e504bca407dc249dd5db8f33 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 14 Feb 2020 14:15:55 +0100 Subject: [PATCH 8/9] Comment unused code --- tests/test_predict.py | 29 ++++++++------------------- torchy_baselines/common/base_class.py | 18 ++++++++--------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/tests/test_predict.py b/tests/test_predict.py index e75deec06..6f2245ce8 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -22,32 +22,19 @@ def test_auto_wrap(model_class): @pytest.mark.parametrize("model_class", MODEL_LIST) -def test_predict(model_class): - # test detection of different shapes by the predict method - model = model_class('MlpPolicy', 'Pendulum-v0') - env = gym.make('Pendulum-v0') - vec_env = DummyVecEnv([lambda: gym.make('Pendulum-v0'), lambda: gym.make('Pendulum-v0')]) - - obs = env.reset() - action = model.predict(obs) - assert action.shape == env.action_space.shape - assert env.action_space.contains(action) +@pytest.mark.parametrize("env_id", ['Pendulum-v0', 'CartPole-v1']) +def test_predict(model_class, env_id): + if env_id == 'CartPole-v1' and model_class not in [PPO, A2C]: + return - vec_env_obs = vec_env.reset() - action = model.predict(vec_env_obs) - assert action.shape[0] == vec_env_obs.shape[0] - - -@pytest.mark.parametrize("model_class", [A2C, PPO]) -def test_predict_discrete(model_class): # test detection of different shapes by the predict method - model = model_class('MlpPolicy', 'CartPole-v1') - env = gym.make('CartPole-v1') - vec_env = DummyVecEnv([lambda: gym.make('CartPole-v1'), lambda: gym.make('CartPole-v1')]) + model = model_class('MlpPolicy', env_id) + env = gym.make(env_id) + vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)]) obs = env.reset() action = model.predict(obs) - assert action.shape == () + assert action.shape == env.action_space.shape assert env.action_space.contains(action) vec_env_obs = vec_env.reset() diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 15da3bb6b..6b3fb6a52 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -327,15 +327,15 @@ def _is_vectorized_observation(observation: np.ndarray, observation_space: gym.s "Box environment, please use {} ".format(observation_space.shape) + "or (n_env, {}) for the observation shape." 
.format(", ".join(map(str, observation_space.shape))))
-        elif isinstance(observation_space, gym.spaces.Discrete):
-            if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
-                return False
-            elif len(observation.shape) == 1:
-                return True
-            else:
-                raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
-                                 "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
-        # TODO: add support for MultiDiscrete and MultiBinary action spaces
+        # TODO: add support for Discrete, MultiDiscrete and MultiBinary observation spaces
+        # elif isinstance(observation_space, gym.spaces.Discrete):
+        #     if observation.shape == ():  # A numpy array of a number, has shape empty tuple '()'
+        #         return False
+        #     elif len(observation.shape) == 1:
+        #         return True
+        #     else:
+        #         raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) +
+        #                          "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.")
         # elif isinstance(observation_space, gym.spaces.MultiDiscrete):
         #     if observation.shape == (len(observation_space.nvec),):
         #         return False

From af46aa19d1bfc5b99604710fc9062d5ad32129a0 Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Fri, 14 Feb 2020 14:33:41 +0100
Subject: [PATCH 9/9] Add copyright notice

---
 NOTICE | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 NOTICE

diff --git a/NOTICE b/NOTICE
new file mode 100644
index 000000000..9fc6700ee
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,27 @@
+Large portions of the code of Torchy-Baselines (in `common/`) were ported from Stable-Baselines, a fork of OpenAI Baselines,
+both licensed under the MIT License:
+
+before the fork (June 2018):
+Copyright (c) 2017 OpenAI (http://openai.com)
+
+after the fork (June 2018):
+Copyright (c) 2018-2019 Stable-Baselines Team
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
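
Usage sketch for the behaviour covered by the patches above (the log directory, file name and step counts are illustrative, not taken from the patches): with the JSON format removed, `Monitor` only writes `*.monitor.csv` files, `load_results()` aggregates every such file in a directory into a single pandas DataFrame with `r`/`l`/`t` columns, and `predict()` mirrors the shape of its input, returning a single action for a single observation and one action per environment for a vectorized one.

    import os
    import uuid

    import gym

    from torchy_baselines import A2C
    from torchy_baselines.common.monitor import Monitor, load_results
    from torchy_baselines.common.vec_env import DummyVecEnv

    log_dir = "/tmp/torchy_baselines_monitor_example"  # illustrative path
    os.makedirs(log_dir, exist_ok=True)

    # Monitor writes one `*.monitor.csv` file per wrapped environment (CSV format only)
    monitor_file = os.path.join(log_dir, "{}.monitor.csv".format(uuid.uuid4()))
    env = Monitor(gym.make("CartPole-v1"), monitor_file)

    model = A2C("MlpPolicy", env)
    model.learn(total_timesteps=1000)

    # Aggregate all monitor files found in the directory into one DataFrame
    results = load_results(log_dir)
    print(results[["r", "l", "t"]].tail())

    # A single observation yields a single action accepted by the action space
    obs = env.reset()
    action = model.predict(obs)
    assert env.action_space.contains(action)

    # A vectorized observation yields one action per environment
    vec_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    vec_actions = model.predict(vec_env.reset())
    assert vec_actions.shape[0] == 1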