Merge pull request #89 from DLR-RM/base-class-review
Refactor and clean-up of common code
AdamGleave authored Jul 8, 2020
2 parents 3756d05 + bea1e02 commit c39ed39
Showing 9 changed files with 208 additions and 191 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -22,6 +22,7 @@ keys/

# Virtualenv
/env
/venv


*.sublime-project
5 changes: 3 additions & 2 deletions Makefile
@@ -1,4 +1,5 @@
SHELL=/bin/bash
LINT_PATHS=stable_baselines3/ tests/ docs/conf.py setup.py

pytest:
	./scripts/run_tests.sh
@@ -9,9 +10,9 @@ type:
lint:
	# stop the build if there are Python syntax errors or undefined names
	# see https://lintlyci.github.io/Flake8Rules/
	flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
	flake8 ${LINT_PATHS} --count --select=E9,F63,F7,F82 --show-source --statistics
	# exit-zero treats all errors as warnings.
	flake8 . --count --exit-zero --statistics
	flake8 ${LINT_PATHS} --count --exit-zero --statistics

doc:
	cd docs && make html
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -19,7 +19,7 @@
# We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support
# PyEnchant.
try:
    import sphinxcontrib.spelling
    import sphinxcontrib.spelling  # noqa: F401
    enable_spell_check = True
except ImportError:
    enable_spell_check = False
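
The ``# noqa: F401`` marker suppresses flake8's "module imported but unused" warning; the import exists only to probe whether the package is available.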
@@ -129,6 +129,7 @@ def __getattr__(cls, name):
def setup(app):
    app.add_stylesheet("css/baselines_theme.css")


# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
98 changes: 52 additions & 46 deletions stable_baselines3/common/base_class.py
@@ -1,5 +1,7 @@
"""Abstract base classes for RL algorithms."""

import time
from typing import Union, Type, Optional, Dict, Any, List, Tuple, Callable
from typing import Union, Type, Optional, Dict, Any, Iterable, List, Tuple, Callable
from abc import ABC, abstractmethod
from collections import deque
import pathlib
@@ -23,12 +25,30 @@
from stable_baselines3.common.noise import ActionNoise


def maybe_make_env(env: Union[GymEnv, str, None], monitor_wrapper: bool, verbose: int) -> Optional[GymEnv]:
    """If env is a string, make the environment; otherwise, return env.

    :param env: (Union[GymEnv, str, None]) The environment to learn from.
    :param monitor_wrapper: (bool) Whether to wrap env in a Monitor when creating env.
    :param verbose: (int) logging verbosity
    :return: A Gym (vector) environment.
    """
    if isinstance(env, str):
        if verbose >= 1:
            print(f"Creating environment from the given name '{env}'")
        env = gym.make(env)
        if monitor_wrapper:
            env = Monitor(env, filename=None)

    return env
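
A minimal usage sketch of the new helper (illustrative only; the env id below is an assumption, not part of this commit):

    env = maybe_make_env("CartPole-v1", monitor_wrapper=True, verbose=1)
    # prints "Creating environment from the given name 'CartPole-v1'",
    # then returns Monitor(gym.make("CartPole-v1"))

    existing_env = gym.make("CartPole-v1")
    # an env instance or None passes through unchanged
    assert maybe_make_env(existing_env, monitor_wrapper=True, verbose=0) is existing_env
    assert maybe_make_env(None, monitor_wrapper=True, verbose=0) is None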


class BaseAlgorithm(ABC):
    """
    The base of RL algorithms

    :param policy: (Type[BasePolicy]) Policy object
    :param env: (Union[GymEnv, str]) The environment to learn from
    :param env: (Union[GymEnv, str, None]) The environment to learn from
        (if registered in Gym, can be str. Can be None for loading trained models)
    :param policy_base: (Type[BasePolicy]) The base policy used by this method
    :param learning_rate: (float or callable) learning rate for the optimizer,
@@ -54,7 +74,7 @@ class BaseAlgorithm(ABC):

    def __init__(self,
                 policy: Type[BasePolicy],
                 env: Union[GymEnv, str],
                 env: Union[GymEnv, str, None],
                 policy_base: Type[BasePolicy],
                 learning_rate: Union[float, Callable],
                 policy_kwargs: Dict[str, Any] = None,
@@ -116,18 +136,9 @@ def __init__(self,
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    eval_env = gym.make(env)
                    if monitor_wrapper:
                        eval_env = Monitor(eval_env, filename=None)
                    self.eval_env = DummyVecEnv([lambda: eval_env])
                    if self.verbose >= 1:
                        print("Creating environment from the given name, wrapped in a DummyVecEnv.")

                env = gym.make(env)
                if monitor_wrapper:
                    env = Monitor(env, filename=None)
                env = DummyVecEnv([lambda: env])
                    self.eval_env = maybe_make_env(env, monitor_wrapper, self.verbose)

            env = maybe_make_env(env, monitor_wrapper, self.verbose)
            env = self._wrap_env(env)

            self.observation_space = env.observation_space
@@ -136,8 +147,8 @@ def __init__(self,
        self.env = env

        if not support_multi_env and self.n_envs > 1:
            raise ValueError("Error: the model does not support multiple envs requires a single vectorized"
                             " environment.")
            raise ValueError("Error: the model does not support multiple envs; it requires "
                             "a single vectorized environment.")

    def _wrap_env(self, env: GymEnv) -> VecEnv:
        if not isinstance(env, VecEnv):
@@ -153,10 +164,7 @@ def _wrap_env(self, env: GymEnv) -> VecEnv:

    @abstractmethod
    def _setup_model(self) -> None:
        """
        Create networks, buffer and optimizers
        """
        raise NotImplementedError()
        """Create networks, buffer and optimizers."""

    def _get_eval_env(self, eval_env: Optional[GymEnv]) -> Optional[GymEnv]:
        """
Expand Down Expand Up @@ -238,7 +246,7 @@ def set_env(self, env: GymEnv) -> None:

    def get_torch_variables(self) -> Tuple[List[str], List[str]]:
        """
        Get the name of the torch variable that will be saved.
        Get the name of the torch variables that will be saved.
        ``th.save`` and ``th.load`` will be used with the right device
        instead of the default pickling strategy.
@@ -263,18 +271,16 @@ def learn(self, total_timesteps: int,
        Return a trained model.

        :param total_timesteps: (int) The total number of samples (env steps) to train on
        :param callback: (function (dict, dict)) -> boolean function called at every steps with state of the algorithm.
            It takes the local and global variables. If it returns False, training is aborted.
        :param callback: (MaybeCallback) callback(s) called at every step with state of the algorithm.
        :param log_interval: (int) The number of timesteps before logging.
        :param tb_log_name: (str) the name of the run for tensorboard log
        :param tb_log_name: (str) the name of the run for TensorBoard logging
        :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
        :param eval_freq: (int) Evaluate the agent every ``eval_freq`` timesteps (this may vary a little)
        :param n_eval_episodes: (int) Number of episodes to evaluate the agent
        :param eval_log_path: (Optional[str]) Path to a folder where the evaluations will be saved
        :param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging)
        :return: (BaseAlgorithm) the trained model
        """
        raise NotImplementedError()
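
``MaybeCallback`` replaces the inline ``Union`` annotations seen elsewhere in this diff; judging from those annotations, the alias (presumably in ``stable_baselines3.common.type_aliases``) amounts to:

    # Presumed definition, reconstructed from the annotations it replaces
    MaybeCallback = Union[None, Callable, List[BaseCallback], BaseCallback]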

    def predict(self, observation: np.ndarray,
                state: Optional[np.ndarray] = None,
@@ -329,8 +335,6 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> 'BaseAl
        # load parameters
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        if not hasattr(model, "_setup_model") and len(params) > 0:
            raise NotImplementedError(f"{cls} has no ``_setup_model()`` method")
        model._setup_model()

        # put state_dicts back in place
@@ -366,14 +370,18 @@ def set_random_seed(self, seed: Optional[int] = None) -> None:
            self.eval_env.seed(seed)

    def _init_callback(self,
                       callback: Union[None, Callable, List[BaseCallback], BaseCallback],
                       callback: MaybeCallback,
                       eval_env: Optional[VecEnv] = None,
                       eval_freq: int = 10000,
                       n_eval_episodes: int = 5,
                       log_path: Optional[str] = None) -> BaseCallback:
        """
        :param callback: (Union[callable, [BaseCallback], BaseCallback, None])
        :return: (BaseCallback)
        :param callback: (MaybeCallback) Callback(s) called at every step with state of the algorithm.
        :param eval_freq: (Optional[int]) How many steps between evaluations; if None, do not evaluate.
        :param n_eval_episodes: (int) How many episodes to play per evaluation
        :param n_eval_episodes: (int) Number of episodes to rollout during evaluation.
        :param log_path: (Optional[str]) Path to a folder where the evaluations will be saved
        :return: (BaseCallback) A hybrid callback calling `callback` and performing evaluation.
        """
        # Convert a list of callbacks into a callback
        if isinstance(callback, list):
@@ -396,7 +404,7 @@ def _init_callback(self,
    def _setup_learn(self,
                     total_timesteps: int,
                     eval_env: Optional[GymEnv],
                     callback: Union[None, Callable, List[BaseCallback], BaseCallback] = None,
                     callback: MaybeCallback = None,
                     eval_freq: int = 10000,
                     n_eval_episodes: int = 5,
                     log_path: Optional[str] = None,
@@ -407,11 +415,11 @@ def _setup_learn(self,
        Initialize different variables needed for training.

        :param total_timesteps: (int) The total number of samples (env steps) to train on
        :param eval_env: (Optional[GymEnv])
        :param callback: (Union[None, BaseCallback, List[BaseCallback, Callable]])
        :param eval_env: (Optional[VecEnv]) Environment to use for evaluation.
        :param callback: (MaybeCallback) Callback(s) called at every step with state of the algorithm.
        :param eval_freq: (int) How many steps between evaluations
        :param n_eval_episodes: (int) How many episodes to play per evaluation
        :param log_path (Optional[str]): Path to a log folder
        :param log_path: (Optional[str]) Path to a folder where the evaluations will be saved
        :param reset_num_timesteps: (bool) Whether to reset or not the ``num_timesteps`` attribute
        :param tb_log_name: (str) the name of the run for tensorboard log
        :return: (Tuple[int, BaseCallback])
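
Given the signature and the ``Tuple[int, BaseCallback]`` return type above, a hypothetical call site inside a subclass's ``learn()`` could look like:

    # Hypothetical usage; names follow the signature above
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq,
        n_eval_episodes, log_path, reset_num_timesteps, tb_log_name,
    )
    callback.on_training_start(locals(), globals())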
@@ -480,8 +488,8 @@ def excluded_save_params(self) -> List[str]:
    def save(
        self,
        path: Union[str, pathlib.Path, io.BufferedIOBase],
        exclude: Optional[List[str]] = None,
        include: Optional[List[str]] = None,
        exclude: Optional[Iterable[str]] = None,
        include: Optional[Iterable[str]] = None,
    ) -> None:
        """
        Save all the attributes of the object and the model parameters in a zip-file.
@@ -492,29 +500,27 @@ def save(
"""
# copy parameter list so we don't mutate the original dict
data = self.__dict__.copy()
# use standard list of excluded parameters if none given

# Exclude is union of specified parameters (if any) and standard exclusions
if exclude is None:
exclude = self.excluded_save_params()
else:
# append standard exclude params to the given params
exclude.extend([param for param in self.excluded_save_params() if param not in exclude])
exclude = []
exclude = set(exclude).union(self.excluded_save_params())

# do not exclude params if they are specifically included
# Do not exclude params if they are specifically included
if include is not None:
exclude = [param_name for param_name in exclude if param_name not in include]
exclude = exclude.difference(include)

state_dicts_names, tensors_names = self.get_torch_variables()
# any params that are in the save vars must not be saved by data
torch_variables = state_dicts_names + tensors_names
for torch_var in torch_variables:
# we need to get only the name of the top most module as we'll remove that
var_name = torch_var.split('.')[0]
exclude.append(var_name)
exclude.add(var_name)

# Remove parameter entries of parameters which are to be excluded
for param_name in exclude:
if param_name in data:
data.pop(param_name, None)
data.pop(param_name, None)

# Build dict of tensor variables
tensors = None
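
To make the set-based exclusion concrete, a small walk-through with hypothetical values (``excluded_save_params()`` stood in by a literal list):

    exclude = None                      # caller gave no exclusions
    include = ["replay_buffer"]         # caller explicitly wants this kept
    if exclude is None:
        exclude = []
    exclude = set(exclude).union(["env", "eval_env", "replay_buffer"])
    exclude = exclude.difference(include)   # include wins over the defaults
    exclude.add("policy")                   # torch variables are saved separately
    assert exclude == {"env", "eval_env", "policy"}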