Merge pull request DLR-RM#52 from Antonin-Raffin/refactor/predict

Refactor predict method
Shunian-Chen · Feb 14, 2020 · f8e3995 · f8e3995
2 parents cbb0843 + af46aa1
commit f8e3995
Show file tree

Hide file tree

Showing 22 changed files with 515 additions and 236 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -3,6 +3,8 @@ branch = False
 omit =
     tests/*
     setup.py
+    # Requires a graphical interface
+    torchy_baselines/common/results_plotter.py
 
 [report]
 exclude_lines =

diff --git a/NOTICE b/NOTICE
@@ -0,0 +1,27 @@
+A large portion of the code of Torchy-Baselines (in `common/`) was ported from Stable-Baselines, a fork of OpenAI Baselines,
+both licensed under the MIT License:
+
+before the fork (June 2018):
+Copyright (c) 2017 OpenAI (http://openai.com)
+
+after the fork (June 2018):
+Copyright (c) 2018-2019 Stable-Baselines Team
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
@@ -3,7 +3,7 @@
 Changelog
 ==========
 
-Pre-Release 0.2.0a1 (WIP)
+Pre-Release 0.2.0a2 (WIP)
 ------------------------------
 
 Breaking Changes:
@@ -12,13 +12,17 @@ Breaking Changes:
 - Return type of `evaluation.evaluate_policy()` has been changed
 - Refactored the replay buffer to avoid transformation between PyTorch and NumPy
 - Created `OffPolicyRLModel` base class
+- Remove deprecated JSON format for `Monitor`
 
 New Features:
 ^^^^^^^^^^^^^
 - Add `seed()` method to `VecEnv` class
 - Add support for Callback (cf https://github.com/hill-a/stable-baselines/pull/644)
 - Add methods for saving and loading replay buffer
 - Add `extend()` method to the buffers
+- Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve the `VecNormalize` wrapper when it exists
+- Add `results_plotter` from Stable Baselines
+- Improve `predict()` method to handle different types of observations (single, vectorized, ...)
 
 Bug Fixes:
 ^^^^^^^^^^

diff --git a/setup.py b/setup.py
@@ -12,7 +12,9 @@
           'torch>=1.2.0',
           'cloudpickle',
           # For reading logs
-          'pandas'
+          'pandas',
+          # Plotting learning curves
+          'matplotlib'
       ],
       extras_require={
         'tests': [
@@ -45,7 +47,7 @@
       license="MIT",
       long_description="",
       long_description_content_type='text/markdown',
-      version="0.2.0a1",
+      version="0.2.0a2",
       )
 
 # python setup.py sdist

diff --git a/tests/test_monitor.py b/tests/test_monitor.py
@@ -8,33 +8,47 @@
 from torchy_baselines.common.monitor import Monitor, get_monitor_files, load_results
 
 
-def test_monitor():
+def test_monitor(tmp_path):
     """
     test the monitor wrapper
     """
     env = gym.make("CartPole-v1")
     env.seed(0)
-    monitor_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())
+    monitor_file = os.path.join(str(tmp_path), "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
     monitor_env = Monitor(env, monitor_file)
     monitor_env.reset()
-    for _ in range(1000):
-        _, _, done, _ = monitor_env.step(0)
+    total_steps = 1000
+    ep_rewards = []
+    ep_lengths = []
+    ep_len, ep_reward = 0, 0
+    for _ in range(total_steps):
+        _, reward, done, _ = monitor_env.step(0)
+        ep_len += 1
+        ep_reward += reward
         if done:
+            ep_rewards.append(ep_reward)
+            ep_lengths.append(ep_len)
             monitor_env.reset()
-
-    file_handler = open(monitor_file, 'rt')
-
-    first_line = file_handler.readline()
-    assert first_line.startswith('#')
-    metadata = json.loads(first_line[1:])
-    assert metadata['env_id'] == "CartPole-v1"
-    assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"
-
-    last_logline = pandas.read_csv(file_handler, index_col=None)
-    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
-    file_handler.close()
+            ep_len, ep_reward = 0, 0
+
+    monitor_env.close()
+    assert monitor_env.get_total_steps() == total_steps
+    assert sum(ep_lengths) == sum(monitor_env.get_episode_lengths())
+    assert sum(monitor_env.get_episode_rewards()) == sum(ep_rewards)
+    _ = monitor_env.get_episode_times()
+
+    with open(monitor_file, 'rt') as file_handler:
+        first_line = file_handler.readline()
+        assert first_line.startswith('#')
+        metadata = json.loads(first_line[1:])
+        assert metadata['env_id'] == "CartPole-v1"
+        assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"
+
+        last_logline = pandas.read_csv(file_handler, index_col=None)
+        assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
     os.remove(monitor_file)
 
+
 def test_monitor_load_results(tmp_path):
     """
     test load_results on log files produced by the monitor wrapper

diff --git a/tests/test_predict.py b/tests/test_predict.py
@@ -0,0 +1,42 @@
+import gym
+import pytest
+
+from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3
+from torchy_baselines.common.vec_env import DummyVecEnv
+
+MODEL_LIST = [
+    CEMRL,
+    PPO,
+    A2C,
+    TD3,
+    SAC,
+]
+
+@pytest.mark.parametrize("model_class", MODEL_LIST)
+def test_auto_wrap(model_class):
+    # test auto wrapping of env into a VecEnv
+    env = gym.make('Pendulum-v0')
+    eval_env = gym.make('Pendulum-v0')
+    model = model_class('MlpPolicy', env)
+    model.learn(100, eval_env=eval_env)
+
+
+@pytest.mark.parametrize("model_class", MODEL_LIST)
+@pytest.mark.parametrize("env_id", ['Pendulum-v0', 'CartPole-v1'])
+def test_predict(model_class, env_id):
+    if env_id == 'CartPole-v1' and model_class not in [PPO, A2C]:
+        return
+
+    # test detection of different shapes by the predict method
+    model = model_class('MlpPolicy', env_id)
+    env = gym.make(env_id)
+    vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)])
+
+    obs = env.reset()
+    action = model.predict(obs)
+    assert action.shape == env.action_space.shape
+    assert env.action_space.contains(action)
+
+    vec_env_obs = vec_env.reset()
+    action = model.predict(vec_env_obs)
+    assert action.shape[0] == vec_env_obs.shape[0]
diff --git a/tests/test_run.py b/tests/test_run.py
@@ -14,28 +14,19 @@ def test_td3(action_noise):
     model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
                 learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
     model.learn(total_timesteps=1000, eval_freq=500)
-    model.save("test_save")
-    model.load("test_save")
-    os.remove("test_save.zip")
 
 
 def test_cemrl():
     model = CEMRL('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[16]), pop_size=2, n_grad=1,
                   learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
     model.learn(total_timesteps=1000, eval_freq=500)
-    model.save("test_save")
-    model.load("test_save")
-    os.remove("test_save.zip")
 
 
 @pytest.mark.parametrize("model_class", [A2C, PPO])
 @pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
 def test_onpolicy(model_class, env_id):
     model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
     model.learn(total_timesteps=1000, eval_freq=500)
-    model.save("test_save")
-    model.load("test_save")
-    os.remove("test_save.zip")
 
 
 @pytest.mark.parametrize("ent_coef", ['auto', 0.01])
@@ -44,6 +35,3 @@ def test_sac(ent_coef):
                 learning_starts=100, verbose=1, create_eval_env=True, ent_coef=ent_coef,
                 action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
     model.learn(total_timesteps=1000, eval_freq=500)
-    model.save("test_save")
-    model.load("test_save")
-    os.remove("test_save.zip")
diff --git a/tests/test_save_load.py b/tests/test_save_load.py
@@ -34,7 +34,7 @@ def test_save_load(model_class):
 
     env.reset()
     observations = np.array([env.step(env.action_space.sample())[0] for _ in range(10)])
-    observations = np.squeeze(observations)
+    observations = observations.reshape(10, -1)
 
     # Get dictionary of current parameters
     params = deepcopy(model.policy.state_dict())
@@ -53,7 +53,7 @@ def test_save_load(model_class):
     params = new_params
 
     # get selected actions
-    selected_actions = [model.predict(observation, deterministic=True) for observation in observations]
+    selected_actions = model.predict(observations, deterministic=True)
 
     # Check
     model.save("test_save.zip")
@@ -68,7 +68,7 @@ def test_save_load(model_class):
         assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."
 
     # check if model still selects the same actions
-    new_selected_actions = [model.predict(observation, deterministic=True) for observation in observations]
+    new_selected_actions = model.predict(observations, deterministic=True)
     assert np.allclose(selected_actions, new_selected_actions, 1e-4)
 
     # check if learn still works
@@ -134,6 +134,7 @@ def test_exclude_include_saved_params(model_class):
     # clear file from os
     os.remove("test_save.zip")
 
+
 @pytest.mark.parametrize("model_class", [SAC, TD3])
 def test_save_load_replay_buffer(model_class):
     log_folder = 'logs'

diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py
@@ -123,6 +123,8 @@ def test_offpolicy_normalization(model_class):
 
     model = model_class('MlpPolicy', env, verbose=1)
     model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500)
+    # Check getter
+    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
 
 
 def test_sync_vec_normalize():

diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py
@@ -4,4 +4,4 @@
 from torchy_baselines.sac import SAC
 from torchy_baselines.td3 import TD3
 
-__version__ = "0.2.0a1"
+__version__ = "0.2.0a2"