Use Monitor episode reward/length for evaluate_policy #220

Merged: 17 commits, Nov 16, 2020
Changes from 4 commits

3 changes: 3 additions & 0 deletions docs/guide/examples.rst
@@ -79,6 +79,9 @@ In the following example, we will train, save and load a DQN model on the Lunar
model = DQN.load("dqn_lunar")

# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
# this will be reflected here. To evaluate with original rewards,
# wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

# Enjoy trained agent
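To make the note in the snippet above concrete, here is a minimal, illustrative sketch (not part of this diff) of evaluating with the original rewards by applying ``Monitor`` before a reward-modifying wrapper; the ``HalveReward`` wrapper and the use of ``CartPole-v0`` are assumptions for the example, not code from the repository.

import gym

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


class HalveReward(gym.RewardWrapper):
    # Illustrative wrapper that modifies the rewards seen by the caller
    def reward(self, reward):
        return 0.5 * reward


# Monitor is applied first (innermost), so it records the original rewards,
# even though HalveReward modifies what the outer layers see.
eval_env = HalveReward(Monitor(gym.make("CartPole-v0")))

model = DQN("MlpPolicy", "CartPole-v0", verbose=0)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Because ``Monitor`` sits below the reward-modifying wrapper, ``evaluate_policy`` reports the unmodified episode returns, which is the behaviour this PR enables.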
14 changes: 10 additions & 4 deletions docs/guide/rl_tips.rst
@@ -17,7 +17,7 @@ TL;DR

1. Read about RL and Stable Baselines3
2. Do quantitative experiments and hyperparameter tuning if needed
3. Evaluate the performance using a separate test environment
3. Evaluate the performance using a separate test environment (remember to check wrappers!)
4. For better performance, increase the training budget


@@ -68,18 +68,24 @@ Other methods, like ``TRPO`` or ``PPO``, make use of a *trust region* to minimize
How to evaluate an RL algorithm?
--------------------------------

.. note::

Pay attention to environment wrappers when evaluating your agent and comparing results to others. Wrappers that modify episode rewards
or lengths (e.g. reward scaling, early termination) will also affect the evaluation results, which may not be desirable. Check the ``evaluate_policy`` helper function in the :ref:`Evaluation Helper <eval>` section.

Because most algorithms use exploration noise during training, you need a separate test environment to evaluate the performance
of your agent at a given time. It is recommended to periodically evaluate your agent for ``n`` test episodes (``n`` is usually between 5 and 20)
and average the reward per episode to have a good estimate.

.. note::

We provide an ``EvalCallback`` for doing such evaluation. You can read more about it in the :ref:`Callbacks <callbacks>` section.

As some policies are stochastic by default (e.g. A2C or PPO), you should also try to set `deterministic=True` when calling the `.predict()` method;
this frequently leads to better performance.
Looking at the training curve (episode reward as a function of the timesteps) is a good proxy, but it underestimates the agent's true performance.


.. note::

We provide an ``EvalCallback`` for doing such evaluation. You can read more about it in the :ref:`Callbacks <callbacks>` section.



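To complement the tips above, here is a minimal sketch (not part of this diff) of periodic evaluation on a separate, ``Monitor``-wrapped test environment using ``EvalCallback``; the environment id and the ``eval_freq``/``total_timesteps`` values are placeholders chosen for the example.

import gym

from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

# Separate test environment, wrapped with Monitor before any other wrapper
eval_env = Monitor(gym.make("CartPole-v0"))

# Evaluate every 1000 training steps on 10 test episodes (placeholder values)
eval_callback = EvalCallback(eval_env, n_eval_episodes=10, eval_freq=1000, deterministic=True)

model = A2C("MlpPolicy", "CartPole-v0", verbose=0)
model.learn(total_timesteps=10000, callback=eval_callback)

# Final evaluation with a deterministic policy
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")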
1 change: 1 addition & 0 deletions docs/misc/changelog.rst
@@ -16,6 +16,7 @@ New Features:
automatic check for image spaces.
- ``VecFrameStack`` now has a ``channels_order`` argument to tell if observations should be stacked
on the first or last observation dimension (originally always stacked on last).
- ``evaluate_policy`` now returns rewards/episode lengths from a Monitor wrapper if one is present.

Bug Fixes:
^^^^^^^^^^
32 changes: 29 additions & 3 deletions stable_baselines3/common/evaluation.py
@@ -1,3 +1,4 @@
import warnings
from typing import Callable, List, Optional, Tuple, Union

import gym
@@ -21,6 +22,14 @@ def evaluate_policy(
Runs policy for ``n_eval_episodes`` episodes and returns average reward.
This is made to work only with one env.

.. note::
If the environment has not been wrapped with a ``Monitor`` wrapper, rewards and
episode lengths are counted as they appear from ``env.step`` calls. If the
environment contains wrappers that modify rewards or episode lengths
(e.g. reward scaling, early episode termination), these will affect the
evaluation results as well. You can avoid this by wrapping the environment
with a ``Monitor`` wrapper before anything else.

:param model: The RL agent you want to evaluate.
:param env: The gym environment. In the case of a ``VecEnv``
this must contain only one environment.
@@ -49,15 +58,32 @@ def evaluate_policy(
episode_length = 0
while not done:
action, state = model.predict(obs, state=state, deterministic=deterministic)
obs, reward, done, _info = env.step(action)
obs, reward, done, info = env.step(action)
episode_reward += reward
if callback is not None:
callback(locals(), globals())
episode_length += 1
if render:
env.render()
episode_rewards.append(episode_reward)
episode_lengths.append(episode_length)

# Remove VecEnv stacking (if any)
if isinstance(env, VecEnv):
info = info[0]

if "episode" in info.keys():
# Monitor wrapper includes "episode" key in info if environment
# has been wrapped with it. Use those rewards instead.
episode_rewards.append(info["episode"]["r"])
episode_lengths.append(info["episode"]["l"])
else:
episode_rewards.append(episode_reward)
episode_lengths.append(episode_length)
warnings.warn(
"Evaluation environment does not provide 'episode' environment (not wrapped with ``Monitor`` wrapper?). "
"This may result in reporting modified episode lengths and results, depending on the other wrappers. "
"Consider wrapping environment first with ``Monitor`` wrapper.",
UserWarning,
)
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
if reward_threshold is not None:
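For context on the branch added above: a rough, illustrative sketch (not part of this diff) of how the ``Monitor`` wrapper surfaces episode statistics through the ``info`` dict at the end of an episode, which is what ``evaluate_policy`` now reads; the random-action loop is only for demonstration.

import gym

from stable_baselines3.common.monitor import Monitor

env = Monitor(gym.make("CartPole-v0"))
obs = env.reset()
done = False
while not done:
    # Random actions are enough to finish an episode for this demonstration
    obs, reward, done, info = env.step(env.action_space.sample())

# When the episode ends, Monitor adds an "episode" entry with the cumulative
# (unmodified) reward "r" and the episode length "l"; evaluate_policy prefers
# these values over its own running sums when they are present.
print(info["episode"]["r"], info["episode"]["l"])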
37 changes: 37 additions & 0 deletions tests/test_utils.py
@@ -127,6 +127,43 @@ def dummy_callback(locals_, _globals):
episode_rewards, _ = evaluate_policy(model, model.get_env(), n_eval_episodes, return_episode_rewards=True)
assert len(episode_rewards) == n_eval_episodes

# Test that warning is given about no monitor
eval_env = gym.make("Pendulum-v0")
with pytest.warns(UserWarning):
_ = evaluate_policy(model, eval_env, n_eval_episodes)

# Test that evaluation with VecEnvs works as expected
eval_vecenv = make_vec_env("Pendulum-v0", 1)
_ = evaluate_policy(model, eval_vecenv, n_eval_episodes)
# Test SubprocVecEnv, too
eval_vecenv = make_vec_env("Pendulum-v0", 1, vec_env_cls=SubprocVecEnv)
_ = evaluate_policy(model, eval_vecenv, n_eval_episodes)

# Warning without Monitor
eval_vecenv = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
with pytest.warns(UserWarning):
_ = evaluate_policy(model, eval_vecenv, n_eval_episodes)

# Test that we gather correct reward with Monitor wrapper
class ZeroReward(gym.RewardWrapper):
def reward(self, reward):
return reward * 0

# CartPole always gives reward
model = A2C("MlpPolicy", "CartPole-v0", seed=0)
# Sanity check that we get zero-reward without Monitor
eval_env = ZeroReward(gym.make("CartPole-v0"))
average_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes)
assert average_reward == 0.0, "ZeroReward wrapper for testing did not work"
# Normal envs
eval_env = ZeroReward(Monitor(gym.make("CartPole-v0")))
average_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes)
assert average_reward > 0.0, "evaluate_policy did not get reward from Monitor"
# Same for vecenvs
eval_vecenv = DummyVecEnv([lambda: ZeroReward(Monitor(gym.make("CartPole-v0")))])
average_reward, _ = evaluate_policy(model, eval_vecenv, n_eval_episodes)
assert average_reward > 0.0, "evaluate_policy did not get reward from Monitor"


def test_vec_noise():
num_envs = 4