Added E3B and validated - SuperMarioBros environment - Fixed Pretraining Mode #41

Open · wants to merge 2 commits into main
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -63,7 +63,11 @@ envs = [
"gymnasium[accept-rom-license]",
"dm-control",
"procgen",
"minigrid"
"minigrid",
"gym==0.26.2",
"gym-super-mario-bros==7.4.0",
"opencv-python==4.8.1.78",
"imageio==2.33.0",
]
docs = [
"mkdocs-material",
19 changes: 14 additions & 5 deletions rllte/common/prototype/on_policy_agent.py
@@ -144,8 +144,9 @@ def train(
with th.no_grad():
last_values = self.policy.get_value(next_obs).detach()

# perform return and advantage estimation
self.storage.compute_returns_and_advantages(last_values)
# perform return and advantage estimation only if we have access to extrinsic rewards
if not self.pretraining:
self.storage.compute_returns_and_advantages(last_values)

# deal with the intrinsic reward module
if self.irs is not None:
@@ -157,6 +158,7 @@
"obs": self.storage.observations[:-1], # type: ignore
"actions": self.storage.actions,
"next_obs": self.storage.observations[1:], # type: ignore
"done": th.logical_or(self.storage.terminateds[:-1], self.storage.truncateds[:-1]) # type: ignore
}
)
# compute intrinsic rewards
@@ -168,9 +170,16 @@
},
step=self.global_episode * self.num_envs * self.num_steps,
)
# only add the intrinsic rewards to the advantages and returns
self.storage.advantages += intrinsic_rewards.to(self.device)
self.storage.returns += intrinsic_rewards.to(self.device)

# if pretraining, compute intrinsic returns and advantages
if self.pretraining:
self.storage.rewards = intrinsic_rewards.to(self.device)
self.storage.compute_returns_and_advantages(last_values)

# if combining intrinsic + extrinsic rewards, add intrinsic rewards to extrinsic returns and advantages
else:
self.storage.advantages += intrinsic_rewards.to(self.device)
self.storage.returns += intrinsic_rewards.to(self.device)

# update the agent
self.update()
5 changes: 5 additions & 0 deletions rllte/env/__init__.py
@@ -57,3 +57,8 @@
from .procgen import make_procgen_env as make_procgen_env
except Exception:
pass

try:
from .mario import make_mario_env as make_mario_env
except Exception:
pass
48 changes: 48 additions & 0 deletions rllte/env/mario/__init__.py
@@ -0,0 +1,48 @@
from typing import Callable

import gymnasium as gym
import gym as gym_old
from gymnasium.vector import AsyncVectorEnv, SyncVectorEnv
from gymnasium.wrappers import RecordEpisodeStatistics

from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from rllte.env.utils import Gymnasium2Torch
from rllte.env.mario.wrappers import (
EpisodicLifeEnv,
SkipFrame,
Gym2Gymnasium,
ImageTranspose
)

def make_mario_env(
env_id: str = "SuperMarioBros-v0",
num_envs: int = 8,
device: str = "cpu",
asynchronous: bool = True,
seed: int = 0,
) -> Gymnasium2Torch:
    """Create SuperMarioBros environments.

    Args:
        env_id (str): Name of environment.
        num_envs (int): Number of environments.
        device (str): Device to convert the data.
        asynchronous (bool): `True` for creating asynchronous environments,
            and `False` for creating synchronous environments.
        seed (int): Random seed.

    Returns:
        The vectorized environments.
    """

def make_env(env_id: str, seed: int) -> Callable:
def _thunk():
env = gym_old.make(env_id, apply_api_compatibility=True, render_mode="rgb_array")
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = Gym2Gymnasium(env)
env = SkipFrame(env, skip=4)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = ImageTranspose(env)
env = EpisodicLifeEnv(env)
env.observation_space.seed(seed)
return env
return _thunk

envs = [make_env(env_id, seed + i) for i in range(num_envs)]
if asynchronous:
envs = AsyncVectorEnv(envs)
else:
envs = SyncVectorEnv(envs)

envs = RecordEpisodeStatistics(envs)
return Gymnasium2Torch(envs, device=device)
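
For reviewers, here is a minimal smoke test of the new factory (not part of the diff). It assumes `Gymnasium2Torch` exposes the usual gymnasium-style `reset`/`step` API and returns torch tensors; the expected shapes follow from the 84x84 resize and channel transpose applied above, and `SIMPLE_MOVEMENT` defines 7 discrete actions.

```python
# Minimal sketch, not part of the PR: sanity-check the Mario factory.
import torch as th

from rllte.env import make_mario_env

envs = make_mario_env(num_envs=4, device="cpu", asynchronous=False, seed=1)
obs, info = envs.reset()
print(obs.shape)  # expected: torch.Size([4, 3, 84, 84]) after resize + transpose
actions = th.randint(0, 7, (4,))  # SIMPLE_MOVEMENT has 7 discrete actions
obs, reward, terminated, truncated, info = envs.step(actions)
```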

107 changes: 107 additions & 0 deletions rllte/env/mario/wrappers.py
@@ -0,0 +1,107 @@
import gymnasium as gym
import numpy as np

class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game
over.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
self.env = env

def step(self, action):
obs, reward, terminated, truncated, info = self.env.step(action)
self.was_real_done = np.logical_or(terminated, truncated)
        try:
            # life counter exposed by the underlying gym-super-mario-bros env
            lives = self.env.unwrapped.env._life
            if self.lives > lives > 0:
                # a life was lost but the game is not over yet
                terminated, truncated = True, True
            self.lives = lives
        except Exception:
            # no life counter available; keep the raw episode signals
            pass
return obs, reward, terminated, truncated, info

def reset(self, **kwargs):
return self.env.reset(**kwargs)

class SkipFrame(gym.Wrapper):
def __init__(self, env, skip):
"""Return only every `skip`-th frame"""
super().__init__(env)
self._skip = skip
self.env = env

def step(self, action):
"""Repeat action, and sum reward"""
total_reward = 0.0
for i in range(self._skip):
# Accumulate reward and repeat the same action
obs, reward, terminated, truncated, info = self.env.step(action)
total_reward += reward
if np.logical_or(terminated, truncated):
break
return obs, total_reward, terminated, truncated, info

def reset(self, seed=None, options=None):
return self.env.reset()

def render(self):
return self.env.render()


class Gym2Gymnasium(gym.Wrapper):
    def __init__(self, env):
        """Convert a gym.Env to a gymnasium.Env."""
        # gym.Wrapper.__init__ is not called here: `env` is an old-style
        # gym.Env rather than a gymnasium.Env
        self.env = env

self.observation_space = gym.spaces.Box(
low=0,
high=255,
shape=env.observation_space.shape,
dtype=env.observation_space.dtype,
)
self.action_space = gym.spaces.Discrete(env.action_space.n)

    def step(self, action):
        """Step the wrapped environment."""
        return self.env.step(action)

def reset(self, options=None, seed=None):
return self.env.reset()

def render(self):
return self.env.render()

def close(self):
return self.env.close()

def seed(self, seed=None):
return self.env.seed(seed=seed)

class ImageTranspose(gym.ObservationWrapper):
"""Transpose observation from channels last to channels first.

Args:
env (gym.Env): Environment to wrap.

Returns:
ImageTranspose instance.
"""

def __init__(self, env: gym.Env) -> None:
gym.ObservationWrapper.__init__(self, env)
shape = env.observation_space.shape
dtype = env.observation_space.dtype
self.observation_space = gym.spaces.Box(
low=0,
high=255,
shape=(shape[2], shape[0], shape[1]),
dtype=dtype,
)

    def observation(self, observation):
        """Transpose the observation from (H, W, C) to (C, H, W)."""
        observation = np.transpose(observation, axes=[2, 0, 1])
        return observation
1 change: 1 addition & 0 deletions rllte/xplore/reward/__init__.py
@@ -32,3 +32,4 @@
from .ride import RIDE as RIDE
from .rise import RISE as RISE
from .rnd import RND as RND
from .e3b import E3B as E3B
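
To show how the pieces fit together, below is a rough end-to-end sketch of the validated setup: pretraining PPO with E3B on SuperMarioBros. The constructor arguments for `E3B` and `PPO` and the `agent.set(reward=...)` call are assumptions based on the other rllte reward modules and agents, not on this diff; check the actual signatures before running.

```python
# Rough sketch, not part of the diff: pretrain PPO with the new E3B reward.
# Argument names are assumptions; verify against the E3B and PPO signatures.
from rllte.agent import PPO
from rllte.env import make_mario_env
from rllte.xplore.reward import E3B

envs = make_mario_env(num_envs=8, device="cuda")
agent = PPO(env=envs, device="cuda", tag="mario_e3b_pretrain", pretraining=True)
agent.set(reward=E3B(observation_space=envs.observation_space,
                     action_space=envs.action_space,
                     device="cuda"))
agent.train(num_train_steps=1_000_000)
```

With `pretraining=True`, the patched train loop computes returns and advantages from the intrinsic rewards alone; with the default `pretraining=False`, the intrinsic rewards are instead added to the extrinsic returns and advantages, matching the else branch above.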