Added E3B and validated - SuperMarioBros environment - Fixed Pretraining Mode #41

Open · wants to merge 2 commits into main
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -63,7 +63,11 @@ envs = [
"gymnasium[accept-rom-license]",
"dm-control",
"procgen",
"minigrid"
"minigrid",
"gym==0.26.2",
"gym-super-mario-bros==7.4.0",
"opencv-python==4.8.1.78",
"imageio==2.33.0",
]
docs = [
"mkdocs-material",
19 changes: 14 additions & 5 deletions rllte/common/prototype/on_policy_agent.py
@@ -144,8 +144,9 @@ def train(
with th.no_grad():
last_values = self.policy.get_value(next_obs).detach()

# perform return and advantage estimation
self.storage.compute_returns_and_advantages(last_values)
# perform return and advantage estimation only if we have access to extrinsic rewards
if not self.pretraining:
self.storage.compute_returns_and_advantages(last_values)

# deal with the intrinsic reward module
if self.irs is not None:
@@ -157,6 +158,7 @@
"obs": self.storage.observations[:-1], # type: ignore
"actions": self.storage.actions,
"next_obs": self.storage.observations[1:], # type: ignore
"done": th.logical_or(self.storage.terminateds[:-1], self.storage.truncateds[:-1]) # type: ignore
}
)
# compute intrinsic rewards
@@ -168,9 +170,16 @@
},
step=self.global_episode * self.num_envs * self.num_steps,
)
# only add the intrinsic rewards to the advantages and returns
self.storage.advantages += intrinsic_rewards.to(self.device)
self.storage.returns += intrinsic_rewards.to(self.device)

# if pretraining, compute intrinsic returns and advantages
if self.pretraining:
self.storage.rewards = intrinsic_rewards.to(self.device)
self.storage.compute_returns_and_advantages(last_values)

# if combining intrinsic + extrinsic rewards, add intrinsic rewards to extrinsic returns and advantages
else:
self.storage.advantages += intrinsic_rewards.to(self.device)
self.storage.returns += intrinsic_rewards.to(self.device)

# update the agent
self.update()
5 changes: 5 additions & 0 deletions rllte/env/__init__.py
@@ -57,3 +57,8 @@
from .procgen import make_procgen_env as make_procgen_env
except Exception:
pass

try:
from .mario import make_mario_env as make_mario_env
except Exception:
pass
48 changes: 48 additions & 0 deletions rllte/env/mario/__init__.py
@@ -0,0 +1,48 @@
from typing import Callable

import gymnasium as gym
import gym as gym_old
from gymnasium.vector import AsyncVectorEnv, SyncVectorEnv
from gymnasium.wrappers import RecordEpisodeStatistics

from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from rllte.env.utils import Gymnasium2Torch
from rllte.env.mario.wrappers import (
EpisodicLifeEnv,
SkipFrame,
Gym2Gymnasium,
ImageTranspose
)

def make_mario_env(
env_id: str = "SuperMarioBros-v0",
num_envs: int = 8,
device: str = "cpu",
asynchronous: bool = True,
seed: int = 0,
) -> Gymnasium2Torch:
    """Create SuperMarioBros environments.

    Args:
        env_id (str): Name of environment.
        num_envs (int): Number of environments.
        device (str): Device to convert the data.
        asynchronous (bool): `True` for creating asynchronous environments,
            and `False` for creating synchronous environments.
        seed (int): Random seed.

    Returns:
        The vectorized environments.
    """

def make_env(env_id: str, seed: int) -> Callable:
def _thunk():
env = gym_old.make(env_id, apply_api_compatibility=True, render_mode="rgb_array")
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = Gym2Gymnasium(env)
env = SkipFrame(env, skip=4)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = ImageTranspose(env)
env = EpisodicLifeEnv(env)
env.observation_space.seed(seed)
return env
return _thunk

envs = [make_env(env_id, seed + i) for i in range(num_envs)]
if asynchronous:
envs = AsyncVectorEnv(envs)
else:
envs = SyncVectorEnv(envs)

envs = RecordEpisodeStatistics(envs)
return Gymnasium2Torch(envs, device=device)
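
For reviewers, here is a minimal smoke test of the new factory (not part of the diff). It assumes `Gymnasium2Torch` exposes the usual gymnasium-style `reset`/`step` API and returns torch tensors; the expected shapes follow from the 84x84 resize and channel transpose applied above, and `SIMPLE_MOVEMENT` defines 7 discrete actions.

```python
# Minimal sketch, not part of the PR: sanity-check the Mario factory.
import torch as th

from rllte.env import make_mario_env

envs = make_mario_env(num_envs=4, device="cpu", asynchronous=False, seed=1)
obs, info = envs.reset()
print(obs.shape)  # expected: torch.Size([4, 3, 84, 84]) after resize + transpose
actions = th.randint(0, 7, (4,))  # SIMPLE_MOVEMENT has 7 discrete actions
obs, reward, terminated, truncated, info = envs.step(actions)
```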

107 changes: 107 additions & 0 deletions rllte/env/mario/wrappers.py
@@ -0,0 +1,107 @@
import gymnasium as gym
import numpy as np

class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game
over.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
self.env = env

def step(self, action):
obs, reward, terminated, truncated, info = self.env.step(action)
self.was_real_done = np.logical_or(terminated, truncated)
        try:
            # life counter exposed by the underlying gym-super-mario-bros env
            lives = self.env.unwrapped.env._life
            if self.lives > lives > 0:
                # a life was lost but the game is not over yet
                terminated, truncated = True, True
            self.lives = lives
        except Exception:
            # no life counter available; keep the raw episode signals
            pass
return obs, reward, terminated, truncated, info

def reset(self, **kwargs):
return self.env.reset(**kwargs)

class SkipFrame(gym.Wrapper):
def __init__(self, env, skip):
"""Return only every `skip`-th frame"""
super().__init__(env)
self._skip = skip
self.env = env

def step(self, action):
"""Repeat action, and sum reward"""
total_reward = 0.0
for i in range(self._skip):
# Accumulate reward and repeat the same action
obs, reward, terminated, truncated, info = self.env.step(action)
total_reward += reward
if np.logical_or(terminated, truncated):
break
return obs, total_reward, terminated, truncated, info

def reset(self, seed=None, options=None):
return self.env.reset()

def render(self):
return self.env.render()


class Gym2Gymnasium(gym.Wrapper):
    def __init__(self, env):
        """Convert a gym.Env to a gymnasium.Env."""
        # gym.Wrapper.__init__ is not called here: `env` is an old-style
        # gym.Env rather than a gymnasium.Env
        self.env = env

self.observation_space = gym.spaces.Box(
low=0,
high=255,
shape=env.observation_space.shape,
dtype=env.observation_space.dtype,
)
self.action_space = gym.spaces.Discrete(env.action_space.n)

    def step(self, action):
        """Step the wrapped environment."""
        return self.env.step(action)

def reset(self, options=None, seed=None):
return self.env.reset()

def render(self):
return self.env.render()

def close(self):
return self.env.close()

def seed(self, seed=None):
return self.env.seed(seed=seed)

class ImageTranspose(gym.ObservationWrapper):
"""Transpose observation from channels last to channels first.

Args:
env (gym.Env): Environment to wrap.

Returns:
ImageTranspose instance.
"""

def __init__(self, env: gym.Env) -> None:
gym.ObservationWrapper.__init__(self, env)
shape = env.observation_space.shape
dtype = env.observation_space.dtype
self.observation_space = gym.spaces.Box(
low=0,
high=255,
shape=(shape[2], shape[0], shape[1]),
dtype=dtype,
)

    def observation(self, observation):
        """Transpose the observation from (H, W, C) to (C, H, W)."""
        observation = np.transpose(observation, axes=[2, 0, 1])
        return observation
1 change: 1 addition & 0 deletions rllte/xplore/reward/__init__.py
@@ -32,3 +32,4 @@
from .ride import RIDE as RIDE
from .rise import RISE as RISE
from .rnd import RND as RND
from .e3b import E3B as E3B
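
To show how the pieces fit together, below is a rough end-to-end sketch of the validated setup: pretraining PPO with E3B on SuperMarioBros. The constructor arguments for `E3B` and `PPO` and the `agent.set(reward=...)` call are assumptions based on the other rllte reward modules and agents, not on this diff; check the actual signatures before running.

```python
# Rough sketch, not part of the diff: pretrain PPO with the new E3B reward.
# Argument names are assumptions; verify against the E3B and PPO signatures.
from rllte.agent import PPO
from rllte.env import make_mario_env
from rllte.xplore.reward import E3B

envs = make_mario_env(num_envs=8, device="cuda")
agent = PPO(env=envs, device="cuda", tag="mario_e3b_pretrain", pretraining=True)
agent.set(reward=E3B(observation_space=envs.observation_space,
                     action_space=envs.action_space,
                     device="cuda"))
agent.train(num_train_steps=1_000_000)
```

With `pretraining=True`, the patched train loop computes returns and advantages from the intrinsic rewards alone; with the default `pretraining=False`, the intrinsic rewards are instead added to the extrinsic returns and advantages, matching the else branch above.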