diff --git a/rl_games/algos_torch/a2c_continuous.py b/rl_games/algos_torch/a2c_continuous.py
index e93ea362..786fda61 100644
--- a/rl_games/algos_torch/a2c_continuous.py
+++ b/rl_games/algos_torch/a2c_continuous.py
@@ -41,7 +41,7 @@ def __init__(self, base_name, params):
         self.init_rnn_from_model(self.model)
         self.last_lr = float(self.last_lr)
         self.bound_loss_type = self.config.get('bound_loss_type', 'bound') # 'regularisation' or 'bound'
-        self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr), eps=1e-08, weight_decay=self.weight_decay)
+        self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr), eps=1e-08, weight_decay=self.weight_decay, fused=True)
 
         if self.has_central_value:
             cv_config = {
diff --git a/rl_games/common/player.py b/rl_games/common/player.py
index 33c848cc..f2181603 100644
--- a/rl_games/common/player.py
+++ b/rl_games/common/player.py
@@ -14,6 +14,7 @@
 
 import pandas as pd
 
+
 class BasePlayer(object):
 
     def __init__(self, params):
@@ -392,7 +393,8 @@ def run(self):
         else:
             print('av reward:', sum_rewards / games_played * n_game_life, 'av steps:', sum_steps / games_played * n_game_life)
 
-        
+
+        # save game data to parquet file
         df.to_parquet('game_data.parquet')
 
     def get_batch_size(self, obses, batch_size):
diff --git a/rl_games/common/vecenv.py b/rl_games/common/vecenv.py
index c29fd4be..275f9e37 100644
--- a/rl_games/common/vecenv.py
+++ b/rl_games/common/vecenv.py
@@ -7,6 +7,7 @@
 from time import sleep
 import torch
 
+
 class RayWorker:
     """Wrapper around a third-party (gym for example) environment class that enables parallel training.
 
@@ -47,7 +48,7 @@ def step(self, action):
 
         """
         next_state, reward, is_done, info = self.env.step(action)
-        
+
         if np.isscalar(is_done):
             episode_done = is_done
         else:
@@ -64,7 +65,7 @@ def seed(self, seed):
         np.random.seed(seed)
         random.seed(seed)
         self.env.seed(seed)
-        
+
     def render(self):
         self.env.render()
 
@@ -95,7 +96,7 @@ def get_env_info(self):
 
         info = {}
         observation_space = self.env.observation_space
-        #if isinstance(observation_space, gym.spaces.dict.Dict):
+        # if isinstance(observation_space, gym.spaces.dict.Dict):
         #    observation_space = observation_space['observations']
 
         info['action_space'] = self.env.action_space
@@ -115,12 +116,16 @@
 
 class RayVecEnv(IVecEnv):
     """Main env class that manages several `rl_games.common.vecenv.Rayworker` objects for parallel training
-    
+
     The RayVecEnv class manages a set of individual environments and wraps around the methods from RayWorker.
     Each worker is executed asynchronously.
     """
 
-    import ray
+    # To avoid import errors when Ray is not installed and this class is not used
+    try:
+        import ray
+    except ImportError:
+        pass
 
     def __init__(self, config_name, num_actors, **kwargs):
         """Initialise the class. Sets up the config for the environment and creates individual workers to manage.
@@ -136,7 +141,6 @@ def __init__(self, config_name, num_actors, **kwargs):
 
         self.use_torch = False
         self.seed = kwargs.pop('seed', None)
-
         self.remote_worker = self.ray.remote(RayWorker)
         self.workers = [self.remote_worker.remote(self.config_name, kwargs) for i in range(self.num_actors)]
 
@@ -162,7 +166,7 @@ def __init__(self, config_name, num_actors, **kwargs):
             self.concat_func = np.stack
         else:
            self.concat_func = np.concatenate
-        
+
     def step(self, actions):
         """Step all individual environments (using the created workers). Returns a concatenated array of observations, rewards, done states, and infos if the env allows concatenation.
 
@@ -201,7 +205,7 @@ def step(self, actions):
         if self.use_global_obs:
             newobsdict = {}
             newobsdict["obs"] = ret_obs
-            
+
             if self.state_type_dict:
                 newobsdict["states"] = dicts_to_dict_with_arrays(newstates, True)
             else:
@@ -231,7 +235,7 @@ def get_action_masks(self):
 
     def reset(self):
         res_obs = [worker.reset.remote() for worker in self.workers]
-        newobs, newstates = [],[]
+        newobs, newstates = [], []
         for res in res_obs:
             cobs = self.ray.get(res)
             if self.use_global_obs:
@@ -248,7 +252,7 @@ def reset(self):
         if self.use_global_obs:
             newobsdict = {}
             newobsdict["obs"] = ret_obs
-            
+
             if self.state_type_dict:
                 newobsdict["states"] = dicts_to_dict_with_arrays(newstates, True)
             else:
@@ -256,8 +260,10 @@ def reset(self):
             ret_obs = newobsdict
         return ret_obs
 
+
 vecenv_config = {}
 
+
 def register(config_name, func):
     """Add an environment type (for example RayVecEnv) to the list of available types `rl_games.common.vecenv.vecenv_config`
     Args:
@@ -267,10 +273,12 @@ def register(config_name, func):
     """
     vecenv_config[config_name] = func
 
+
 def create_vec_env(config_name, num_actors, **kwargs):
     vec_env_name = configurations[config_name]['vecenv_type']
     return vecenv_config[vec_env_name](config_name, num_actors, **kwargs)
 
+
 register('RAY', lambda config_name, num_actors, **kwargs: RayVecEnv(config_name, num_actors, **kwargs))
 
 from rl_games.envs.brax import BraxEnv
diff --git a/rl_games/common/wrappers.py b/rl_games/common/wrappers.py
index dab4a648..5c3b17c7 100644
--- a/rl_games/common/wrappers.py
+++ b/rl_games/common/wrappers.py
@@ -1,4 +1,3 @@
-import gymnasium
 import numpy as np
 from numpy.random import randint
 
@@ -11,12 +10,12 @@
 from copy import copy
 
 
-
 class InfoWrapper(gym.Wrapper):
     def __init__(self, env):
         gym.RewardWrapper.__init__(self, env)
-        
+
         self.reward = 0
+
     def reset(self, **kwargs):
         self.reward = 0
         return self.env.reset(**kwargs)
@@ -87,7 +86,7 @@ def __init__(self, env):
         """
         gym.Wrapper.__init__(self, env)
         self.lives = 0
-        self.was_real_done = True 
+        self.was_real_done = True
 
     def step(self, action):
         obs, reward, done, info = self.env.step(action)
@@ -122,7 +121,7 @@
     def __init__(self, env):
         gym.Wrapper.__init__(self, env)
         self.max_stacked_steps = 1000
-        self.current_steps=0
+        self.current_steps = 0
 
     def step(self, action):
         obs, reward, done, info = self.env.step(action)
@@ -140,17 +139,17 @@ def step(self, action):
 
 
 class MaxAndSkipEnv(gym.Wrapper):
-    def __init__(self, env,skip=4, use_max = True):
+    def __init__(self, env, skip=4, use_max=True):
         """Return only every `skip`-th frame"""
         gym.Wrapper.__init__(self, env)
         self.use_max = use_max
         # most recent raw observations (for max pooling across time steps)
         if self.use_max:
-            self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
+            self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8)
         else:
-            self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.float32)
-        self._skip = skip
-        
+            self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.float32)
+        self._skip = skip
+
     def step(self, action):
         """Repeat action, sum reward, and max over last observations."""
         total_reward = 0.0
@@ -211,8 +210,9 @@ def observation(self, frame):
         frame = np.expand_dims(frame, -1)
         return frame
 
+
 class FrameStack(gym.Wrapper):
-    def __init__(self, env, k, flat = False):
+    def __init__(self, env, k, flat=False):
         """
         Stack k last frames.
         Returns lazy array, which is much more memory efficient.
@@ -262,7 +262,7 @@ def _get_ob(self):
 
 
 class BatchedFrameStack(gym.Wrapper):
-    def __init__(self, env, k, transpose = False, flatten = False):
+    def __init__(self, env, k, transpose=False, flatten=False):
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.frames = deque([], maxlen=k)
@@ -303,8 +303,9 @@ def _get_ob(self):
             frames = np.transpose(self.frames, (1, 0, 2))
         return frames
 
+
 class BatchedFrameStackWithStates(gym.Wrapper):
-    def __init__(self, env, k, transpose = False, flatten = False):
+    def __init__(self, env, k, transpose=False, flatten=False):
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.obses = deque([], maxlen=k)
@@ -363,6 +364,7 @@ def process_data(self, data):
             obses = np.transpose(data, (1, 0, 2))
         return obses
 
+
 class ProcgenStack(gym.Wrapper):
     def __init__(self, env, k = 2, greyscale=True):
         gym.Wrapper.__init__(self, env)
@@ -370,7 +372,7 @@ def __init__(self, env, k = 2, greyscale=True):
         self.curr_frame = 0
         self.frames = deque([], maxlen=k)
 
-        self.greyscale=greyscale
+        self.greyscale = greyscale
         self.prev_frame = None
         shp = env.observation_space.shape
         if greyscale:
@@ -421,6 +423,7 @@ def observation(self, observation):
         # with smaller replay buffers only.
         return np.array(observation).astype(np.float32) / 255.0
 
+
 class LazyFrames(object):
     def __init__(self, frames):
         """This object ensures that common frames between the observations are only stored once.
@@ -449,6 +452,7 @@ def __len__(self):
     def __getitem__(self, i):
         return self._force()[i]
 
+
 class ReallyDoneWrapper(gym.Wrapper):
     def __init__(self, env):
         """
@@ -457,7 +461,7 @@ def __init__(self, env):
         self.old_env = env
         gym.Wrapper.__init__(self, env)
         self.lives = 0
-        self.was_real_done = True 
+        self.was_real_done = True
 
     def step(self, action):
         old_lives = self.env.unwrapped.ale.lives()
@@ -471,6 +475,7 @@ def step(self, action):
         done = lives == 0
         return obs, reward, done, info
 
+
 class AllowBacktracking(gym.Wrapper):
     """
     Use deltas in max(X) as the reward, rather than deltas
@@ -506,6 +511,7 @@ def unwrap(env):
     else:
         return env
 
+
 class StickyActionEnv(gym.Wrapper):
     def __init__(self, env, p=0.25):
         super(StickyActionEnv, self).__init__(env)
@@ -591,7 +597,7 @@ def step(self, action):
         obs, reward, done, info = self.env.step(action)
         obs = {
             'observation': obs,
-            'reward':np.clip(reward, -1, 1),
+            'reward': np.clip(reward, -1, 1),
             'last_action': action
         }
         return obs, reward, done, info
@@ -625,10 +631,13 @@ def __init__(self, env, name):
             raise NotImplementedError
 
     def observation(self, observation):
-        return observation * self.mask 
+        return observation * self.mask
+
 
 class OldGymWrapper(gym.Env):
     def __init__(self, env):
+        import gymnasium
+
         self.env = env
 
         # Convert Gymnasium spaces to Gym spaces
@@ -636,6 +645,8 @@ def __init__(self, env):
         self.action_space = self.convert_space(env.action_space)
 
     def convert_space(self, space):
+        import gymnasium
+
         """Recursively convert Gymnasium spaces to Gym spaces."""
         if isinstance(space, gymnasium.spaces.Box):
             return gym.spaces.Box(
@@ -691,6 +702,7 @@ def render(self, mode='human'):
     def close(self):
         return self.env.close()
 
+
 # Example usage:
 if __name__ == "__main__":
     # Create a MyoSuite environment
@@ -718,19 +730,21 @@ def make_atari(env_id, timelimit=True, noop_max=0, skip=4, sticky=False, directo
         env = MontezumaInfoWrapper(env, room_address=3 if 'Montezuma' in env_id else 1)
         env = StickyActionEnv(env)
     env = InfoWrapper(env)
-    if directory != None:
-        env = gym.wrappers.Monitor(env,directory=directory,force=True)
+
+    if directory is not None:
+        env = gym.wrappers.Monitor(env, directory=directory, force=True)
     if sticky:
         env = StickyActionEnv(env)
     if not timelimit:
         env = env.env
-    #assert 'NoFrameskip' in env.spec.id
+    # assert 'NoFrameskip' in env.spec.id
     if noop_max > 0:
         env = NoopResetEnv(env, noop_max=noop_max)
     env = MaxAndSkipEnv(env, skip=skip)
-    #env = EpisodeStackedEnv(env)
+    # env = EpisodeStackedEnv(env)
     return env
 
+
 def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, scale =False, wrap_impala=False):
     """Configure environment for DeepMind-style Atari.
     """
@@ -749,6 +763,7 @@ def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True,
         env = ImpalaEnvWrapper(env)
     return env
 
+
 def wrap_carracing(env, clip_rewards=True, frame_stack=True, scale=False):
     """Configure environment for DeepMind-style Atari.
     """
@@ -761,11 +776,12 @@ def wrap_carracing(env, clip_rewards=True, frame_stack=True, scale=False):
         env = FrameStack(env, 4)
     return env
 
+
 def make_car_racing(env_id, skip=4):
     env = make_atari(env_id, noop_max=0, skip=skip)
     return wrap_carracing(env, clip_rewards=False)
 
+
 def make_atari_deepmind(env_id, noop_max=30, skip=4, sticky=False, episode_life=True, wrap_impala=False, **kwargs):
     env = make_atari(env_id, noop_max=noop_max, skip=skip, sticky=sticky, **kwargs)
     return wrap_deepmind(env, episode_life=episode_life, clip_rewards=False, wrap_impala=wrap_impala)
-
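Note on the optimizer change in rl_games/algos_torch/a2c_continuous.py: in the PyTorch releases that accept it, fused=True is only implemented for floating-point parameters on CUDA devices, so a CPU-only run would fail when the optimizer is constructed. A minimal sketch of a guarded construction, assuming a recent PyTorch; build_optimizer and use_fused are illustrative names, not part of rl_games:

# Sketch only: fall back to the default (unfused) Adam when the model
# parameters are not on CUDA, so the patched construction also works on CPU.
import torch.optim as optim


def build_optimizer(model, lr, weight_decay=0.0):
    # fused Adam requires all parameters to be floating-point CUDA tensors
    use_fused = all(p.is_cuda for p in model.parameters())
    return optim.Adam(model.parameters(), lr, eps=1e-08,
                      weight_decay=weight_decay, fused=use_fused)

The fused flag only selects a faster kernel implementation; the Adam update rule is unchanged, so the argument can simply be dropped on PyTorch versions that do not accept it.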