diff --git a/rl_games/common/env_configurations.py b/rl_games/common/env_configurations.py index d8b335e3..8e6f00c7 100644 --- a/rl_games/common/env_configurations.py +++ b/rl_games/common/env_configurations.py @@ -10,7 +10,6 @@ import math - class HCRewardEnv(gym.RewardWrapper): def __init__(self, env): gym.RewardWrapper.__init__(self, env) @@ -34,8 +33,6 @@ def step(self, action): return observation, reward, done, info - - class DMControlObsWrapper(gym.ObservationWrapper): def __init__(self, env): gym.RewardWrapper.__init__(self, env) @@ -60,19 +57,22 @@ def create_default_gym_env(**kwargs): env = wrappers.LimitStepsWrapper(env) return env + def create_goal_gym_env(**kwargs): frames = kwargs.pop('frames', 1) name = kwargs.pop('name') limit_steps = kwargs.pop('limit_steps', False) env = gym.make(name, **kwargs) - env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal'])) + env = FlattenObservation(FilterObservation( + env, ['observation', 'desired_goal'])) if frames > 1: env = wrappers.FrameStack(env, frames, False) if limit_steps: env = wrappers.LimitStepsWrapper(env) - return env + return env + def create_slime_gym_env(**kwargs): import slimevolleygym @@ -81,24 +81,26 @@ def create_slime_gym_env(**kwargs): limit_steps = kwargs.pop('limit_steps', False) self_play = kwargs.pop('self_play', False) if self_play: - env = SlimeVolleySelfplay(name, **kwargs) + env = SlimeVolleySelfplay(name, **kwargs) else: env = gym.make(name, **kwargs) return env def create_atari_gym_env(**kwargs): - #frames = kwargs.pop('frames', 1) + # frames = kwargs.pop('frames', 1) name = kwargs.pop('name') - skip = kwargs.pop('skip',4) - episode_life = kwargs.pop('episode_life',True) + skip = kwargs.pop('skip', 4) + episode_life = kwargs.pop('episode_life', True) wrap_impala = kwargs.pop('wrap_impala', False) - env = wrappers.make_atari_deepmind(name, skip=skip,episode_life=episode_life, wrap_impala=wrap_impala, **kwargs) - return env + env = wrappers.make_atari_deepmind( + name, skip=skip, episode_life=episode_life, wrap_impala=wrap_impala, **kwargs) + return env + def create_dm_control_env(**kwargs): frames = kwargs.pop('frames', 1) - name = 'dm2gym:'+ kwargs.pop('name') + name = 'dm2gym:' + kwargs.pop('name') env = gym.make(name, environment_kwargs=kwargs) env = DMControlWrapper(env) env = DMControlObsWrapper(env) @@ -107,6 +109,7 @@ def create_dm_control_env(**kwargs): env = wrappers.FrameStack(env, frames, False) return env + def create_super_mario_env(name='SuperMarioBros-v1'): import gym from nes_py.wrappers import JoypadSpace @@ -114,11 +117,16 @@ def create_super_mario_env(name='SuperMarioBros-v1'): import gym_super_mario_bros env = gym_super_mario_bros.make(name) env = JoypadSpace(env, SIMPLE_MOVEMENT) - + if 'Random' in name: + env = wrappers.EpisodicLifeRandomMarioEnv(env) + else: + env = wrappers.EpisodicLifeMarioEnv(env) env = wrappers.MaxAndSkipEnv(env, skip=4) - env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True) + env = wrappers.wrap_deepmind( + env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True) return env + def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'): import gym from nes_py.wrappers import JoypadSpace @@ -134,24 +142,29 @@ def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'): env = gym_super_mario_bros.make(stage_names[1]) env = JoypadSpace(env, SIMPLE_MOVEMENT) - + + env = wrappers.EpisodicLifeMarioEnv(env) env = wrappers.MaxAndSkipEnv(env, skip=4) - env = 
wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True) - #env = wrappers.AllowBacktracking(env) - + env = wrappers.wrap_deepmind( + env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True) + # env = wrappers.AllowBacktracking(env) + return env + def create_quadrupped_env(): import gym import roboschool import quadruppedEnv return wrappers.FrameStack(wrappers.MaxAndSkipEnv(gym.make('QuadruppedWalk-v1'), 4, False), 2, True) + def create_roboschool_env(name): import gym import roboschool return gym.make(name) + def create_smac(name, **kwargs): from rl_games.envs.smac_env import SMACEnv, MultiDiscreteSmacWrapper frames = kwargs.pop('frames', 1) @@ -160,18 +173,20 @@ def create_smac(name, **kwargs): has_cv = kwargs.get('central_value', False) as_single_agent = kwargs.pop('as_single_agent', False) env = SMACEnv(name, **kwargs) - - + if frames > 1: if has_cv: - env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=False, flatten=flatten) + env = wrappers.BatchedFrameStackWithStates( + env, frames, transpose=False, flatten=flatten) else: - env = wrappers.BatchedFrameStack(env, frames, transpose=False, flatten=flatten) + env = wrappers.BatchedFrameStack( + env, frames, transpose=False, flatten=flatten) if as_single_agent: env = MultiDiscreteSmacWrapper(env) return env + def create_smac_v2(name, **kwargs): from rl_games.envs.smac_v2_env import SMACEnvV2 frames = kwargs.pop('frames', 1) @@ -179,14 +194,17 @@ def create_smac_v2(name, **kwargs): flatten = kwargs.pop('flatten', True) has_cv = kwargs.get('central_value', False) env = SMACEnvV2(name, **kwargs) - + if frames > 1: if has_cv: - env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=False, flatten=flatten) + env = wrappers.BatchedFrameStackWithStates( + env, frames, transpose=False, flatten=flatten) else: - env = wrappers.BatchedFrameStack(env, frames, transpose=False, flatten=flatten) + env = wrappers.BatchedFrameStack( + env, frames, transpose=False, flatten=flatten) return env + def create_smac_cnn(name, **kwargs): from rl_games.envs.smac_env import SMACEnv, MultiDiscreteSmacWrapper has_cv = kwargs.get('central_value', False) @@ -195,23 +213,25 @@ def create_smac_cnn(name, **kwargs): env = SMACEnv(name, **kwargs) if has_cv: - env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=transpose) + env = wrappers.BatchedFrameStackWithStates( + env, frames, transpose=transpose) else: env = wrappers.BatchedFrameStack(env, frames, transpose=transpose) if as_single_agent: env = MultiDiscreteSmacWrapper(env) return env + def create_test_env(name, **kwargs): import rl_games.envs.test env = gym.make(name, **kwargs) return env + def create_minigrid_env(name, **kwargs): import gym_minigrid import gym_minigrid.wrappers - state_bonus = kwargs.pop('state_bonus', False) action_bonus = kwargs.pop('action_bonus', False) rgb_fully_obs = kwargs.pop('rgb_fully_obs', False) @@ -219,7 +239,6 @@ def create_minigrid_env(name, **kwargs): view_size = kwargs.pop('view_size', 3) env = gym.make(name, **kwargs) - if state_bonus: env = gym_minigrid.wrappers.StateBonus(env) if action_bonus: @@ -229,23 +248,27 @@ def create_minigrid_env(name, **kwargs): env = gym_minigrid.wrappers.RGBImgObsWrapper(env) elif rgb_partial_obs: env = gym_minigrid.wrappers.ViewSizeWrapper(env, view_size) - env = gym_minigrid.wrappers.RGBImgPartialObsWrapper(env, tile_size=84//view_size) # Get pixel observations + env = gym_minigrid.wrappers.RGBImgPartialObsWrapper( + env, tile_size=84//view_size) # Get 
pixel observations env = gym_minigrid.wrappers.ImgObsWrapper(env) print('minigird_env observation space shape:', env.observation_space) return env + def create_multiwalker_env(**kwargs): from rl_games.envs.multiwalker import MultiWalker - env = MultiWalker('', **kwargs) + env = MultiWalker('', **kwargs) return env + def create_diambra_env(**kwargs): from rl_games.envs.diambra.diambra import DiambraEnv env = DiambraEnv(**kwargs) return env + def create_env(name, **kwargs): steps_limit = kwargs.pop('steps_limit', None) env = gym.make(name, **kwargs) @@ -253,170 +276,171 @@ def create_env(name, **kwargs): env = wrappers.TimeLimit(env, steps_limit) return env + configurations = { - 'CartPole-v1' : { - 'vecenv_type' : 'RAY', - 'env_creator' : lambda **kwargs : gym.make('CartPole-v1'), + 'CartPole-v1': { + 'vecenv_type': 'RAY', + 'env_creator': lambda **kwargs: gym.make('CartPole-v1'), }, - 'CartPoleMaskedVelocity-v1' : { - 'vecenv_type' : 'RAY', - 'env_creator' : lambda **kwargs : wrappers.MaskVelocityWrapper(gym.make('CartPole-v1'), 'CartPole-v1'), + 'CartPoleMaskedVelocity-v1': { + 'vecenv_type': 'RAY', + 'env_creator': lambda **kwargs: wrappers.MaskVelocityWrapper(gym.make('CartPole-v1'), 'CartPole-v1'), }, - 'MountainCarContinuous-v0' : { - 'vecenv_type' : 'RAY', - 'env_creator' : lambda **kwargs : gym.make('MountainCarContinuous-v0'), + 'MountainCarContinuous-v0': { + 'vecenv_type': 'RAY', + 'env_creator': lambda **kwargs: gym.make('MountainCarContinuous-v0'), }, - 'MountainCar-v0' : { - 'vecenv_type' : 'RAY', - 'env_creator' : lambda : gym.make('MountainCar-v0'), + 'MountainCar-v0': { + 'vecenv_type': 'RAY', + 'env_creator': lambda: gym.make('MountainCar-v0'), }, - 'Acrobot-v1' : { - 'env_creator' : lambda **kwargs : gym.make('Acrobot-v1'), - 'vecenv_type' : 'RAY' + 'Acrobot-v1': { + 'env_creator': lambda **kwargs: gym.make('Acrobot-v1'), + 'vecenv_type': 'RAY' }, - 'Pendulum-v0' : { - 'env_creator' : lambda **kwargs : gym.make('Pendulum-v0'), - 'vecenv_type' : 'RAY' + 'Pendulum-v0': { + 'env_creator': lambda **kwargs: gym.make('Pendulum-v0'), + 'vecenv_type': 'RAY' }, - 'LunarLander-v2' : { - 'env_creator' : lambda **kwargs : gym.make('LunarLander-v2'), - 'vecenv_type' : 'RAY' + 'LunarLander-v2': { + 'env_creator': lambda **kwargs: gym.make('LunarLander-v2'), + 'vecenv_type': 'RAY' }, - 'PongNoFrameskip-v4' : { - 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('PongNoFrameskip-v4', skip=4), - 'vecenv_type' : 'RAY' + 'PongNoFrameskip-v4': { + 'env_creator': lambda **kwargs: wrappers.make_atari_deepmind('PongNoFrameskip-v4', skip=4), + 'vecenv_type': 'RAY' }, - 'BreakoutNoFrameskip-v4' : { - 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('BreakoutNoFrameskip-v4', skip=4,sticky=False), - 'vecenv_type' : 'RAY' + 'BreakoutNoFrameskip-v4': { + 'env_creator': lambda **kwargs: wrappers.make_atari_deepmind('BreakoutNoFrameskip-v4', skip=4, sticky=False), + 'vecenv_type': 'RAY' }, - 'MsPacmanNoFrameskip-v4' : { - 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('MsPacmanNoFrameskip-v4', skip=4), - 'vecenv_type' : 'RAY' + 'MsPacmanNoFrameskip-v4': { + 'env_creator': lambda **kwargs: wrappers.make_atari_deepmind('MsPacmanNoFrameskip-v4', skip=4), + 'vecenv_type': 'RAY' }, - 'CarRacing-v0' : { - 'env_creator' : lambda **kwargs : wrappers.make_car_racing('CarRacing-v0', skip=4), - 'vecenv_type' : 'RAY' + 'CarRacing-v0': { + 'env_creator': lambda **kwargs: wrappers.make_car_racing('CarRacing-v0', skip=4), + 'vecenv_type': 'RAY' }, - 'RoboschoolAnt-v1' : { - 
'env_creator' : lambda **kwargs : create_roboschool_env('RoboschoolAnt-v1'), - 'vecenv_type' : 'RAY' + 'RoboschoolAnt-v1': { + 'env_creator': lambda **kwargs: create_roboschool_env('RoboschoolAnt-v1'), + 'vecenv_type': 'RAY' }, - 'SuperMarioBros-v1' : { - 'env_creator' : lambda : create_super_mario_env(), - 'vecenv_type' : 'RAY' + 'SuperMarioBros-v1': { + 'env_creator': lambda: create_super_mario_env(), + 'vecenv_type': 'RAY' }, - 'SuperMarioBrosRandomStages-v1' : { - 'env_creator' : lambda : create_super_mario_env('SuperMarioBrosRandomStages-v1'), - 'vecenv_type' : 'RAY' + 'SuperMarioBrosRandomStages-v1': { + 'env_creator': lambda: create_super_mario_env('SuperMarioBrosRandomStages-v1'), + 'vecenv_type': 'RAY' }, - 'SuperMarioBrosRandomStage1-v1' : { - 'env_creator' : lambda **kwargs : create_super_mario_env_stage1('SuperMarioBrosRandomStage1-v1'), - 'vecenv_type' : 'RAY' + 'SuperMarioBrosRandomStage1-v1': { + 'env_creator': lambda **kwargs: create_super_mario_env_stage1('SuperMarioBrosRandomStage1-v1'), + 'vecenv_type': 'RAY' }, - 'RoboschoolHalfCheetah-v1' : { - 'env_creator' : lambda **kwargs : create_roboschool_env('RoboschoolHalfCheetah-v1'), - 'vecenv_type' : 'RAY' + 'RoboschoolHalfCheetah-v1': { + 'env_creator': lambda **kwargs: create_roboschool_env('RoboschoolHalfCheetah-v1'), + 'vecenv_type': 'RAY' }, - 'RoboschoolHumanoid-v1' : { - 'env_creator' : lambda : wrappers.FrameStack(create_roboschool_env('RoboschoolHumanoid-v1'), 1, True), - 'vecenv_type' : 'RAY' + 'RoboschoolHumanoid-v1': { + 'env_creator': lambda: wrappers.FrameStack(create_roboschool_env('RoboschoolHumanoid-v1'), 1, True), + 'vecenv_type': 'RAY' }, - 'LunarLanderContinuous-v2' : { - 'env_creator' : lambda **kwargs : gym.make('LunarLanderContinuous-v2'), - 'vecenv_type' : 'RAY' + 'LunarLanderContinuous-v2': { + 'env_creator': lambda **kwargs: gym.make('LunarLanderContinuous-v2'), + 'vecenv_type': 'RAY' }, - 'RoboschoolHumanoidFlagrun-v1' : { - 'env_creator' : lambda **kwargs : wrappers.FrameStack(create_roboschool_env('RoboschoolHumanoidFlagrun-v1'), 1, True), - 'vecenv_type' : 'RAY' + 'RoboschoolHumanoidFlagrun-v1': { + 'env_creator': lambda **kwargs: wrappers.FrameStack(create_roboschool_env('RoboschoolHumanoidFlagrun-v1'), 1, True), + 'vecenv_type': 'RAY' }, - 'BipedalWalker-v3' : { - 'env_creator' : lambda **kwargs : create_env('BipedalWalker-v3', **kwargs), - 'vecenv_type' : 'RAY' + 'BipedalWalker-v3': { + 'env_creator': lambda **kwargs: create_env('BipedalWalker-v3', **kwargs), + 'vecenv_type': 'RAY' }, - 'BipedalWalkerCnn-v3' : { - 'env_creator' : lambda **kwargs : wrappers.FrameStack(HCRewardEnv(gym.make('BipedalWalker-v3')), 4, False), - 'vecenv_type' : 'RAY' + 'BipedalWalkerCnn-v3': { + 'env_creator': lambda **kwargs: wrappers.FrameStack(HCRewardEnv(gym.make('BipedalWalker-v3')), 4, False), + 'vecenv_type': 'RAY' }, - 'BipedalWalkerHardcore-v3' : { - 'env_creator' : lambda **kwargs : gym.make('BipedalWalkerHardcore-v3'), - 'vecenv_type' : 'RAY' + 'BipedalWalkerHardcore-v3': { + 'env_creator': lambda **kwargs: gym.make('BipedalWalkerHardcore-v3'), + 'vecenv_type': 'RAY' }, - 'ReacherPyBulletEnv-v0' : { - 'env_creator' : lambda **kwargs : create_roboschool_env('ReacherPyBulletEnv-v0'), - 'vecenv_type' : 'RAY' + 'ReacherPyBulletEnv-v0': { + 'env_creator': lambda **kwargs: create_roboschool_env('ReacherPyBulletEnv-v0'), + 'vecenv_type': 'RAY' }, - 'BipedalWalkerHardcoreCnn-v3' : { - 'env_creator' : lambda : wrappers.FrameStack(gym.make('BipedalWalkerHardcore-v3'), 4, False), - 'vecenv_type' : 'RAY' + 
'BipedalWalkerHardcoreCnn-v3': { + 'env_creator': lambda: wrappers.FrameStack(gym.make('BipedalWalkerHardcore-v3'), 4, False), + 'vecenv_type': 'RAY' }, - 'QuadruppedWalk-v1' : { - 'env_creator' : lambda **kwargs : create_quadrupped_env(), - 'vecenv_type' : 'RAY' + 'QuadruppedWalk-v1': { + 'env_creator': lambda **kwargs: create_quadrupped_env(), + 'vecenv_type': 'RAY' }, - 'FlexAnt' : { - 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/ant.yaml'), - 'vecenv_type' : 'ISAAC' + 'FlexAnt': { + 'env_creator': lambda **kwargs: create_flex(FLEX_PATH + '/demo/gym/cfg/ant.yaml'), + 'vecenv_type': 'ISAAC' }, - 'FlexHumanoid' : { - 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid.yaml'), - 'vecenv_type' : 'ISAAC' + 'FlexHumanoid': { + 'env_creator': lambda **kwargs: create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid.yaml'), + 'vecenv_type': 'ISAAC' }, - 'FlexHumanoidHard' : { - 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid_hard.yaml'), - 'vecenv_type' : 'ISAAC' + 'FlexHumanoidHard': { + 'env_creator': lambda **kwargs: create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid_hard.yaml'), + 'vecenv_type': 'ISAAC' }, - 'smac' : { - 'env_creator' : lambda **kwargs : create_smac(**kwargs), - 'vecenv_type' : 'RAY' + 'smac': { + 'env_creator': lambda **kwargs: create_smac(**kwargs), + 'vecenv_type': 'RAY' }, - 'smac_v2' : { - 'env_creator' : lambda **kwargs : create_smac_v2(**kwargs), - 'vecenv_type' : 'RAY' + 'smac_v2': { + 'env_creator': lambda **kwargs: create_smac_v2(**kwargs), + 'vecenv_type': 'RAY' }, - 'smac_cnn' : { - 'env_creator' : lambda **kwargs : create_smac_cnn(**kwargs), - 'vecenv_type' : 'RAY' + 'smac_cnn': { + 'env_creator': lambda **kwargs: create_smac_cnn(**kwargs), + 'vecenv_type': 'RAY' }, - 'dm_control' : { - 'env_creator' : lambda **kwargs : create_dm_control_env(**kwargs), - 'vecenv_type' : 'RAY' + 'dm_control': { + 'env_creator': lambda **kwargs: create_dm_control_env(**kwargs), + 'vecenv_type': 'RAY' }, - 'openai_gym' : { - 'env_creator' : lambda **kwargs : create_default_gym_env(**kwargs), - 'vecenv_type' : 'RAY' + 'openai_gym': { + 'env_creator': lambda **kwargs: create_default_gym_env(**kwargs), + 'vecenv_type': 'RAY' }, - 'openai_robot_gym' : { - 'env_creator' : lambda **kwargs : create_goal_gym_env(**kwargs), - 'vecenv_type' : 'RAY' + 'openai_robot_gym': { + 'env_creator': lambda **kwargs: create_goal_gym_env(**kwargs), + 'vecenv_type': 'RAY' }, - 'atari_gym' : { - 'env_creator' : lambda **kwargs : create_atari_gym_env(**kwargs), - 'vecenv_type' : 'RAY' + 'atari_gym': { + 'env_creator': lambda **kwargs: create_atari_gym_env(**kwargs), + 'vecenv_type': 'RAY' }, - 'slime_gym' : { - 'env_creator' : lambda **kwargs : create_slime_gym_env(**kwargs), - 'vecenv_type' : 'RAY' + 'slime_gym': { + 'env_creator': lambda **kwargs: create_slime_gym_env(**kwargs), + 'vecenv_type': 'RAY' }, - 'test_env' : { - 'env_creator' : lambda **kwargs : create_test_env(kwargs.pop('name'), **kwargs), - 'vecenv_type' : 'RAY' + 'test_env': { + 'env_creator': lambda **kwargs: create_test_env(kwargs.pop('name'), **kwargs), + 'vecenv_type': 'RAY' }, - 'minigrid_env' : { - 'env_creator' : lambda **kwargs : create_minigrid_env(kwargs.pop('name'), **kwargs), - 'vecenv_type' : 'RAY' + 'minigrid_env': { + 'env_creator': lambda **kwargs: create_minigrid_env(kwargs.pop('name'), **kwargs), + 'vecenv_type': 'RAY' }, - 'multiwalker_env' : { - 'env_creator' : lambda **kwargs : create_multiwalker_env(**kwargs), - 'vecenv_type' : 'RAY' + 
'multiwalker_env': { + 'env_creator': lambda **kwargs: create_multiwalker_env(**kwargs), + 'vecenv_type': 'RAY' }, 'diambra': { 'env_creator': lambda **kwargs: create_diambra_env(**kwargs), 'vecenv_type': 'RAY' }, - 'brax' : { + 'brax': { 'env_creator': lambda **kwargs: create_brax_env(**kwargs), - 'vecenv_type': 'BRAX' + 'vecenv_type': 'BRAX' }, 'envpool': { 'env_creator': lambda **kwargs: create_envpool(**kwargs), @@ -428,6 +452,7 @@ def create_env(name, **kwargs): }, } + def get_env_info(env): result_shapes = {} result_shapes['observation_space'] = env.observation_space @@ -444,11 +469,12 @@ def get_env_info(env): result_shapes['observation_space'] = observation_space['observations'] result_shapes['state_space'] = observation_space['states'] ''' - if hasattr(env, "value_size"): + if hasattr(env, "value_size"): result_shapes['value_size'] = env.value_size print(result_shapes) return result_shapes + def get_obs_and_action_spaces_from_config(config): env_config = config.get('env_config', {}) env = configurations[config['env_name']]['env_creator'](**env_config) @@ -458,4 +484,4 @@ def get_obs_and_action_spaces_from_config(config): def register(name, config): - configurations[name] = config \ No newline at end of file + configurations[name] = config diff --git a/rl_games/common/player.py b/rl_games/common/player.py index 98be6501..70328baa 100644 --- a/rl_games/common/player.py +++ b/rl_games/common/player.py @@ -77,7 +77,8 @@ def __init__(self, params): self.device = torch.device(self.device_name) self.evaluation = self.player_config.get("evaluation", False) - self.update_checkpoint_freq = self.player_config.get("update_checkpoint_freq", 100) + self.update_checkpoint_freq = self.player_config.get( + "update_checkpoint_freq", 100) # if we run player as evaluation worker this will take care of loading new checkpoints self.dir_to_monitor = self.player_config.get("dir_to_monitor") # path to the newest checkpoint @@ -85,7 +86,8 @@ def __init__(self, params): if self.evaluation and self.dir_to_monitor is not None: self.checkpoint_mutex = threading.Lock() - self.eval_checkpoint_dir = os.path.join(self.dir_to_monitor, "eval_checkpoints") + self.eval_checkpoint_dir = os.path.join( + self.dir_to_monitor, "eval_checkpoints") os.makedirs(self.eval_checkpoint_dir, exist_ok=True) patterns = ["*.pth"] @@ -96,7 +98,8 @@ def __init__(self, params): self.file_events.on_modified = self.on_file_modified self.file_observer = Observer() - self.file_observer.schedule(self.file_events, self.dir_to_monitor, recursive=False) + self.file_observer.schedule( + self.file_events, self.dir_to_monitor, recursive=False) self.file_observer.start() def wait_for_checkpoint(self): @@ -109,7 +112,8 @@ def wait_for_checkpoint(self): with self.checkpoint_mutex: if self.checkpoint_to_load is not None: if attempt % 10 == 0: - print(f"Evaluation: waiting for new checkpoint in {self.dir_to_monitor}...") + print( + f"Evaluation: waiting for new checkpoint in {self.dir_to_monitor}...") break time.sleep(1.0) @@ -119,21 +123,24 @@ def maybe_load_new_checkpoint(self): # lock mutex while loading new checkpoint with self.checkpoint_mutex: if self.checkpoint_to_load is not None: - print(f"Evaluation: loading new checkpoint {self.checkpoint_to_load}...") + print( + f"Evaluation: loading new checkpoint {self.checkpoint_to_load}...") # try if we can load anything from the pth file, this will quickly fail if the file is corrupted # without triggering the retry loop in "safe_filesystem_op()" load_error = False try: 
torch.load(self.checkpoint_to_load) except Exception as e: - print(f"Evaluation: checkpoint file is likely corrupted {self.checkpoint_to_load}: {e}") + print( + f"Evaluation: checkpoint file is likely corrupted {self.checkpoint_to_load}: {e}") load_error = True if not load_error: try: self.restore(self.checkpoint_to_load) except Exception as e: - print(f"Evaluation: failed to load new checkpoint {self.checkpoint_to_load}: {e}") + print( + f"Evaluation: failed to load new checkpoint {self.checkpoint_to_load}: {e}") # whether we succeeded or not, forget about this checkpoint self.checkpoint_to_load = None @@ -146,7 +153,8 @@ def process_new_eval_checkpoint(self, path): # there is a chance that the file is changed/corrupted while we're copying it # not sure what we can do about this. In practice it never happened so far though try: - eval_checkpoint_path = os.path.join(self.eval_checkpoint_dir, basename(path)) + eval_checkpoint_path = os.path.join( + self.eval_checkpoint_dir, basename(path)) shutil.copyfile(path, eval_checkpoint_path) except Exception as e: print(f"Failed to copy {path} to {eval_checkpoint_path}: {e}") @@ -180,6 +188,13 @@ def _preproc_obs(self, obs_batch): def env_step(self, env, actions): if not self.is_tensor_obses: actions = actions.cpu().numpy() + + def check_if_numpy_array_is_scalar(arr): + return arr.size == 1 and arr.shape == () + + if check_if_numpy_array_is_scalar(actions): + actions = actions.item() + obs, rewards, dones, infos = env.step(actions) if hasattr(obs, 'dtype') and obs.dtype == np.float64: obs = np.float32(obs) @@ -364,9 +379,11 @@ def run(self): cur_rewards_done = cur_rewards/done_count cur_steps_done = cur_steps/done_count if print_game_res: - print(f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f} w: {game_res}') + print( + f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f} w: {game_res}') else: - print(f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f}') + print( + f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f}') sum_game_res += game_res if batch_size//self.num_agents == 1 or games_played >= n_games: diff --git a/rl_games/common/wrappers.py b/rl_games/common/wrappers.py index a62e0855..47dc6ba3 100644 --- a/rl_games/common/wrappers.py +++ b/rl_games/common/wrappers.py @@ -1,21 +1,20 @@ +from copy import copy +from gym import spaces +import gym +from collections import deque import numpy as np from numpy.random import randint import os os.environ.setdefault('PATH', '') -from collections import deque - -import gym -from gym import spaces -from copy import copy - class InfoWrapper(gym.Wrapper): def __init__(self, env): gym.RewardWrapper.__init__(self, env) - + self.reward = 0 + def reset(self, **kwargs): self.reward = 0 return self.env.reset(**kwargs) @@ -86,7 +85,7 @@ def __init__(self, env): """ gym.Wrapper.__init__(self, env) self.lives = 0 - self.was_real_done = True + self.was_real_done = True def step(self, action): obs, reward, done, info = self.env.step(action) @@ -116,12 +115,86 @@ def reset(self, **kwargs): return obs +class EpisodicLifeMarioEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on True game over. + Done by DeepMind for the DQN and co. since it helps value estimation. 
+ """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped._life + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped._life + return obs + + +class EpisodicLifeRandomMarioEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on True game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.env._life + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. 
+ """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.env._life + return obs + + class EpisodeStackedEnv(gym.Wrapper): def __init__(self, env): gym.Wrapper.__init__(self, env) self.max_stacked_steps = 1000 - self.current_steps=0 + self.current_steps = 0 def step(self, action): obs, reward, done, info = self.env.step(action) @@ -139,17 +212,19 @@ def step(self, action): class MaxAndSkipEnv(gym.Wrapper): - def __init__(self, env,skip=4, use_max = True): + def __init__(self, env, skip=4, use_max=True): """Return only every `skip`-th frame""" gym.Wrapper.__init__(self, env) - self.use_max = use_max + self.use_max = use_max # most recent raw observations (for max pooling across time steps) if self.use_max: - self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) + self._obs_buffer = np.zeros( + (2,)+env.observation_space.shape, dtype=np.uint8) else: - self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.float32) - self._skip = skip - + self._obs_buffer = np.zeros( + (2,)+env.observation_space.shape, dtype=np.float32) + self._skip = skip + def step(self, action): """Repeat action, sum reward, and max over last observations.""" total_reward = 0.0 @@ -157,8 +232,10 @@ def step(self, action): for i in range(self._skip): obs, reward, done, info = self.env.step(action) if self.use_max: - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs else: self._obs_buffer[0] = obs @@ -196,22 +273,24 @@ def __init__(self, env, width=84, height=84, grayscale=True): self.grayscale = grayscale if self.grayscale: self.observation_space = spaces.Box(low=0, high=255, - shape=(self.height, self.width, 1), dtype=np.uint8) + shape=(self.height, self.width, 1), dtype=np.uint8) else: self.observation_space = spaces.Box(low=0, high=255, - shape=(self.height, self.width, 3), dtype=np.uint8) + shape=(self.height, self.width, 3), dtype=np.uint8) def observation(self, frame): import cv2 if self.grayscale: frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) + frame = cv2.resize(frame, (self.width, self.height), + interpolation=cv2.INTER_AREA) if self.grayscale: frame = np.expand_dims(frame, -1) return frame + class FrameStack(gym.Wrapper): - def __init__(self, env, k, flat = False): + def __init__(self, env, k, flat=False): """ Stack k last frames. Returns lazy array, which is much more memory efficient. 
@@ -225,14 +304,17 @@ def __init__(self, env, k, flat = False): self.frames = deque([], maxlen=k) observation_space = env.observation_space self.shp = shp = observation_space.shape - #TODO: remove consts -1 and 1 + # TODO: remove consts -1 and 1 if flat: - self.observation_space = spaces.Box(low=-1, high=1, shape=(shp[:-1] + (shp[-1] * k,)), dtype=observation_space.dtype) + self.observation_space = spaces.Box( + low=-1, high=1, shape=(shp[:-1] + (shp[-1] * k,)), dtype=observation_space.dtype) else: if len(shp) == 1: - self.observation_space = spaces.Box(low=-1, high=1, shape=(k, shp[0]), dtype=observation_space.dtype) + self.observation_space = spaces.Box( + low=-1, high=1, shape=(k, shp[0]), dtype=observation_space.dtype) else: - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=observation_space.dtype) + self.observation_space = spaces.Box(low=0, high=255, shape=( + shp[:-1] + (shp[-1] * k,)), dtype=observation_space.dtype) def reset(self): ob = self.env.reset() @@ -251,17 +333,18 @@ def _get_ob(self): return np.squeeze(self.frames).flatten() else: if len(self.shp) == 1: - res = np.concatenate([f[..., np.newaxis] for f in self.frames], axis=-1) - #print('shape:', np.shape(res)) - #print('shape:', np.shape(np.transpose(res))) + res = np.concatenate([f[..., np.newaxis] + for f in self.frames], axis=-1) + # print('shape:', np.shape(res)) + # print('shape:', np.shape(np.transpose(res))) return np.transpose(res) else: return np.concatenate(self.frames, axis=-1) - #return LazyFrames(list(self.frames)) + # return LazyFrames(list(self.frames)) class BatchedFrameStack(gym.Wrapper): - def __init__(self, env, k, transpose = False, flatten = False): + def __init__(self, env, k, transpose=False, flatten=False): gym.Wrapper.__init__(self, env) self.k = k self.frames = deque([], maxlen=k) @@ -269,13 +352,16 @@ def __init__(self, env, k, transpose = False, flatten = False): self.transpose = transpose self.flatten = flatten if transpose: - assert(not flatten) - self.observation_space = spaces.Box(low=0, high=1, shape=(shp[0], k), dtype=env.observation_space.dtype) + assert (not flatten) + self.observation_space = spaces.Box(low=0, high=1, shape=( + shp[0], k), dtype=env.observation_space.dtype) else: if flatten: - self.observation_space = spaces.Box(low=0, high=1, shape=(k *shp[0],), dtype=env.observation_space.dtype) + self.observation_space = spaces.Box(low=0, high=1, shape=( + k * shp[0],), dtype=env.observation_space.dtype) else: - self.observation_space = spaces.Box(low=0, high=1, shape=(k, shp[0]), dtype=env.observation_space.dtype) + self.observation_space = spaces.Box(low=0, high=1, shape=( + k, shp[0]), dtype=env.observation_space.dtype) def reset(self): ob = self.env.reset() @@ -297,13 +383,15 @@ def _get_ob(self): frames = np.array(self.frames) shape = np.shape(frames) frames = np.transpose(self.frames, (1, 0, 2)) - frames = np.reshape(self.frames, (shape[1], shape[0] * shape[2])) + frames = np.reshape( + self.frames, (shape[1], shape[0] * shape[2])) else: frames = np.transpose(self.frames, (1, 0, 2)) return frames + class BatchedFrameStackWithStates(gym.Wrapper): - def __init__(self, env, k, transpose = False, flatten = False): + def __init__(self, env, k, transpose=False, flatten=False): gym.Wrapper.__init__(self, env) self.k = k self.obses = deque([], maxlen=k) @@ -313,16 +401,22 @@ def __init__(self, env, k, transpose = False, flatten = False): self.transpose = transpose self.flatten = flatten if transpose: - assert(not flatten) - 
self.observation_space = spaces.Box(low=0, high=1, shape=(shp[0], k), dtype=env.observation_space.dtype) - self.state_space = spaces.Box(low=0, high=1, shape=(state_shp[0], k), dtype=env.observation_space.dtype) + assert (not flatten) + self.observation_space = spaces.Box(low=0, high=1, shape=( + shp[0], k), dtype=env.observation_space.dtype) + self.state_space = spaces.Box(low=0, high=1, shape=( + state_shp[0], k), dtype=env.observation_space.dtype) else: if flatten: - self.observation_space = spaces.Box(low=0, high=1, shape=(k*shp[0],), dtype=env.observation_space.dtype) - self.state_space = spaces.Box(low=0, high=1, shape=(k*state_shp[0],), dtype=env.observation_space.dtype) + self.observation_space = spaces.Box(low=0, high=1, shape=( + k*shp[0],), dtype=env.observation_space.dtype) + self.state_space = spaces.Box(low=0, high=1, shape=( + k*state_shp[0],), dtype=env.observation_space.dtype) else: - self.observation_space = spaces.Box(low=0, high=1, shape=(k, shp[0]), dtype=env.observation_space.dtype) - self.state_space = spaces.Box(low=0, high=1, shape=(k, state_shp[0]), dtype=env.observation_space.dtype) + self.observation_space = spaces.Box(low=0, high=1, shape=( + k, shp[0]), dtype=env.observation_space.dtype) + self.state_space = spaces.Box(low=0, high=1, shape=( + k, state_shp[0]), dtype=env.observation_space.dtype) def reset(self): obs_dict = self.env.reset() @@ -345,7 +439,7 @@ def _get_ob(self): assert len(self.obses) == self.k obses = self.process_data(self.obses) states = self.process_data(self.states) - return {"obs": obses, "state" : states} + return {"obs": obses, "state": states} def process_data(self, data): if len(np.shape(data)) < 3: @@ -362,21 +456,23 @@ def process_data(self, data): obses = np.transpose(data, (1, 0, 2)) return obses + class ProcgenStack(gym.Wrapper): - def __init__(self, env, k = 2, greyscale=True): + def __init__(self, env, k=2, greyscale=True): gym.Wrapper.__init__(self, env) self.k = k self.curr_frame = 0 self.frames = deque([], maxlen=k) - self.greyscale=greyscale + self.greyscale = greyscale self.prev_frame = None shp = env.observation_space.shape if greyscale: shape = (shp[:-1] + (shp[-1] + k - 1,)) else: shape = (shp[:-1] + (shp[-1] * k,)) - self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=np.uint8) def reset(self): import cv2 @@ -384,7 +480,8 @@ def reset(self): self.frames.append(frames) if self.greyscale: - self.prev_frame = np.expand_dims(cv2.cvtColor(frames, cv2.COLOR_RGB2GRAY), axis=-1) + self.prev_frame = np.expand_dims( + cv2.cvtColor(frames, cv2.COLOR_RGB2GRAY), axis=-1) for _ in range(self.k-1): self.frames.append(self.prev_frame) else: @@ -399,7 +496,8 @@ def step(self, action): if self.greyscale: self.frames[self.k-1] = self.prev_frame - self.prev_frame = np.expand_dims(cv2.cvtColor(frames, cv2.COLOR_RGB2GRAY), axis=-1) + self.prev_frame = np.expand_dims( + cv2.cvtColor(frames, cv2.COLOR_RGB2GRAY), axis=-1) self.frames.append(frames) return self._get_ob(), reward, done, info @@ -413,13 +511,15 @@ def _get_ob(self): class ScaledFloatFrame(gym.ObservationWrapper): def __init__(self, env): gym.ObservationWrapper.__init__(self, env) - self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) + self.observation_space = gym.spaces.Box( + low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) def observation(self, observation): # careful! 
This undoes the memory optimization, use # with smaller replay buffers only. return np.array(observation).astype(np.float32) / 255.0 + class LazyFrames(object): def __init__(self, frames): """This object ensures that common frames between the observations are only stored once. @@ -448,6 +548,7 @@ def __len__(self): def __getitem__(self, i): return self._force()[i] + class ReallyDoneWrapper(gym.Wrapper): def __init__(self, env): """ @@ -456,7 +557,7 @@ def __init__(self, env): self.old_env = env gym.Wrapper.__init__(self, env) self.lives = 0 - self.was_real_done = True + self.was_real_done = True def step(self, action): old_lives = self.env.unwrapped.ale.lives() @@ -470,6 +571,7 @@ def step(self, action): done = lives == 0 return obs, reward, done, info + class AllowBacktracking(gym.Wrapper): """ Use deltas in max(X) as the reward, rather than deltas @@ -477,17 +579,18 @@ class AllowBacktracking(gym.Wrapper): from exploring backwards if there is no way to advance head-on in the level. """ + def __init__(self, env): super(AllowBacktracking, self).__init__(env) self._cur_x = 0 self._max_x = 0 - def reset(self, **kwargs): # pylint: disable=E0202 + def reset(self, **kwargs): # pylint: disable=E0202 self._cur_x = 0 self._max_x = 0 return self.env.reset(**kwargs) - def step(self, action): # pylint: disable=E0202 + def step(self, action): # pylint: disable=E0202 obs, rew, done, info = self.env.step(action) self._cur_x += rew rew = max(0, self._cur_x - self._max_x) @@ -505,6 +608,7 @@ def unwrap(env): else: return env + class StickyActionEnv(gym.Wrapper): def __init__(self, env, p=0.25): super(StickyActionEnv, self).__init__(env) @@ -553,6 +657,7 @@ class TimeLimit(gym.Wrapper): A little bit changed original openai's TimeLimit env. Main difference is that we always send true or false in infos['time_outs'] """ + def __init__(self, env, max_episode_steps=None): super(TimeLimit, self).__init__(env) self.concat_infos = True @@ -580,7 +685,7 @@ def __init__(self, env): self.observation_space = gym.spaces.Dict({ 'observation': self.env.observation_space, - 'reward': gym.spaces.Box(low=0, high=1, shape=( ), dtype=np.float32), + 'reward': gym.spaces.Box(low=0, high=1, shape=(), dtype=np.float32), 'last_action': gym.spaces.Box(low=0, high=self.env.action_space.n, shape=(), dtype=int) }) @@ -590,7 +695,7 @@ def step(self, action): obs, reward, done, info = self.env.step(action) obs = { 'observation': obs, - 'reward':np.clip(reward, -1, 1), + 'reward': np.clip(reward, -1, 1), 'last_action': action } return obs, reward, done, info @@ -610,6 +715,7 @@ class MaskVelocityWrapper(gym.ObservationWrapper): Gym environment observation wrapper used to mask velocity terms in observations. The intention is the make the MDP partially observatiable. 
""" + def __init__(self, env, name): super(MaskVelocityWrapper, self).__init__(env) if name == "CartPole-v1": @@ -624,29 +730,31 @@ def __init__(self, env, name): raise NotImplementedError def observation(self, observation): - return observation * self.mask + return observation * self.mask def make_atari(env_id, timelimit=True, noop_max=0, skip=4, sticky=False, directory=None, **kwargs): env = gym.make(env_id, **kwargs) if 'Montezuma' in env_id: - env = MontezumaInfoWrapper(env, room_address=3 if 'Montezuma' in env_id else 1) + env = MontezumaInfoWrapper( + env, room_address=3 if 'Montezuma' in env_id else 1) env = StickyActionEnv(env) env = InfoWrapper(env) if directory != None: - env = gym.wrappers.Monitor(env,directory=directory,force=True) + env = gym.wrappers.Monitor(env, directory=directory, force=True) if sticky: env = StickyActionEnv(env) if not timelimit: env = env.env - #assert 'NoFrameskip' in env.spec.id + # assert 'NoFrameskip' in env.spec.id if noop_max > 0: env = NoopResetEnv(env, noop_max=noop_max) env = MaxAndSkipEnv(env, skip=skip) - #env = EpisodeStackedEnv(env) + # env = EpisodeStackedEnv(env) return env -def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, scale =False, wrap_impala=False): + +def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, scale=False, wrap_impala=False): """Configure environment for DeepMind-style Atari. """ if episode_life: @@ -664,6 +772,7 @@ def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, env = ImpalaEnvWrapper(env) return env + def wrap_carracing(env, clip_rewards=True, frame_stack=True, scale=False): """Configure environment for DeepMind-style Atari. """ @@ -676,11 +785,13 @@ def wrap_carracing(env, clip_rewards=True, frame_stack=True, scale=False): env = FrameStack(env, 4) return env + def make_car_racing(env_id, skip=4): env = make_atari(env_id, noop_max=0, skip=skip) return wrap_carracing(env, clip_rewards=False) + def make_atari_deepmind(env_id, noop_max=30, skip=4, sticky=False, episode_life=True, wrap_impala=False, **kwargs): - env = make_atari(env_id, noop_max=noop_max, skip=skip, sticky=sticky, **kwargs) + env = make_atari(env_id, noop_max=noop_max, + skip=skip, sticky=sticky, **kwargs) return wrap_deepmind(env, episode_life=episode_life, clip_rewards=False, wrap_impala=wrap_impala) - diff --git a/rl_games/configs/mario/mario_v1.yaml b/rl_games/configs/mario/mario_v1.yaml index c788be97..c160acfb 100644 --- a/rl_games/configs/mario/mario_v1.yaml +++ b/rl_games/configs/mario/mario_v1.yaml @@ -7,76 +7,65 @@ params: name: discrete_a2c network: - name: actor_critic + name: resnet_actor_critic + require_rewards: False + require_last_actions: False separate: False + value_shape: 1 space: discrete: - + cnn: - #permute_input: False - type: conv2d - activation: elu + conv_depths: [32, 64, 128, 256] + activation: relu initializer: name: default - #name: glorot_normal_initializer - #gain: 1.4142 regularizer: - name: None - convs: - - filters: 32 - kernel_size: 8 - strides: 4 - padding: 0 - - filters: 64 - kernel_size: 4 - strides: 2 - padding: 0 - - filters: 64 - kernel_size: 3 - strides: 1 - padding: 0 + name: 'None' mlp: units: [512] - activation: elu + activation: relu + regularizer: + name: 'None' initializer: - name: orthogonal_initializer - gain: 1.41421356237 - + name: default + config: name: mario_ray env_name: 'SuperMarioBros-v1' - score_to_win: 20.0 + score_to_win: 100500 normalize_value: True - normalize_input: True + normalize_input: False 
reward_shaper: - min_val: -1 - max_val: 1 + scale_value: 1 normalize_advantage: True gamma: 0.99 tau: 0.95 grad_norm: 1.0 - entropy_coef: 0.01 + entropy_coef: 0.005 truncate_grads: True learning_rate: 3e-4 lr_schedule: adaptive kl_threshold: 0.01 grad_norm: 1.0 entropy_coef: 0.01 - truncate_grads: True e_clip: 0.2 clip_value: False num_actors: 64 horizon_length: 128 - minibatch_size: 2048 + # seq_length: 8 + minibatch_size: 4096 mini_epochs: 4 critic_coef: 2 - max_epochs: 500 + max_epochs: 5000 use_diagnostics: False player: render: True - games_num: 100 + games_num: 1 n_game_life: 1 - deterministic: True \ No newline at end of file + deterministic: False + use_vecenv: False + render_sleep: 0.05 \ No newline at end of file diff --git a/rl_games/configs/mario/mario_v1_random.yaml b/rl_games/configs/mario/mario_v1_random.yaml new file mode 100644 index 00000000..f74b0746 --- /dev/null +++ b/rl_games/configs/mario/mario_v1_random.yaml @@ -0,0 +1,72 @@ + +params: + seed: 322 + algo: + name: a2c_discrete + + model: + name: discrete_a2c + + network: + name: resnet_actor_critic + require_rewards: False + require_last_actions: False + separate: False + value_shape: 1 + space: + discrete: + + cnn: + conv_depths: [32, 64, 128, 256] + activation: relu + initializer: + name: default + regularizer: + name: 'None' + + mlp: + units: [512] + activation: relu + regularizer: + name: 'None' + initializer: + name: default + + config: + name: mario_ray + env_name: 'SuperMarioBrosRandomStages-v1' + score_to_win: 100500 + normalize_value: True + normalize_input: False + reward_shaper: + scale_value: 1 + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + grad_norm: 1.0 + entropy_coef: 0.005 + truncate_grads: True + learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.01 + grad_norm: 1.0 + entropy_coef: 0.01 + e_clip: 0.2 + clip_value: False + num_actors: 64 + horizon_length: 128 + # seq_length: 8 + minibatch_size: 4096 + mini_epochs: 4 + critic_coef: 2 + max_epochs: 5000 + use_diagnostics: False + + + player: + render: True + games_num: 1 + n_game_life: 1 + deterministic: False + use_vecenv: False + render_sleep: 0.05 \ No newline at end of file diff --git a/rl_games/configs/mario/mario_v2.yaml b/rl_games/configs/mario/mario_v2.yaml new file mode 100644 index 00000000..ac94451e --- /dev/null +++ b/rl_games/configs/mario/mario_v2.yaml @@ -0,0 +1,87 @@ +params: + seed: 322 + algo: + name: a2c_discrete + + model: + name: discrete_a2c + + network: + name: actor_critic + separate: False + space: + discrete: + + cnn: + type: conv2d + activation: elu + initializer: + name: glorot_normal_initializer + gain: 1.4142 + regularizer: + name: None + convs: + - filters: 32 + kernel_size: 8 + strides: 4 + padding: 0 + - filters: 64 + kernel_size: 4 + strides: 2 + padding: 0 + - filters: 64 + kernel_size: 3 + strides: 1 + padding: 0 + + mlp: + units: [512] + activation: elu + initializer: + name: orthogonal_initializer + gain: 1.41421356237 + # rnn: + # before_mlp: False + # name: lstm + # units: 512 + # layers: 1 + # layer_norm: True + + config: + name: mario_ray + env_name: 'SuperMarioBros-v1' + score_to_win: 2000000.0 + normalize_value: True + normalize_input: False + reward_shaper: + min_val: -1 + max_val: 1 + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + grad_norm: 0.5 + entropy_coef: 0.01 + truncate_grads: True + learning_rate: 5e-4 + lr_schedule: linear + seq_length: 16 + kl_threshold: 0.01 + grad_norm: 1.0 + entropy_coef: 0.01 + truncate_grads: True + e_clip: 0.2 + clip_value: False + num_actors: 64 + 
horizon_length: 256 + minibatch_size: 8192 + mini_epochs: 4 + critic_coef: 1 + max_epochs: 5000 + use_diagnostics: False + + + player: + render: True + games_num: 1 + n_game_life: 1 + deterministic: True \ No newline at end of file diff --git a/rl_games/configs/mario/mario_v3.yaml b/rl_games/configs/mario/mario_v3.yaml new file mode 100644 index 00000000..90361f86 --- /dev/null +++ b/rl_games/configs/mario/mario_v3.yaml @@ -0,0 +1,80 @@ +params: + seed: 322 + algo: + name: a2c_discrete + + model: + name: discrete_a2c + + network: + name: actor_critic + separate: False + space: + discrete: + + cnn: + #permute_input: False + type: conv2d + activation: elu + initializer: + name: default + #name: glorot_normal_initializer + #gain: 1.4142 + regularizer: + name: None + convs: + - filters: 32 + kernel_size: 8 + strides: 4 + padding: 0 + - filters: 64 + kernel_size: 4 + strides: 2 + padding: 0 + - filters: 64 + kernel_size: 3 + strides: 1 + padding: 0 + + mlp: + units: [512] + activation: elu + initializer: + name: orthogonal_initializer + gain: 1.41421356237 + + config: + name: mario_ray + env_name: 'SuperMarioBros-v1' + score_to_win: 100500 + normalize_value: True + normalize_input: False + reward_shaper: + scale_value: 1 + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + grad_norm: 1.0 + entropy_coef: 0.005 + truncate_grads: True + learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.01 + grad_norm: 1.0 + entropy_coef: 0.01 + e_clip: 0.2 + clip_value: False + num_actors: 64 + horizon_length: 128 + minibatch_size: 4096 + mini_epochs: 4 + critic_coef: 2 + max_epochs: 5000 + use_diagnostics: False + player: + render: True + games_num: 1 + n_game_life: 1 + deterministic: True + use_vecenv: False + render_sleep: 0.05 \ No newline at end of file
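
Editorial note (appended; not part of the patch above): the env_step() change in rl_games/common/player.py unwraps 0-d numpy actions into Python scalars before calling env.step(). A minimal sketch of just that conversion, assuming nothing beyond numpy and mirroring the helper added in the diff:

    import numpy as np

    # What a discrete policy can yield after `actions.cpu().numpy()` for a single env:
    action = np.array(2)            # 0-d array: shape == (), size == 1

    # The check added in BasePlayer.env_step(): unwrap 0-d arrays into plain scalars,
    # because some gym environments reject 0-d numpy arrays as discrete actions.
    if action.size == 1 and action.shape == ():
        action = action.item()      # -> int 2, safe to pass to env.step(action)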
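
Editorial note (appended; not part of the patch above): create_super_mario_env() now wraps the raw environment in EpisodicLifeMarioEnv (or EpisodicLifeRandomMarioEnv for the Random* ids) before MaxAndSkipEnv and wrap_deepmind. A usage sketch that simply repeats that composition outside the factory, assuming gym_super_mario_bros and nes_py are installed; all names are taken from the diff:

    import gym_super_mario_bros
    from nes_py.wrappers import JoypadSpace
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
    from rl_games.common import wrappers

    # Same wrapper order as the updated create_super_mario_env('SuperMarioBros-v1'):
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = wrappers.EpisodicLifeMarioEnv(env)   # new: a lost life terminates the episode
    env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                 frame_stack=True, scale=True)

    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())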
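
Editorial note (appended; not part of the patch above): the new configs under rl_games/configs/mario/ are ordinary rl_games YAML files, so they should be launchable through the existing torch runner. A sketch under that assumption — the file path comes from this diff, while Runner.load()/Runner.run() are the runner's existing entry points and the exact run arguments may differ between rl_games versions:

    import yaml
    from rl_games.torch_runner import Runner

    # Config file added by this patch.
    with open('rl_games/configs/mario/mario_v1_random.yaml') as f:
        config = yaml.safe_load(f)

    runner = Runner()
    runner.load(config)                          # builds algo/model/network from 'params'
    runner.run({'train': True, 'play': False})   # set 'play': True to evaluate a checkpoint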