diff --git a/rl_games/algos_torch/network_builder.py b/rl_games/algos_torch/network_builder.py index 0eba37b0..59059e42 100644 --- a/rl_games/algos_torch/network_builder.py +++ b/rl_games/algos_torch/network_builder.py @@ -654,7 +654,7 @@ def __init__(self, params, **kwargs): rnn_in_size += actions_num self.rnn = self._build_rnn(self.rnn_name, rnn_in_size, self.rnn_units, self.rnn_layers) - #self.layer_norm = torch.nn.LayerNorm(self.rnn_units) + self.layer_norm = torch.nn.LayerNorm(self.rnn_units) mlp_args = { 'input_size' : mlp_input_size, diff --git a/rl_games/common/ivecenv.py b/rl_games/common/ivecenv.py index 97c43cb4..49be387b 100644 --- a/rl_games/common/ivecenv.py +++ b/rl_games/common/ivecenv.py @@ -34,3 +34,6 @@ def get_env_state(self): def set_env_state(self, env_state): pass + + def render(self, mode, **kwargs): + pass \ No newline at end of file diff --git a/rl_games/common/vecenv.py b/rl_games/common/vecenv.py index 66255ac9..104663c1 100644 --- a/rl_games/common/vecenv.py +++ b/rl_games/common/vecenv.py @@ -46,8 +46,8 @@ def seed(self, seed): random.seed(seed) self.env.seed(seed) - def render(self): - self.env.render() + def render(self, **kwargs): + self.env.render(**kwargs) def reset(self): obs = self.env.reset() @@ -72,6 +72,9 @@ def can_concat_infos(self): else: return False + def render(self, mode, **kwargs): + self.env.render(mode, **kwargs) + def get_env_info(self): info = {} observation_space = self.env.observation_space @@ -215,6 +218,10 @@ def reset(self): ret_obs = newobsdict return ret_obs + def render(self, mode, **kwargs): + res = self.workers[0].render.remote(mode, **kwargs) + return self.ray.get(res) + vecenv_config = {} def register(config_name, func): diff --git a/rl_games/common/wrappers.py b/rl_games/common/wrappers.py index 2e9355d6..38b20d26 100644 --- a/rl_games/common/wrappers.py +++ b/rl_games/common/wrappers.py @@ -93,12 +93,14 @@ def step(self, action): # check current lives, make loss of life terminal, # then update lives to handle bonus lives lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + if lives < self.lives: # for Qbert sometimes we stay in lives == 0 condition for a few frames # so it's important to keep lives > 0, so that we only reset once # the environment advertises done. done = True - self.lives = lives + elif lives > self.lives: + # do not allow use of bonus life + self.lives = lives return obs, reward, done, info def reset(self, **kwargs): @@ -115,20 +117,21 @@ def reset(self, **kwargs): return obs class EpisodicLifeMarioEnv(gym.Wrapper): - def __init__(self, env): + def __init__(self, env, max_lives): """Make end-of-life == end-of-episode, but only reset on True game over. Done by DeepMind for the DQN and co. since it helps value estimation. 
""" gym.Wrapper.__init__(self, env) self.lives = 0 self.was_real_done = True + self.max_lives = max_lives def step(self, action): obs, reward, done, info = self.env.step(action) self.was_real_done = done # check current lives, make loss of life terminal, # then update lives to handle bonus lives - lives = self.env.unwrapped.env._life + lives = self.env.unwrapped._life if lives < self.lives: # for Qbert sometimes we stay in lives == 0 condition for a few frames # so it's important to keep lives > 0, so that we only reset once @@ -146,42 +149,7 @@ def reset(self, **kwargs): """ if self.was_real_done: obs = self.env.reset(**kwargs) - else: - # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) - self.lives = self.env.unwrapped.env._life - return obs - -class EpisodicLifeMarioEnv(gym.Wrapper): - def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on True game over. - Done by DeepMind for the DQN and co. since it helps value estimation. - """ - gym.Wrapper.__init__(self, env) - self.lives = 0 - self.was_real_done = True - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - # check current lives, make loss of life terminal, - # then update lives to handle bonus lives - lives = self.env.unwrapped._life - if lives < self.lives and lives > 0: - # for Qbert sometimes we stay in lives == 0 condition for a few frames - # so it's important to keep lives > 0, so that we only reset once - # the environment advertises done. - done = True - self.lives = lives - return obs, reward, done, info - - def reset(self, **kwargs): - """Reset only when lives are exhausted. - This way all states are still reachable even though lives are episodic, - and the learner need not know about any of this behind-the-scenes. - """ - if self.was_real_done: - obs = self.env.reset(**kwargs) + self.env.unwrapped.ram[0x075a] = self.max_lives else: # no-op step to advance from terminal/lost life state obs, _, _, _ = self.env.step(0) @@ -225,6 +193,28 @@ def reset(self, **kwargs): self.lives = self.env.unwrapped.env._life return obs +class PreventSlugEnv(gym.Wrapper): + def __init__(self, env, max_no_rewards=10000): + """Abort if too much time without getting reward.""" + MyWrapper.__init__(self, env) + self.last_reward = 0 + self.steps = 0 + self.max_no_rewards = max_no_rewards + self.got_reward = False + + def step(self, *args, **kwargs): + obs, reward, done, info = self.env.step(*args, **kwargs) + self.steps += 1 + if reward > 0: + self.last_reward = self.steps + if self.steps - self.last_reward > self.max_no_rewards: + done = True + return obs, reward, done, info + + def reset(self): + self.got_reward = False + self.steps = 0 + return self.env.reset() class EpisodeStackedEnv(gym.Wrapper): def __init__(self, env): @@ -791,14 +781,14 @@ def make_atari(env_id, timelimit=True, noop_max=0, skip=4, sticky=False, directo return env -def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, scale=False, wrap_impala=False): +def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, scale=False, wrap_impala=False, gray=True): """Configure environment for DeepMind-style Atari. 
""" if episode_life: env = EpisodicLifeEnv(env) if 'FIRE' in env.unwrapped.get_action_meanings(): env = FireResetEnv(env) - env = WarpFrame(env) + env = WarpFrame(env, grayscale=gray) if scale: env = ScaledFloatFrame(env) if clip_rewards: diff --git a/rl_games/configs/mario/mario_resnet.yaml b/rl_games/configs/mario/mario_resnet.yaml index 6ad79a54..97e3b2d6 100644 --- a/rl_games/configs/mario/mario_resnet.yaml +++ b/rl_games/configs/mario/mario_resnet.yaml @@ -20,55 +20,54 @@ params: activation: relu initializer: name: default - regularizer: - name: 'None' + mlp: units: [512] activation: relu - regularizer: - name: 'None' initializer: - name: default + name: orthogonal_initializer + gain: 1.41421356237 rnn: name: lstm units: 256 layers: 1 + #layer_norm: True config: reward_shaper: - min_val: -1 - max_val: 1 + scale_value: 1 normalize_advantage: True gamma: 0.995 tau: 0.95 - learning_rate: 3e-4 + learning_rate: 5e-4 name: mario_resnet score_to_win: 100000 - grad_norm: 1.5 - entropy_coef: 0.01 + grad_norm: 1.0 + entropy_coef: 0.005 truncate_grads: True env_name: MarioEnv e_clip: 0.2 clip_value: True num_actors: 16 - horizon_length: 256 - minibatch_size: 2048 - mini_epochs: 2 - critic_coef: 1 - lr_schedule: None - kl_threshold: 0.01 + horizon_length: 512 + minibatch_size: 4096 + mini_epochs: 3 + critic_coef: 2 + lr_schedule: None #adaptive + kl_threshold: 0.008 normalize_input: False + normalize_value: True use_diagnostics: True seq_length: 32 max_epochs: 200000 - + weight_decay: 0.0001 + save_frequency: 50 env_config: use_dict_obs_space: True player: - render: False + render: True games_num: 20 - n_game_life: 5 - deterministic: True - + deterministic: False + use_vecenv: True diff --git a/rl_games/configs/mario/mario_v1_random.yaml b/rl_games/configs/mario/mario_v1_random.yaml index f74b0746..205bf235 100644 --- a/rl_games/configs/mario/mario_v1_random.yaml +++ b/rl_games/configs/mario/mario_v1_random.yaml @@ -8,33 +8,45 @@ params: name: discrete_a2c network: - name: resnet_actor_critic - require_rewards: False - require_last_actions: False + name: actor_critic separate: False - value_shape: 1 space: discrete: - + cnn: - conv_depths: [32, 64, 128, 256] - activation: relu + #permute_input: False + type: conv2d + activation: elu initializer: name: default + #name: glorot_normal_initializer + #gain: 1.4142 regularizer: - name: 'None' + name: None + convs: + - filters: 32 + kernel_size: 8 + strides: 4 + padding: 0 + - filters: 64 + kernel_size: 4 + strides: 2 + padding: 0 + - filters: 64 + kernel_size: 3 + strides: 1 + padding: 0 mlp: units: [512] - activation: relu - regularizer: - name: 'None' + activation: elu initializer: - name: default + name: orthogonal_initializer + gain: 1.41421356237 config: name: mario_ray - env_name: 'SuperMarioBrosRandomStages-v1' + env_name: MarioEnv score_to_win: 100500 normalize_value: True normalize_input: False @@ -53,8 +65,8 @@ params: entropy_coef: 0.01 e_clip: 0.2 clip_value: False - num_actors: 64 - horizon_length: 128 + num_actors: 16 + horizon_length: 512 # seq_length: 8 minibatch_size: 4096 mini_epochs: 4 @@ -65,8 +77,8 @@ params: player: render: True - games_num: 1 - n_game_life: 1 + games_num: 2 + n_game_life: 16 deterministic: False use_vecenv: False render_sleep: 0.05 \ No newline at end of file diff --git a/rl_games/envs/mario.py b/rl_games/envs/mario.py index ddf84d6e..3c69d6a0 100644 --- a/rl_games/envs/mario.py +++ b/rl_games/envs/mario.py @@ -5,7 +5,6 @@ class MarioEnv(gym.Env): def __init__(self, **kwargs): env_name=kwargs.pop('env_name', 
'SuperMarioBros-v1')
-        self.has_lives = kwargs.pop('has_lives', True)
         self.max_lives = kwargs.pop('max_lives', 16)
         self.movement = kwargs.pop('movement', 'SIMPLE')
         self.use_dict_obs_space = kwargs.pop('use_dict_obs_space', False)
@@ -26,16 +25,16 @@ def _create_super_mario_env(self, name='SuperMarioBros-v1'):
         from nes_py.wrappers import JoypadSpace
         from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
         import gym_super_mario_bros
-        movement = SIMPLE_MOVEMENT if name == 'SIMPLE' else COMPLEX_MOVEMENT
+        movement = SIMPLE_MOVEMENT if self.movement == 'SIMPLE' else COMPLEX_MOVEMENT
         env = gym_super_mario_bros.make(name)
         env = JoypadSpace(env, movement)
         if 'Random' in name:
             env = wrappers.EpisodicLifeRandomMarioEnv(env)
         else:
-            env = wrappers.EpisodicLifeMarioEnv(env)
+            env = wrappers.EpisodicLifeMarioEnv(env, self.max_lives)
         env = wrappers.MaxAndSkipEnv(env, skip=4)
         env = wrappers.wrap_deepmind(
-            env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True)
+            env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True, gray=False)
         return env
 
     def step(self, action):
@@ -43,14 +42,14 @@ def step(self, action):
         if self.use_dict_obs_space:
             next_obs = {
                 'observation': next_obs,
-                'reward': np.clip(np.array(reward, dtype=float), -1, 1),
+                'reward': np.array(reward, dtype=float),
                 'last_action': np.array(action, dtype=int)
             }
         return next_obs, reward, is_done, info
 
     def reset(self):
         obs = self.env.reset()
-        self.env.unwrapped.ram[0x075a] = self.max_lives
+
         if self.use_dict_obs_space:
             obs = {
                 'observation': obs,
@@ -59,5 +58,8 @@ def reset(self):
             }
         return obs
 
+    def render(self, mode, **kwargs):
+        return self.env.render(mode, **kwargs)
+
     def get_number_of_agents(self):
         return 1
\ No newline at end of file
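
Usage note (illustrative, not part of the patch): the snippet below is a minimal sketch of driving the updated MarioEnv directly, assuming gym-super-mario-bros and nes-py are installed. The constructor kwargs mirror the ones popped in MarioEnv.__init__ above, and the dict observation keys and the render(mode, **kwargs) pass-through are the ones introduced in this diff.

# Minimal sketch: exercise the updated MarioEnv by hand.
# Assumes gym-super-mario-bros and nes-py are installed; kwargs mirror
# the ones popped in MarioEnv.__init__ (env_name, movement, max_lives,
# use_dict_obs_space).
from rl_games.envs.mario import MarioEnv

env = MarioEnv(env_name='SuperMarioBros-v1',
               movement='SIMPLE',
               max_lives=16,
               use_dict_obs_space=True)

obs = env.reset()
# With use_dict_obs_space=True, observations are dicts with
# 'observation', 'reward' and 'last_action' entries (see step()/reset() above).

for _ in range(100):
    # Action 0 is NOOP in SIMPLE_MOVEMENT; a trained player would supply
    # actions from its policy instead.
    obs, reward, done, info = env.step(0)
    env.render('human')  # forwarded through the render(mode, **kwargs) added in this diff
    if done:
        obs = env.reset()

In the vectorized path, the RayVecEnv.render added in vecenv.py forwards the same mode argument to the first worker's environment.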