diff --git a/rl_games/algos_torch/running_mean_std.py b/rl_games/algos_torch/running_mean_std.py
index 4cdad825..7a6e21df 100644
--- a/rl_games/algos_torch/running_mean_std.py
+++ b/rl_games/algos_torch/running_mean_std.py
@@ -81,7 +81,7 @@ def forward(self, input, denorm:bool=False, mask:Optional[torch.Tensor]=None):
             y = torch.sqrt(current_var.float() + self.epsilon)*y + current_mean.float()
         else:
             if self.norm_only:
-                y = input/ torch.sqrt(current_var.float() + self.epsilon)
+                y = input / torch.sqrt(current_var.float() + self.epsilon)
             else:
                 y = (input - current_mean.float()) / torch.sqrt(current_var.float() + self.epsilon)
                 y = torch.clamp(y, min=-5.0, max=5.0)
diff --git a/rl_games/configs/atari/ppo_breakout.yaml b/rl_games/configs/atari/ppo_breakout.yaml
index 95ceee8e..97b43674 100644
--- a/rl_games/configs/atari/ppo_breakout.yaml
+++ b/rl_games/configs/atari/ppo_breakout.yaml
@@ -17,7 +17,7 @@ params:
       activation: relu
       initializer:
         name: orthogonal_initializer
-        gain: 1.41421356237 
+        gain: 1.41421356237
       convs:
         - filters: 32
           kernel_size: 8
@@ -31,7 +31,7 @@ params:
           kernel_size: 3
           strides: 1
           padding: 0
-      
+
     mlp:
       units: [512]
       activation: relu
@@ -55,7 +55,7 @@ params:
     learning_rate: 8e-4
     lr_schedule: None
     kl_threshold: 0.01
-    
+
     grad_norm: 1.0
     entropy_coef: 0.01
     truncate_grads: True
diff --git a/rl_games/configs/atari/ppo_pacman_envpool_impala.yaml b/rl_games/configs/atari/ppo_pacman_envpool_impala.yaml
new file mode 100644
index 00000000..77ca4808
--- /dev/null
+++ b/rl_games/configs/atari/ppo_pacman_envpool_impala.yaml
@@ -0,0 +1,79 @@
+params:
+  algo:
+    name: a2c_discrete
+
+  model:
+    name: discrete_a2c
+
+  network:
+    name: resnet_actor_critic
+    require_rewards: True
+    require_last_actions: True
+    separate: False
+    value_shape: 1
+    space:
+      discrete:
+
+    cnn:
+      permute_input: False
+      conv_depths: [16, 32, 32]
+      activation: relu
+      initializer:
+        name: default
+      regularizer:
+        name: 'None'
+
+    mlp:
+      units: [512]
+      activation: relu
+      regularizer:
+        name: None
+      initializer:
+        name: default
+    rnn:
+      name: lstm
+      units: 256
+      layers: 1
+
+  config:
+    name: pacman_impala
+    env_name: envpool
+    normalize_advantage: True
+    normalize_input: False
+    normalize_value: False
+    reward_shaper:
+      min_val: -100
+      max_val: 100
+      #scale_value: 0.01
+    gamma: 0.995
+    tau: 0.95
+    learning_rate: 3e-4
+    score_to_win: 100000
+    grad_norm: 1.5
+    entropy_coef: 0.01
+    truncate_grads: True
+    e_clip: 0.2
+    clip_value: True
+    num_actors: 64
+    horizon_length: 128
+    minibatch_size: 2048
+    mini_epochs: 2
+    critic_coef: 1
+    lr_schedule: None
+    kl_threshold: 0.01
+    use_diagnostics: True
+    seq_length: 32
+    max_epochs: 200000
+
+    env_config:
+      env_name: MsPacman-v5
+      episodic_life: True
+      has_lives: True
+      use_dict_obs_space: True
+
+    player:
+      render: False
+      games_num: 20
+      n_game_life: 3
+      deterministic: True
+
diff --git a/rl_games/configs/atari/ppo_pacman_torch_rnn.yaml b/rl_games/configs/atari/ppo_pacman_torch_rnn.yaml
index 195e5af9..f8eee1ee 100644
--- a/rl_games/configs/atari/ppo_pacman_torch_rnn.yaml
+++ b/rl_games/configs/atari/ppo_pacman_torch_rnn.yaml
@@ -5,8 +5,6 @@ params:
 
   model:
     name: discrete_a2c
-
-
   network:
     name: actor_critic
     separate: False
@@ -18,7 +16,7 @@ params:
       activation: relu
      initializer:
        name: glorot_normal_initializer
-        gain: 1.4142 
+        gain: 1.4142
      regularizer:
        name: 'None'
      convs:
@@ -34,7 +32,6 @@ params:
           kernel_size: 3
           strides: 1
           padding: 0
-
     mlp:
       units: [512]
       activation: relu
@@ -54,7 +51,7 @@ params:
       #min_val: -1
       #max_val: 1
       scale_value: 1
-      
+
     normalize_advantage: True
     gamma: 0.99
     tau: 0.95
@@ -78,10 +75,12 @@ params:
     normalize_input: False
     normalize_value: True
     max_epochs: 50000
+
     env_config:
       skip: 4
       name: 'MsPacmanNoFrameskip-v4'
       episode_life: True
+
     player:
       render: True
       games_num: 10
diff --git a/rl_games/configs/atari/ppo_pong_envpool_resnet.yaml b/rl_games/configs/atari/ppo_pong_envpool_resnet.yaml
new file mode 100644
index 00000000..945fd8c2
--- /dev/null
+++ b/rl_games/configs/atari/ppo_pong_envpool_resnet.yaml
@@ -0,0 +1,83 @@
+params:
+  algo:
+    name: a2c_discrete
+
+  model:
+    name: discrete_a2c
+
+  network:
+    name: e2e_vision_actor_critic
+    separate: False
+    value_shape: 1
+    space:
+      discrete:
+
+    backbone:
+      type: resnet18 # can be efficientnet_v2_s #convnext_tiny #vit_b_16 #resnet18 #resnet34
+      pretrained: True
+      permute_input: False
+      freeze: False
+      preprocess_image: True
+      args:
+        zero_init_residual: True
+        norm_layer: None
+
+    mlp:
+      units: [512]
+      activation: relu
+      regularizer:
+        name: None
+      initializer:
+        name: default
+    rnn:
+      name: lstm
+      units: 512
+      layers: 1
+      before_mlp: True
+      concat_output: True
+
+  config:
+    name: Pong_resnet18_LSTM_MLP_512_concat_output_2e-4_linear_LR_norm
+    env_name: envpool
+    score_to_win: 20.0
+    mixed_precision: True
+    normalize_input: True
+    normalize_value: True
+    normalize_advantage: True
+    reward_shaper:
+      min_val: -1
+      max_val: 1
+    gamma: 0.99
+    tau: 0.95
+    grad_norm: 1.0
+    entropy_coef: 0.01
+    truncate_grads: True
+    e_clip: 0.2
+    clip_value: True
+    save_best_after: 25
+    save_frequency: 200
+    num_actors: 64
+    horizon_length: 128
+    minibatch_size: 2048
+    mini_epochs: 2
+    critic_coef: 1
+    learning_rate: 2e-4
+    lr_schedule: linear
+    kl_threshold: 0.01
+    use_diagnostics: True
+    seq_length: 8
+    max_epochs: 500
+    #weight_decay: 0.001
+
+    env_config:
+      env_name: Pong-v5
+      has_lives: False
+      use_dict_obs_space: False #True
+      stack_num: 1
+      gray_scale: False
+    player:
+      render: True
+      games_num: 10
+      n_game_life: 1
+      deterministic: True
+
diff --git a/rl_games/configs/maniskill/maniskill_ant.yaml b/rl_games/configs/maniskill/maniskill_ant.yaml
new file mode 100644
index 00000000..688e24fc
--- /dev/null
+++ b/rl_games/configs/maniskill/maniskill_ant.yaml
@@ -0,0 +1,65 @@
+params:
+  seed: 5
+  algo:
+    name: a2c_continuous
+
+  model:
+    name: continuous_a2c_logstd
+
+  network:
+    name: actor_critic
+    separate: False
+    space:
+      continuous:
+        mu_activation: None
+        sigma_activation: None
+        mu_init:
+          name: default
+        sigma_init:
+          name: const_initializer
+          val: 0
+        fixed_sigma: True
+    mlp:
+      units: [256, 128, 64]
+      activation: elu
+      initializer:
+        name: default
+
+  config:
+    name: AntRun
+    env_name: maniskill
+    normalize_input: True
+    normalize_value: True
+    value_bootstrap: True
+    reward_shaper:
+      scale_value: 1.0
+    normalize_advantage: True
+    gamma: 0.99
+    tau: 0.95
+
+    learning_rate: 3e-4
+    lr_schedule: adaptive
+    kl_threshold: 0.008
+    grad_norm: 1.0
+    entropy_coef: 0.0
+    truncate_grads: True
+    e_clip: 0.2
+    clip_value: True
+    use_smooth_clamp: True
+    bound_loss_type: regularisation
+    bounds_loss_coef: 0.0005
+    max_epochs: 1000
+    save_best_after: 25
+    save_frequency: 100
+    num_actors: 4096
+    horizon_length: 16
+    minibatch_size: 32768
+    mini_epochs: 4
+    critic_coef: 2
+
+    env_config:
+      env_name: MS-HumanoidRun-v1
+
+    player:
+      render: True
+      render_sleep: 0.0
\ No newline at end of file
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_impala.yaml b/rl_games/configs/maniskill/maniskill_pickcube_impala.yaml
index 8f3f4786..0b86d9f2 100644
--- a/rl_games/configs/maniskill/maniskill_pickcube_impala.yaml
+++ b/rl_games/configs/maniskill/maniskill_pickcube_impala.yaml
@@ -23,9 +23,9 @@ params:
       continuous:
         mu_activation: None
         sigma_activation: None
-
         mu_init:
           name: default
+          scale: 0.02
         sigma_init:
           name: const_initializer
           val: 0
@@ -35,25 +35,27 @@ params:
       conv_depths: [16, 32, 32]
       activation: relu
       initializer:
-        name: default
+        name: orthogonal_initializer
+        gain: 1.41421356237
       regularizer:
         name: None
     mlp:
-      units: [512, 256]
+      units: [256]
       activation: elu
       regularizer:
        name: None
      initializer:
        name: default
-    # rnn:
-    #   name: lstm
-    #   units: 512
-    #   layers: 1
-    #   before_mlp: True
-    #   concat_output: True
+    rnn:
+      name: lstm
+      layer_norm: True
+      units: 512
+      layers: 1
+      before_mlp: True
+      concat_output: True
 
   config:
-    name: PickCube_RGB_impala
+    name: PickCube_RGB_impala_lstm_init_2e-4_linear_lr
     env_name: maniskill
     reward_shaper:
       scale_value: 1.0
@@ -70,17 +72,17 @@ params:
       scale_value: 1.0
     gamma: 0.99
     tau : 0.95
-    learning_rate: 1e-4
-    lr_schedule: adaptive
+    learning_rate: 2e-4
+    lr_schedule: linear
     kl_threshold: 0.008
-    max_epochs: 10000
+    max_epochs: 20000
     save_best_after: 25
     save_frequency: 500
     grad_norm: 1.0
     entropy_coef: 0.0
     truncate_grads: True
     e_clip: 0.2
-    horizon_length: 16
+    horizon_length: 32
     minibatch_size: 2048
     mini_epochs: 2
     critic_coef: 1
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_impala_lstm.yaml b/rl_games/configs/maniskill/maniskill_pickcube_impala_lstm.yaml
new file mode 100644
index 00000000..a0a71480
--- /dev/null
+++ b/rl_games/configs/maniskill/maniskill_pickcube_impala_lstm.yaml
@@ -0,0 +1,102 @@
+params:
+  seed: 42
+
+  # environment wrapper clipping
+  env:
+    # added to the wrapper
+    clip_observations: 5.0
+    # can make custom wrapper?
+    clip_actions: 1.0
+  algo:
+    name: a2c_continuous
+
+  model:
+    name: continuous_a2c_logstd
+
+  network:
+    name: vision_actor_critic
+    require_rewards: False
+    require_last_actions: False
+    separate: False
+    value_shape: 1
+    space:
+      continuous:
+        mu_activation: None
+        sigma_activation: None
+
+        mu_init:
+          name: default
+        sigma_init:
+          name: const_initializer
+          val: 0
+        fixed_sigma: True
+    cnn:
+      permute_input: True
+      conv_depths: [16, 32, 32]
+      activation: relu
+      initializer:
+        name: default
+      regularizer:
+        name: None
+    mlp:
+      units: [256]
+      activation: elu
+      regularizer:
+        name: None
+      initializer:
+        name: default
+    rnn:
+      name: lstm
+      layer_norm: True
+      units: 512
+      layers: 1
+      before_mlp: True
+      concat_output: True
+
+  config:
+    name: PickCube_RGB_impala_LSTM_norm
+    env_name: maniskill
+    reward_shaper:
+      scale_value: 1.0
+    device: cuda:0
+    device_name: cuda:0
+    multi_gpu: False
+    ppo: True
+    mixed_precision: True
+    normalize_input: False
+    normalize_value: True
+    normalize_advantage: True
+    num_actors: 256
+    reward_shaper:
+      scale_value: 1.0
+    gamma: 0.99
+    tau : 0.95
+    learning_rate: 2e-4
+    lr_schedule: linear
+    kl_threshold: 0.008
+    max_epochs: 20000
+    save_best_after: 25
+    save_frequency: 500
+    grad_norm: 1.0
+    entropy_coef: 0.0
+    truncate_grads: True
+    e_clip: 0.2
+    horizon_length: 32
+    minibatch_size: 1024 #2048
+    mini_epochs: 2
+    critic_coef: 1
+    clip_value: True
+    seq_length: 8
+    bounds_loss_coef: 0.0001
+    #weight_decay: 0.001
+
+    env_config:
+      env_name: PickCube-v1
+      obs_mode: rgbd
+      control_mode: pd_ee_delta_pose
+      reward_mode: dense
+
+    player:
+      render: True
+      deterministic: True
+
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml b/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml
new file mode 100644
index 00000000..39ea525a
--- /dev/null
+++ b/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml
@@ -0,0 +1,103 @@
+params:
+  seed: 42
+
+  # environment wrapper clipping
+  env:
+    # added to the wrapper
+    clip_observations: 5.0
+    # can make custom wrapper?
+    clip_actions: 1.0
+  algo:
+    name: a2c_continuous
+
+  model:
+    name: continuous_a2c_logstd
+
+  network:
+    name: vision_actor_critic
+    require_rewards: False
+    require_last_actions: False
+    separate: False
+    value_shape: 1
+    space:
+      continuous:
+        mu_activation: None
+        sigma_activation: None
+        mu_init:
+          name: default
+          scale: 0.02
+        sigma_init:
+          name: const_initializer
+          val: 0
+        fixed_sigma: True
+    cnn:
+      permute_input: True
+      conv_depths: [8, 16, 16]
+      activation: relu
+      initializer:
+        name: orthogonal_initializer
+        gain: 1.41421356237
+      regularizer:
+        name: None
+    mlp:
+      units: [256]
+      activation: elu
+      regularizer:
+        name: None
+      initializer:
+        name: default
+    rnn:
+      name: lstm
+      layer_norm: True
+      units: 512
+      layers: 1
+      before_mlp: True
+      concat_output: True
+
+  config:
+    name: PickCube_RGB_Impala_Small_LSTM_norm_embedding
+    env_name: maniskill
+    reward_shaper:
+      scale_value: 1.0
+    device: cuda:0
+    device_name: cuda:0
+    multi_gpu: False
+    ppo: True
+    mixed_precision: True
+    normalize_input: False
+    normalize_value: True
+    normalize_advantage: True
+    num_actors: 256
+    reward_shaper:
+      scale_value: 1.0
+    gamma: 0.99
+    tau : 0.95
+    learning_rate: 2e-4
+    lr_schedule: None
+    kl_threshold: 0.008
+    max_epochs: 10000
+    save_best_after: 25
+    save_frequency: 500
+    grad_norm: 1.0
+    entropy_coef: 0.0
+    truncate_grads: True
+    e_clip: 0.2
+    horizon_length: 32
+    minibatch_size: 2048
+    mini_epochs: 2
+    critic_coef: 1
+    clip_value: True
+    seq_length: 8
+    bounds_loss_coef: 0.0001
+    #weight_decay: 0.001
+
+    env_config:
+      env_name: PickCube-v1
+      obs_mode: rgbd
+      control_mode: pd_ee_delta_pose
+      reward_mode: dense
+
+    player:
+      render: False
+      deterministic: True
+
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml b/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml
index 232d3b7c..0b05ff83 100644
--- a/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml
+++ b/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml
@@ -24,6 +24,7 @@ params:
 
         mu_init:
           name: default
+          scale: 0.02
         sigma_init:
           name: const_initializer
           val: 0
@@ -39,7 +40,7 @@ params:
         zero_init_residual: True
         norm_layer: None
     mlp:
-      units: [512, 256]
+      units: [512]
      activation: elu
      regularizer:
        name: None
@@ -71,16 +72,16 @@ params:
     gamma: 0.99
     tau : 0.95
     learning_rate: 1e-4
-    lr_schedule: adaptive
+    lr_schedule: None
     kl_threshold: 0.008
-    max_epochs: 20000
+    max_epochs: 50000
     save_best_after: 25
     save_frequency: 500
     grad_norm: 1.0
     entropy_coef: 0.0
     truncate_grads: True
     e_clip: 0.2
-    horizon_length: 16
+    horizon_length: 64
     minibatch_size: 512
     mini_epochs: 2
     critic_coef: 1
diff --git a/rl_games/networks/vision_networks.py b/rl_games/networks/vision_networks.py
index 634745d6..15e13de9 100644
--- a/rl_games/networks/vision_networks.py
+++ b/rl_games/networks/vision_networks.py
@@ -1,6 +1,7 @@
 import torch
 from torch import nn
 from torchvision import models
+from rl_games.algos_torch.running_mean_std import RunningMeanStd, RunningMeanStdObs
 import torch.nn.functional as F
 from rl_games.algos_torch import torch_ext
 from rl_games.algos_torch.network_builder import NetworkBuilder, ImpalaSequential
@@ -21,10 +22,13 @@ def __init__(self, params, **kwargs):
             if type(full_input_shape) is dict:
                 input_shape = full_input_shape['camera']
                 proprio_shape = full_input_shape['proprio']
+                proprio_size = proprio_shape[0]
             else:
                 input_shape = full_input_shape
 
+            self.normalize_emb = kwargs.pop('normalize_emb', False)
+
             self.num_seqs = kwargs.pop('num_seqs', 1)
             self.value_size = kwargs.pop('value_size', 1)
@@ -42,6 +46,9 @@ def __init__(self, params, **kwargs):
             else:
                 out_size = self.units[-1]
 
+            self.running_mean_std = torch.jit.script(RunningMeanStd((mlp_input_size,)))
+            self.layer_norm_emb = torch.nn.LayerNorm(mlp_input_size)
+
             if self.has_rnn:
                 if not self.is_rnn_before_mlp:
                     rnn_in_size = out_size
@@ -55,7 +62,7 @@ def __init__(self, params, **kwargs):
 
             mlp_args = {
                 'input_size' : mlp_input_size,
-                'units' :self.units,
+                'units' : self.units,
                 'activation' : self.activation,
                 'norm_func_name' : self.normalization,
                 'dense_func' : torch.nn.Linear
@@ -101,6 +108,14 @@ def __init__(self, params, **kwargs):
 
             mlp_init(self.value.weight)
 
+        def norm_emb(self, embedding):
+            #with torch.no_grad():
+            return self.running_mean_std(embedding) if self.normalize_emb else embedding
+            # if len(self.units) == 0:
+            #     out_size = cnn_output_size
+            # else:
+            #     out_size = self.units[-1]
+
         def forward(self, obs_dict):
             obs = obs_dict['obs']['camera']
             proprio = obs_dict['obs']['proprio']
@@ -117,6 +132,9 @@ def forward(self, obs_dict):
             out = self.flatten_act(out)
             out = torch.cat([out, proprio], dim=1)
 
+            #print('out shape: ', out.shape)
+            #out = self.norm_emb(out)
+            out = self.layer_norm_emb(out)
 
             if self.has_rnn:
                 seq_length = obs_dict.get('seq_length', 1)