From 8b274d10d6c9a570f7c3e07c2f3b04f9b43c34ee Mon Sep 17 00:00:00 2001
From: ViktorM
Date: Sun, 8 Sep 2024 13:42:54 -0700
Subject: [PATCH] Layer norm for vision model. Better maniskill training
 configs.

---
 rl_games/configs/maniskill/maniskill_ant.yaml |  1 -
 .../maniskill_pickcube_impala_small_lstm.yaml |  4 +--
 .../maniskill/maniskill_pickcube_state.yaml   |  7 +++--
 .../maniskill/maniskill_pickcube_vision.yaml  | 27 ++++++++++---------
 rl_games/envs/maniskill.py                    | 25 ++-----------------
 rl_games/networks/vision_networks.py          |  9 ++++---
 6 files changed, 27 insertions(+), 46 deletions(-)

diff --git a/rl_games/configs/maniskill/maniskill_ant.yaml b/rl_games/configs/maniskill/maniskill_ant.yaml
index 688e24fc..063bd80c 100644
--- a/rl_games/configs/maniskill/maniskill_ant.yaml
+++ b/rl_games/configs/maniskill/maniskill_ant.yaml
@@ -36,7 +36,6 @@ params:
     normalize_advantage: True
     gamma: 0.99
     tau: 0.95
-    learning_rate: 3e-4
     lr_schedule: adaptive
     kl_threshold: 0.008
 
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml b/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml
index 39ea525a..01ed1a7f 100644
--- a/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml
+++ b/rl_games/configs/maniskill/maniskill_pickcube_impala_small_lstm.yaml
@@ -55,7 +55,7 @@ params:
       concat_output: True
 
   config:
-    name: PickCube_RGB_Impala_Small_LSTM_norm_embedding
+    name: PickCube_RGB_Impala_Small_LSTM_RMSnorm_embedding_128envs
     env_name: maniskill
     reward_shaper:
       scale_value: 1.0
@@ -67,7 +67,7 @@ params:
     normalize_input: False
     normalize_value: True
     normalize_advantage: True
-    num_actors: 256
+    num_actors: 128 #256
     reward_shaper:
       scale_value: 1.0
     gamma: 0.99
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_state.yaml b/rl_games/configs/maniskill/maniskill_pickcube_state.yaml
index 9779fb41..a3e60b29 100644
--- a/rl_games/configs/maniskill/maniskill_pickcube_state.yaml
+++ b/rl_games/configs/maniskill/maniskill_pickcube_state.yaml
@@ -36,7 +36,6 @@ params:
     normalize_advantage: True
     gamma: 0.99
     tau: 0.95
-    learning_rate: 3e-4
     lr_schedule: adaptive
     kl_threshold: 0.008
 
@@ -48,9 +47,9 @@ params:
     use_smooth_clamp: True
     bound_loss_type: regularisation
     bounds_loss_coef: 0.0005
-    max_epochs: 5000
+    max_epochs: 2000
     save_best_after: 25
-    save_frequency: 100
+    save_frequency: 500
     num_actors: 4096
     horizon_length: 16
     minibatch_size: 16384
@@ -58,7 +57,7 @@ params:
     critic_coef: 2
 
    env_config:
-      env_name: PickCube-v1 # todo: add list of all envs
+      env_name: PickCube-v1 # todo: add list of all envs
      observation_mode: state
      control_mode: pd_ee_delta_pose
      reward_mode: dense
diff --git a/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml b/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml
index 0b05ff83..0d8449e3 100644
--- a/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml
+++ b/rl_games/configs/maniskill/maniskill_pickcube_vision.yaml
@@ -40,21 +40,22 @@ params:
         zero_init_residual: True
         norm_layer: None
     mlp:
-      units: [512]
+      units: [256]
       activation: elu
       regularizer:
         name: None
       initializer:
         name: default
-    # rnn:
-    #   name: lstm
-    #   units: 512
-    #   layers: 1
-    #   before_mlp: True
-    #   concat_output: True
+    rnn:
+      name: lstm
+      layer_norm: True
+      units: 512
+      layers: 1
+      before_mlp: True
+      concat_output: True
 
   config:
-    name: PickCube_RGB_resnet18
+    name: PickCube_RGB_resnet18_LSTM_norm_embedding_64envs
    env_name: maniskill
    reward_shaper:
      scale_value: 1.0
@@ -66,22 +67,22 @@ params:
     normalize_input: False
     normalize_value: True
     normalize_advantage: True
-    num_actors: 128
+    num_actors: 64 #128
     reward_shaper:
       scale_value: 1.0
     gamma: 0.99
     tau : 0.95
     learning_rate: 1e-4
-    lr_schedule: None
+    lr_schedule: linear
     kl_threshold: 0.008
-    max_epochs: 50000
+    max_epochs: 20000
     save_best_after: 25
     save_frequency: 500
     grad_norm: 1.0
     entropy_coef: 0.0
     truncate_grads: True
     e_clip: 0.2
-    horizon_length: 64
+    horizon_length: 128
     minibatch_size: 512
     mini_epochs: 2
     critic_coef: 1
@@ -97,6 +98,6 @@ params:
       reward_mode: dense
 
     player:
-      render: True
+      render: False
       deterministic: True
 
diff --git a/rl_games/envs/maniskill.py b/rl_games/envs/maniskill.py
index d31a4695..fa472bcd 100644
--- a/rl_games/envs/maniskill.py
+++ b/rl_games/envs/maniskill.py
@@ -126,7 +126,7 @@ def __init__(self, config_name, num_envs, **kwargs):
             robot_uids=self.robot_uids,
             enable_shadow=True # this makes the default lighting cast shadows
         )
-        
+
         print("Observation Space Before:", self.env.observation_space)
         policy_obs_space = self.env.unwrapped.single_observation_space
         print("Observation Space Unwrapped Before:", policy_obs_space)
@@ -181,22 +181,6 @@ def __init__(self, config_name, num_envs, **kwargs):
         else:
             self.observation_space = gym.spaces.Box(-self._clip_obs, self._clip_obs, policy_obs_space.shape)
 
-        # if isinstance(critic_obs_space, gymnasium.spaces.Dict):
-        #     # check if we have a dictionary of observations
-        #     for key in critic_obs_space.keys():
-        #         if not isinstance(critic_obs_space[key], gymnasium.spaces.Box):
-        #             raise NotImplementedError(
-        #                 f"Dictinary of dictinary observations support has not been tested yet: '{type(policy_obs_space[key])}'."
-        #             )
-        #     self.state_observation_space = gym.spaces.Dict(
-        #         {
-        #             key: gym.spaces.Box(-self._clip_obs, self._clip_obs, critic_obs_space[key].shape)
-        #             for key in critic_obs_space.keys()
-        #         }
-        #     )
-        #     else:
-        #         self.observation_space = gym.spaces.Box(-self._clip_obs, self._clip_obs, policy_obs_space.shape)
-
         self._clip_actions = 1.0
 
         action_space = self.env.unwrapped.single_action_space
@@ -213,10 +197,8 @@ def step(self, actions):
         actions = torch.clamp(actions, -self._clip_actions, self._clip_actions)
 
         obs_dict, rew, terminated, truncated, extras = self.env.step(actions)
-        #self.env.render_human()
 
         # move time out information to the extras dict
-        # this is only needed for infinite horizon tasks
-        # note: only useful when `value_bootstrap` is True in the agent configuration
+        # note: only used when `value_bootstrap` is True in the agent configuration
 
         extras["time_outs"] = truncated
@@ -240,7 +222,6 @@ def step(self, actions):
         if "log" in extras:
             extras["episode"] = extras.pop("log")
 
-        # TODO: revisit success calculation
         if "success" in extras:
             extras["successes"] = extras["success"].float().mean()
 
@@ -251,8 +232,6 @@ def reset(self):
 
         obs = self.env.reset()
 
-        print("obs:", obs[0].keys())
-
         obs_dict = {'obs': obs[0]}
 
         # if self.obs_mode == 'rgbd':
diff --git a/rl_games/networks/vision_networks.py b/rl_games/networks/vision_networks.py
index 15e13de9..0e005f1e 100644
--- a/rl_games/networks/vision_networks.py
+++ b/rl_games/networks/vision_networks.py
@@ -48,13 +48,14 @@ def __init__(self, params, **kwargs):
             self.running_mean_std = torch.jit.script(RunningMeanStd((mlp_input_size,)))
 
         self.layer_norm_emb = torch.nn.LayerNorm(mlp_input_size)
+        #self.layer_norm_emb = torch.nn.RMSNorm(mlp_input_size)
 
         if self.has_rnn:
             if not self.is_rnn_before_mlp:
                 rnn_in_size = out_size
                 out_size = self.rnn_units
             else:
-                rnn_in_size = mlp_input_size 
+                rnn_in_size = mlp_input_size
                 mlp_input_size = self.rnn_units
 
             self.rnn = self._build_rnn(self.rnn_name, rnn_in_size, self.rnn_units, self.rnn_layers)
@@ -132,8 +133,6 @@ def forward(self, obs_dict):
         out = self.flatten_act(out)
         out = torch.cat([out, proprio], dim=1)
 
-        #print('out shape: ', out.shape)
-        #out = self.norm_emb(out)
         out = self.layer_norm_emb(out)
 
         if self.has_rnn:
@@ -295,6 +294,8 @@ def __init__(self, params, **kwargs):
         else:
             out_size = self.units[-1]
 
+        self.layer_norm_emb = torch.nn.LayerNorm((mlp_input_size,))
+
         if self.has_rnn:
             if not self.is_rnn_before_mlp:
                 rnn_in_size = out_size
@@ -376,6 +377,8 @@ def forward(self, obs_dict):
         if self.proprio_size > 0:
             out = torch.cat([out, proprio], dim=1)
 
+        out = self.layer_norm_emb(out)
+
         if self.has_rnn:
             seq_length = obs_dict.get('seq_length', 1)
 
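
Note on the vision-network change: the hunks in rl_games/networks/vision_networks.py apply a LayerNorm to the concatenated
backbone-plus-proprioception embedding before it reaches the MLP/LSTM head, with an RMSNorm variant left commented out, and
the updated maniskill_pickcube_vision.yaml enables layer_norm: True on the LSTM. The snippet below is only a minimal,
self-contained sketch of that idea, not the rl_games classes; the name NormalizedVisionEmbedding, its constructor arguments,
and the example sizes are invented for illustration.

import torch
import torch.nn as nn


class NormalizedVisionEmbedding(nn.Module):
    """Sketch: fuse CNN features with proprio state and layer-normalize before the MLP."""

    def __init__(self, cnn_out_size: int, proprio_size: int, mlp_units: int = 256):
        super().__init__()
        mlp_input_size = cnn_out_size + proprio_size
        # Normalize the fused embedding so the image and state streams share a scale.
        # The patch also sketches an RMSNorm alternative (torch.nn.RMSNorm in recent PyTorch).
        self.layer_norm_emb = nn.LayerNorm(mlp_input_size)
        self.mlp = nn.Sequential(nn.Linear(mlp_input_size, mlp_units), nn.ELU())

    def forward(self, cnn_features: torch.Tensor, proprio: torch.Tensor) -> torch.Tensor:
        out = torch.cat([cnn_features, proprio], dim=1)  # concatenate vision and proprio features
        out = self.layer_norm_emb(out)                   # normalize before the MLP (or LSTM) head
        return self.mlp(out)


if __name__ == "__main__":
    # Hypothetical sizes: a 512-dim resnet18 embedding and a 29-dim proprio vector.
    emb = NormalizedVisionEmbedding(cnn_out_size=512, proprio_size=29)
    features = emb(torch.randn(8, 512), torch.randn(8, 29))
    print(features.shape)  # torch.Size([8, 256])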