From 305f17b7a9c86b9759b6637f6d1c5d1f0223d555 Mon Sep 17 00:00:00 2001
From: Denys Makoviichuk
Date: Sat, 19 Oct 2024 13:48:12 -0700
Subject: [PATCH] added humanoid

---
 rl_games/configs/mujoco/ant_envpool_moe.yaml |  6 +-
 .../configs/mujoco/humanoid_envpool_moe.yaml | 71 +++++++++++++++++++
 rl_games/networks/moe.py                     |  8 +--
 3 files changed, 78 insertions(+), 7 deletions(-)
 create mode 100644 rl_games/configs/mujoco/humanoid_envpool_moe.yaml

diff --git a/rl_games/configs/mujoco/ant_envpool_moe.yaml b/rl_games/configs/mujoco/ant_envpool_moe.yaml
index 814850cc..41d39dcd 100644
--- a/rl_games/configs/mujoco/ant_envpool_moe.yaml
+++ b/rl_games/configs/mujoco/ant_envpool_moe.yaml
@@ -21,9 +21,9 @@ params:
     num_experts: 4
     hidden_size: 256
     gating_hidden_size: 128
-    use_sparse_gating: true
-    use_entropy_loss: true
-    use_diversity_loss: true
+    use_sparse_gating: True
+    use_entropy_loss: True
+    use_diversity_loss: False
     top_k: 2
     lambda_entropy: 0.01
     lambda_diversity: 0.01
diff --git a/rl_games/configs/mujoco/humanoid_envpool_moe.yaml b/rl_games/configs/mujoco/humanoid_envpool_moe.yaml
new file mode 100644
index 00000000..95dc22df
--- /dev/null
+++ b/rl_games/configs/mujoco/humanoid_envpool_moe.yaml
@@ -0,0 +1,71 @@
+params:
+  seed: 5
+  algo:
+    name: a2c_continuous
+
+  model:
+    name: continuous_a2c_logstd
+
+  network:
+    name: moe
+    space:
+      continuous:
+        mu_activation: None
+        sigma_activation: None
+        mu_init:
+          name: default
+        sigma_init:
+          name: const_initializer
+          val: 0
+        fixed_sigma: True
+    num_experts: 4
+    hidden_size: 512
+    gating_hidden_size: 128
+    use_sparse_gating: True
+    use_entropy_loss: True
+    use_diversity_loss: False
+    top_k: 2
+    lambda_entropy: 0.01
+    lambda_diversity: 0.01
+
+  config:
+    name: Humanoid-v4_envpool_moe
+    env_name: envpool
+    score_to_win: 20000
+    normalize_input: True
+    normalize_value: True
+    value_bootstrap: True
+    normalize_advantage: True
+    reward_shaper:
+      scale_value: 1
+
+    gamma: 0.99
+    tau: 0.95
+    learning_rate: 3e-4
+    lr_schedule: adaptive
+    kl_threshold: 0.008
+    grad_norm: 1.0
+    entropy_coef: 0.0
+    truncate_grads: True
+    e_clip: 0.2
+    clip_value: True
+    use_smooth_clamp: True
+    bound_loss_type: regularisation
+    bounds_loss_coef: 0.0
+    max_epochs: 2000
+    num_actors: 64
+    horizon_length: 64
+    minibatch_size: 2048
+    mini_epochs: 4
+    critic_coef: 2
+
+    env_config:
+      env_name: Humanoid-v4
+      seed: 5
+      #flat_observation: True
+
+    player:
+      render: False
+      num_actors: 64
+      games_num: 1000
+      use_vecenv: True
\ No newline at end of file
diff --git a/rl_games/networks/moe.py b/rl_games/networks/moe.py
index 081699e8..37d33e79 100644
--- a/rl_games/networks/moe.py
+++ b/rl_games/networks/moe.py
@@ -81,9 +81,9 @@ def __init__(self, params, **kwargs):
         self.aux_loss_map = {
         }
         if self.use_diversity_loss:
-            self.aux_loss_map['diversity_loss'] = 0.0
+            self.aux_loss_map['moe_diversity_loss'] = 0.0
         if self.use_entropy_loss:
-            self.aux_loss_map['entropy_loss'] = 0.0
+            self.aux_loss_map['moe_entropy_loss'] = 0.0
 
     def is_rnn(self):
         return False
@@ -111,7 +111,7 @@ def forward(self, obs_dict):
         # Compute Entropy Loss for Gating Weights
         entropy = -torch.sum(gating_weights * torch.log(gating_weights + 1e-8), dim=1)
         entropy_loss = torch.mean(entropy)
-        self.aux_loss_map['entropy_loss'] = self.lambda_entropy * entropy_loss
+        self.aux_loss_map['moe_entropy_loss'] = self.lambda_entropy * entropy_loss
 
         # Expert Networks Forward Pass
         expert_outputs = []
@@ -129,7 +129,7 @@ def forward(self, obs_dict):
             diversity_loss += torch.mean(similarity)
         num_pairs = num_experts * (num_experts - 1) / 2
         diversity_loss = diversity_loss / num_pairs
-        self.aux_loss_map['diversity_loss'] = self.lambda_diversity * diversity_loss
+        self.aux_loss_map['moe_diversity_loss'] = self.lambda_diversity * diversity_loss
 
         # Aggregate Expert Outputs
         gating_weights = gating_weights.unsqueeze(-1)  # Shape: [batch_size, num_experts, 1]
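
Note for reviewers: this patch renames the aux-loss keys to moe_entropy_loss and moe_diversity_loss, so anything that reads aux_loss_map by the old names needs updating. The sketch below reproduces only the two auxiliary terms from moe.py so they can be sanity-checked in isolation. It is a minimal standalone approximation: the moe_aux_losses helper, the tensor shapes, and the top-k renormalization step are assumptions inferred from the hunks and from the config (use_sparse_gating: True, top_k: 2), not code from this repository.

    # Minimal sketch of the MoE auxiliary losses touched by this patch.
    # Assumed shapes (the full moe.py is not shown here): gating_logits is
    # [batch_size, num_experts]; expert_outputs is a list of num_experts
    # tensors of shape [batch_size, hidden_size].
    import torch
    import torch.nn.functional as F

    def moe_aux_losses(gating_logits, expert_outputs, top_k=2,
                       lambda_entropy=0.01, lambda_diversity=0.01):
        gating_weights = F.softmax(gating_logits, dim=1)

        # Sparse gating (assumed form): keep the top_k experts per sample
        # and renormalize so the kept weights sum to 1.
        topk_vals, topk_idx = torch.topk(gating_weights, top_k, dim=1)
        sparse = torch.zeros_like(gating_weights).scatter(1, topk_idx, topk_vals)
        gating_weights = sparse / sparse.sum(dim=1, keepdim=True)

        # Entropy of the gating distribution, as in the @@ -111 hunk.
        entropy = -torch.sum(gating_weights * torch.log(gating_weights + 1e-8), dim=1)
        moe_entropy_loss = lambda_entropy * torch.mean(entropy)

        # Mean pairwise cosine similarity between expert outputs, averaged
        # over num_experts * (num_experts - 1) / 2 pairs, as in the @@ -129 hunk.
        num_experts = len(expert_outputs)
        diversity_loss = torch.zeros((), device=gating_logits.device)
        for i in range(num_experts):
            for j in range(i + 1, num_experts):
                similarity = F.cosine_similarity(expert_outputs[i],
                                                 expert_outputs[j], dim=1)
                diversity_loss = diversity_loss + torch.mean(similarity)
        diversity_loss = diversity_loss / (num_experts * (num_experts - 1) / 2)
        moe_diversity_loss = lambda_diversity * diversity_loss

        return {'moe_entropy_loss': moe_entropy_loss,
                'moe_diversity_loss': moe_diversity_loss}

    # Example with the humanoid config's dimensions (4 experts, hidden_size 512):
    logits = torch.randn(8, 4)
    experts = [torch.randn(8, 512) for _ in range(4)]
    print(moe_aux_losses(logits, experts))

Both terms are scaled by their lambdas before being written into aux_loss_map, matching how the patch stores them, so downstream code can sum the map's values into the total loss directly.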