feature(rjy): add crowd md env new, and multi-head policy #230

Open · wants to merge 21 commits into base: main
11 changes: 7 additions & 4 deletions lzero/agent/efficientzero.py
@@ -110,6 +110,9 @@ def __init__(
elif self.cfg.policy.model.model_type == 'conv':
from lzero.model.efficientzero_model import EfficientZeroModel
model = EfficientZeroModel(**self.cfg.policy.model)
elif self.cfg.policy.model.model_type == 'mlp_md':
nighood marked this conversation as resolved.
from lzero.model.efficientzero_model_md import EfficientZeroModelMD
model = EfficientZeroModelMD(**self.cfg.policy.model)
else:
raise NotImplementedError
if self.cfg.policy.cuda and torch.cuda.is_available():
@@ -124,8 +127,8 @@ def __init__(
self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env)

def train(
self,
step: int = int(1e7),
self,
step: int = int(1e7),
) -> TrainingReturn:
"""
Overview:
@@ -356,8 +359,8 @@ def deploy(
return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list))

def batch_evaluate(
self,
n_evaluator_episode: int = None,
self,
n_evaluator_episode: int = None,
) -> EvalReturn:
"""
Overview:
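Note on the efficientzero.py change above: model selection keys entirely on cfg.policy.model.model_type, so the new 'mlp_md' branch is reached through config alone. A minimal sketch of that path (all kwargs below are placeholders, not values from this PR):

from easydict import EasyDict

# Sketch: the dispatch keys on cfg.policy.model.model_type; 'mlp_md' is the
# new multi-discrete branch. Every kwarg here is a placeholder.
model_cfg = EasyDict(
    model_type='mlp_md',
    observation_shape=16,          # hypothetical flat observation size
    action_space_size=[3, 3, 3],   # hypothetical multi-discrete action space
)

if model_cfg.model_type == 'mlp_md':
    from lzero.model.efficientzero_model_md import EfficientZeroModelMD
    model = EfficientZeroModelMD(**model_cfg)
else:
    raise NotImplementedError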
14 changes: 10 additions & 4 deletions lzero/agent/muzero.py
@@ -110,6 +110,12 @@ def __init__(
elif self.cfg.policy.model.model_type == 'conv':
from lzero.model.muzero_model import MuZeroModel
model = MuZeroModel(**self.cfg.policy.model)
elif self.cfg.policy.model.model_type == 'rgcn':
from lzero.model.muzero_model_gcn import MuZeroModelGCN
model = MuZeroModelGCN(**self.cfg.policy.model)
elif self.cfg.policy.model.model_type == 'mlp_md':
from lzero.model.muzero_model_md import MuZeroModelMD
model = MuZeroModelMD(**self.cfg.policy.model)
else:
raise NotImplementedError
if self.cfg.policy.cuda and torch.cuda.is_available():
@@ -124,8 +130,8 @@ def __init__(
self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env)

def train(
self,
step: int = int(1e7),
self,
step: int = int(1e7),
) -> TrainingReturn:
"""
Overview:
@@ -356,8 +362,8 @@ def deploy(
return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list))

def batch_evaluate(
self,
n_evaluator_episode: int = None,
self,
n_evaluator_episode: int = None,
) -> EvalReturn:
"""
Overview:
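A design note on the muzero.py hunk above: the elif chain now covers four model types ('mlp', 'conv', 'rgcn', 'mlp_md'). An illustrative alternative (not code from this PR, and with the 'mlp' module path assumed) is a table-driven dispatch that keeps the lazy imports:

from importlib import import_module

# Illustrative registry (not PR code): model_type -> (module path, class name).
MODEL_REGISTRY = {
    'mlp': ('lzero.model.muzero_model_mlp', 'MuZeroModelMLP'),  # path assumed
    'conv': ('lzero.model.muzero_model', 'MuZeroModel'),
    'rgcn': ('lzero.model.muzero_model_gcn', 'MuZeroModelGCN'),
    'mlp_md': ('lzero.model.muzero_model_md', 'MuZeroModelMD'),
}

def build_model(model_cfg: dict):
    try:
        module_path, cls_name = MODEL_REGISTRY[model_cfg['model_type']]
    except KeyError:
        raise NotImplementedError(f"unknown model_type: {model_cfg['model_type']}")
    # Import lazily, mirroring the in-branch imports of the elif chain.
    return getattr(import_module(module_path), cls_name)(**model_cfg)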
18 changes: 13 additions & 5 deletions lzero/agent/sampled_efficientzero.py
@@ -93,7 +93,12 @@ def __init__(
cfg.main_config.exp_name = exp_name
self.origin_cfg = cfg
self.cfg = compile_config(
cfg.main_config, seed=seed, env=None, auto=True, policy=SampledEfficientZeroPolicy, create_cfg=cfg.create_config
cfg.main_config,
seed=seed,
Collaborator: Is there an extra blank line here?

env=None,
auto=True,
policy=SampledEfficientZeroPolicy,
create_cfg=cfg.create_config
)
self.exp_name = self.cfg.exp_name

@@ -110,6 +115,9 @@ def __init__(
elif self.cfg.policy.model.model_type == 'conv':
from lzero.model.sampled_efficientzero_model import SampledEfficientZeroModel
model = SampledEfficientZeroModel(**self.cfg.policy.model)
elif self.cfg.policy.model.model_type == 'mlp_md':
from lzero.model.sampled_efficientzero_model_md import SampledEfficientZeroModelMD
model = SampledEfficientZeroModelMD(**self.cfg.policy.model)
else:
raise NotImplementedError
if self.cfg.policy.cuda and torch.cuda.is_available():
@@ -124,8 +132,8 @@ def __init__(
self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env)

def train(
self,
step: int = int(1e7),
self,
step: int = int(1e7),
) -> TrainingReturn:
"""
Overview:
@@ -356,8 +364,8 @@ def deploy(
return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list))

def batch_evaluate(
self,
n_evaluator_episode: int = None,
self,
n_evaluator_episode: int = None,
) -> EvalReturn:
"""
Overview:
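The train/batch_evaluate signatures re-indented in all three agent files share one surface. A hedged usage sketch (the agent class name, import path, and constructor arguments are assumptions in the style of the LightZero agent zoo, not taken from this diff):

from lzero.agent import SampledEfficientZeroAgent  # import path assumed

agent = SampledEfficientZeroAgent(env_id='CartPole-v0', seed=0)  # ctor args assumed

train_ret = agent.train(step=int(1e5))                  # returns TrainingReturn
eval_ret = agent.batch_evaluate(n_evaluator_episode=8)  # returns EvalReturn
print(eval_ret.eval_value, eval_ret.eval_value_std)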
30 changes: 22 additions & 8 deletions lzero/mcts/utils.py
@@ -6,8 +6,9 @@
from graphviz import Digraph


def generate_random_actions_discrete(num_actions: int, action_space_size: int, num_of_sampled_actions: int,
reshape=False):
def generate_random_actions_discrete(
num_actions: int, action_space_size: int, num_of_sampled_actions: int, reshape=False
):
"""
Overview:
Generate a list of random actions.
@@ -19,10 +20,7 @@ def generate_random_actions_discrete(num_actions: int, action_space_size: int, n
Returns:
A list of random actions.
"""
actions = [
np.random.randint(0, action_space_size, num_of_sampled_actions).reshape(-1)
for _ in range(num_actions)
]
actions = [np.random.randint(0, action_space_size, num_of_sampled_actions).reshape(-1) for _ in range(num_actions)]

# If num_of_sampled_actions == 1, flatten the actions to a list of numbers
if num_of_sampled_actions == 1:
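(The diff truncates the branch here.) The flattening rule is easiest to see with a small call; outputs are random, so the values below are illustrative only:

from lzero.mcts.utils import generate_random_actions_discrete

# Four actions drawn from a 6-way discrete space, one sample each:
actions = generate_random_actions_discrete(
    num_actions=4, action_space_size=6, num_of_sampled_actions=1
)
# Because num_of_sampled_actions == 1, the result is flattened to plain
# numbers, e.g. [3, 0, 5, 2]; with num_of_sampled_actions > 1 each entry
# would remain an array of sampled action indices.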
@@ -97,7 +95,9 @@ def prepare_observation(observation_list, model_type='conv'):
Returns:
- np.ndarray: Reshaped array of observations.
"""
assert model_type in ['conv', 'mlp'], "model_type must be either 'conv' or 'mlp'"
assert model_type in [
'conv', 'mlp', 'rgcn', 'mlp_md'
], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'"
observation_array = np.array(observation_list)
batch_size = observation_array.shape[0]

Expand All @@ -110,13 +110,27 @@ def prepare_observation(observation_list, model_type='conv'):
_, stack_num, channels, width, height = observation_array.shape
observation_array = observation_array.reshape(batch_size, stack_num * channels, width, height)

elif model_type == 'mlp':
elif model_type == 'mlp' or model_type == 'mlp_md':
if observation_array.ndim == 3:
# Flatten the last two dimensions
observation_array = observation_array.reshape(batch_size, -1)
else:
raise ValueError("For 'mlp' or 'mlp_md' model_type, the observation must have 3 dimensions [B, S, O]")

elif model_type == 'rgcn':
if observation_array.ndim == 4:
# TODO(rjy): strage process
Collaborator: What does 'strage process' mean?

# observation_array should be reshaped to [B, S*M, O], where M is the agent number
# now observation_array.shape = [B, S, M, O]
observation_array = observation_array.reshape(batch_size, -1, observation_array.shape[-1])
elif observation_array.ndim == 3:
# Flatten the last two dimensions
observation_array = observation_array.reshape(batch_size, -1)
else:
raise ValueError(
"For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]"
)

return observation_array


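To make the reshaping branches above concrete, a quick shape walkthrough (dimension names follow the comments in the diff; the sizes are arbitrary):

import numpy as np

# Hypothetical batch: B=2, stack length S=4, agents M=3, per-agent obs O=5.
obs_4d = np.zeros((2, 4, 3, 5))

# 'rgcn' with 4-dim input: [B, S, M, O] -> [B, S*M, O]
print(obs_4d.reshape(2, -1, obs_4d.shape[-1]).shape)   # (2, 12, 5)

# 'mlp' / 'mlp_md' with 3-dim input: [B, S, O] -> [B, S*O]
obs_3d = np.zeros((2, 4, 5))
print(obs_3d.reshape(2, -1).shape)                     # (2, 20)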
100 changes: 56 additions & 44 deletions lzero/model/common.py
@@ -8,6 +8,8 @@
import math
from typing import Optional, Tuple
from dataclasses import dataclass
import logging
import itertools
import numpy as np
import torch
import torch.nn as nn
@@ -36,10 +38,14 @@ class MZNetworkOutput:


class DownSample(nn.Module):

def __init__(self, observation_shape: SequenceType, out_channels: int, activation: nn.Module = nn.ReLU(inplace=True),
norm_type: Optional[str] = 'BN',
) -> None:

def __init__(
self,
observation_shape: SequenceType,
out_channels: int,
activation: nn.Module = nn.ReLU(inplace=True),
norm_type: Optional[str] = 'BN',
) -> None:
"""
Overview:
Define downSample convolution network. Encode the observation into hidden state.
@@ -72,11 +78,7 @@ def __init__(self, observation_shape: SequenceType, out_channels: int, activatio
self.resblocks1 = nn.ModuleList(
[
ResBlock(
in_channels=out_channels // 2,
activation=activation,
norm_type='BN',
res_type='basic',
bias=False
in_channels=out_channels // 2, activation=activation, norm_type='BN', res_type='basic', bias=False
) for _ in range(1)
]
)
@@ -90,17 +92,15 @@ def __init__(self, observation_shape: SequenceType, out_channels: int, activatio
)
self.resblocks2 = nn.ModuleList(
[
ResBlock(
in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False
) for _ in range(1)
ResBlock(in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False)
for _ in range(1)
]
)
self.pooling1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
self.resblocks3 = nn.ModuleList(
[
ResBlock(
in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False
) for _ in range(1)
ResBlock(in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False)
for _ in range(1)
]
)
self.pooling2 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
@@ -174,15 +174,18 @@ def __init__(
self.norm = nn.BatchNorm2d(num_channels)
elif norm_type == 'LN':
if downsample:
self.norm = nn.LayerNorm([num_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)])
self.norm = nn.LayerNorm(
[num_channels,
math.ceil(observation_shape[-2] / 16),
math.ceil(observation_shape[-1] / 16)]
)
else:
self.norm = nn.LayerNorm([num_channels, observation_shape[-2], observation_shape[-1]])

self.resblocks = nn.ModuleList(
[
ResBlock(
in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False
) for _ in range(num_res_blocks)
ResBlock(in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False)
for _ in range(num_res_blocks)
]
)
self.activation = activation
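The ceil(.../16) terms in the reformatted LayerNorm shapes track DownSample's 16x spatial reduction. A numeric check under an assumed Atari-style observation_shape of (3, 96, 96):

import math

import torch
import torch.nn as nn

observation_shape = (3, 96, 96)   # assumed (C, H, W); not from this diff
num_channels = 64

# With downsample=True, H and W shrink by 16x, hence ceil(96 / 16) = 6:
norm = nn.LayerNorm(
    [num_channels,
     math.ceil(observation_shape[-2] / 16),
     math.ceil(observation_shape[-1] / 16)]
)
x = torch.randn(8, num_channels, 6, 6)   # a batch of downsampled feature maps
print(norm(x).shape)                     # torch.Size([8, 64, 6, 6])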
@@ -223,13 +226,13 @@ def get_param_mean(self) -> float:
class RepresentationNetworkMLP(nn.Module):

def __init__(
self,
observation_shape: int,
hidden_channels: int = 64,
layer_num: int = 2,
activation: Optional[nn.Module] = nn.ReLU(inplace=True),
last_linear_layer_init_zero: bool = True,
norm_type: Optional[str] = 'BN',
self,
Collaborator: Please run bash format.sh on this.

observation_shape: int,
hidden_channels: int = 64,
layer_num: int = 2,
activation: Optional[nn.Module] = nn.ReLU(inplace=True),
last_linear_layer_init_zero: bool = True,
norm_type: Optional[str] = 'BN',
) -> torch.Tensor:
"""
Overview:
@@ -323,26 +326,35 @@ def __init__(

self.resblocks = nn.ModuleList(
[
ResBlock(
in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False
) for _ in range(num_res_blocks)
ResBlock(in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False)
for _ in range(num_res_blocks)
]
)

self.conv1x1_value = nn.Conv2d(num_channels, value_head_channels, 1)
self.conv1x1_policy = nn.Conv2d(num_channels, policy_head_channels, 1)

if norm_type == 'BN':
self.norm_value = nn.BatchNorm2d(value_head_channels)
self.norm_policy = nn.BatchNorm2d(policy_head_channels)
elif norm_type == 'LN':
if downsample:
self.norm_value = nn.LayerNorm([value_head_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)])
self.norm_policy = nn.LayerNorm([policy_head_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)])
self.norm_value = nn.LayerNorm(
[value_head_channels,
math.ceil(observation_shape[-2] / 16),
math.ceil(observation_shape[-1] / 16)]
)
self.norm_policy = nn.LayerNorm(
[
policy_head_channels,
math.ceil(observation_shape[-2] / 16),
math.ceil(observation_shape[-1] / 16)
]
)
else:
self.norm_value = nn.LayerNorm([value_head_channels, observation_shape[-2], observation_shape[-1]])
self.norm_policy = nn.LayerNorm([policy_head_channels, observation_shape[-2], observation_shape[-1]])

self.flatten_output_size_for_value_head = flatten_output_size_for_value_head
self.flatten_output_size_for_policy_head = flatten_output_size_for_policy_head
self.activation = activation
@@ -404,16 +416,16 @@ def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tenso
class PredictionNetworkMLP(nn.Module):

def __init__(
self,
action_space_size,
num_channels,
common_layer_num: int = 2,
fc_value_layers: SequenceType = [32],
fc_policy_layers: SequenceType = [32],
output_support_size: int = 601,
last_linear_layer_init_zero: bool = True,
activation: Optional[nn.Module] = nn.ReLU(inplace=True),
norm_type: Optional[str] = 'BN',
self,
action_space_size,
num_channels,
common_layer_num: int = 2,
fc_value_layers: SequenceType = [32],
fc_policy_layers: SequenceType = [32],
output_support_size: int = 601,
last_linear_layer_init_zero: bool = True,
activation: Optional[nn.Module] = nn.ReLU(inplace=True),
norm_type: Optional[str] = 'BN',
Collaborator: Please change this indentation back to the original format.

):
"""
Overview:
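For reviewers checking the re-indented PredictionNetworkMLP signature, a hedged instantiation sketch (argument values are arbitrary, and the (policy, value) return order is an assumption based on the convolutional PredictionNetwork above):

import torch
from lzero.model.common import PredictionNetworkMLP

# Sketch only: kwargs mirror the signature shown above; values are arbitrary.
heads = PredictionNetworkMLP(
    action_space_size=6,
    num_channels=64,            # width of the flat latent state
    fc_value_layers=[32],
    fc_policy_layers=[32],
    output_support_size=601,    # categorical value support, as in the default
)
latent_state = torch.randn(8, 64)
policy_logits, value = heads(latent_state)   # return order assumed
print(policy_logits.shape, value.shape)      # expected: [8, 6] and [8, 601]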