🚀 [RofuncRL] RofuncDTrans pass debugging
Skylark0924 committed Aug 28, 2023
1 parent 2d11808 commit 08f73ef
Showing 11 changed files with 369 additions and 47 deletions.
102 changes: 102 additions & 0 deletions examples/learning_rl/example_D4RL_RofuncRL.py
@@ -0,0 +1,102 @@
"""
D4RL (RofuncRL)
=======================
D4RL tasks with RofuncRL offline RL algorithms (BC, DTrans, CQL, etc.)
"""

import argparse

import gymnasium as gym

from rofunc.config.utils import omegaconf_to_dict, get_config
from rofunc.learning.RofuncRL.tasks import task_map
from rofunc.learning.RofuncRL.trainers import trainer_map
from rofunc.learning.pre_trained_models.download import model_zoo
from rofunc.learning.utils.download_datasets import download_d4rl_dataset
from rofunc.learning.utils.utils import set_seed


def train(custom_args):
# Configure task and trainer parameters for the D4RL environment
args_overrides = ["task={}".format(custom_args.task),
"train={}{}RofuncRL".format(custom_args.task, custom_args.agent.upper()),
"sim_device={}".format(custom_args.sim_device),
"rl_device={}".format(custom_args.rl_device),
"graphics_device_id={}".format(custom_args.graphics_device_id),
"headless={}".format(custom_args.headless)]
cfg = get_config('./learning/rl', 'config', args=args_overrides)

download_d4rl_dataset(save_dir='../data/D4RL')

set_seed(cfg.train.Trainer.seed)

# Instantiate the D4RL environment via Gymnasium
env = gym.make(f'{custom_args.task}-v3')

# Instantiate the RL trainer
trainer = trainer_map[custom_args.agent](cfg=cfg.train,
env=env,
device=cfg.rl_device,
env_name=custom_args.task)

# Start training
trainer.train()


def inference(custom_args):
# Configure task and trainer parameters for the D4RL environment
args_overrides = ["task={}".format(custom_args.task),
"train={}{}RofuncRL".format(custom_args.task, custom_args.agent.upper()),
"sim_device={}".format(custom_args.sim_device),
"rl_device={}".format(custom_args.rl_device),
"graphics_device_id={}".format(custom_args.graphics_device_id),
"headless={}".format(False),
"num_envs={}".format(16)]
cfg = get_config('./learning/rl', 'config', args=args_overrides)
cfg_dict = omegaconf_to_dict(cfg.task)

set_seed(cfg.train.Trainer.seed)

# Instantiate the Isaac Gym environment
infer_env = task_map[custom_args.task](cfg=cfg_dict,
rl_device=cfg.rl_device,
sim_device=cfg.sim_device,
graphics_device_id=cfg.graphics_device_id,
headless=cfg.headless,
virtual_screen_capture=cfg.capture_video, # TODO: check
force_render=cfg.force_render)

# Instantiate the RL trainer
trainer = trainer_map[custom_args.agent](cfg=cfg.train,
env=infer_env,
device=cfg.rl_device,
env_name=custom_args.task)
# load checkpoint
if custom_args.ckpt_path is None:
custom_args.ckpt_path = model_zoo(name="CURICabinetRofuncRLPPO_left_arm.pth")
trainer.agent.load_ckpt(custom_args.ckpt_path)

# Start inference
trainer.inference()


if __name__ == '__main__':
gpu_id = 0

parser = argparse.ArgumentParser()
# Available tasks: Hopper, HalfCheetah, Walker2d, Reacher2d
parser.add_argument("--task", type=str, default="Hopper")
parser.add_argument("--agent", type=str, default="dtrans") # dtrans
parser.add_argument("--sim_device", type=str, default="cuda:{}".format(gpu_id))
parser.add_argument("--rl_device", type=str, default="cuda:{}".format(gpu_id))
parser.add_argument("--graphics_device_id", type=int, default=gpu_id)
parser.add_argument("--headless", type=str, default="True")
parser.add_argument("--inference", action="store_true", help="turn to inference mode while adding this argument")
parser.add_argument("--ckpt_path", type=str, default=None)
custom_args = parser.parse_args()

if not custom_args.inference:
train(custom_args)
else:
inference(custom_args)
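
For reference, a typical invocation of this example might look like the following; this is a sketch assuming the script is run from the repository root, and the checkpoint path is hypothetical:

# Train a Decision Transformer (dtrans) agent on the default Hopper task:
#   python examples/learning_rl/example_D4RL_RofuncRL.py --task Hopper --agent dtrans
# Run inference from a saved checkpoint (path shown is hypothetical):
#   python examples/learning_rl/example_D4RL_RofuncRL.py --task Hopper --agent dtrans --inference --ckpt_path /path/to/ckpt.pth
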
2 changes: 2 additions & 0 deletions rofunc/config/learning/rl/task/Hopper.yaml
@@ -0,0 +1,2 @@
name: Hopper

90 changes: 90 additions & 0 deletions rofunc/config/learning/rl/train/BaseTaskDTRANSRofuncRL.yaml
@@ -0,0 +1,90 @@
# ========== Trainer parameters ==========
Trainer:
experiment_name: # Experiment name for logging.
experiment_directory: # Experiment directory for logging.
write_interval: 100 # TensorBoard write interval for logging. (timesteps)
checkpoint_interval: 1000 # Checkpoint interval for logging. (timesteps)
wandb: False # If true, log to Weights & Biases.
wandb_kwargs: # Weights & Biases kwargs. https://docs.wandb.ai/ref/python/init
project: # Weights & Biases project name.
rofunc_logger_kwargs: # Rofunc BeautyLogger kwargs.
verbose: True # If true, print to stdout.
maximum_steps: 100000 # The maximum number of steps to run for.
random_steps: 0 # The number of random exploration steps to take.
start_learning_steps: 0 # The number of steps to take before starting network updating.
seed: 42 # The random seed.
rollouts: 16 # The number of rollouts before updating.
eval_flag: False # If true, run evaluation.
eval_freq: 2500 # The frequency of evaluation. (timesteps)
eval_steps: 1000 # The number of steps to run for evaluation.
use_eval_thread: True # If true, use a separate thread for evaluation.
inference_steps: 1000 # The number of steps to run for inference.
max_episode_steps: 1000 # The maximum number of steps per episode.

dataset_type: medium # medium, medium-replay, medium-expert, expert
mode: normal # normal for the standard setting, delayed for sparse (delayed) reward
dataset_root_path: /home/ubuntu/Github/Rofunc/examples/data/D4RL
env_targets: [ 3600, 1800 ] # evaluation conditioning targets
scale: 1000. # normalization scale applied to rewards / returns-to-go
max_seq_length: 20 # Maximum length of the sequence for inputting to the GPT model.


# ========== Agent parameters ==========
Agent:
discount: 0.99 # The discount factor, gamma.
td_lambda: 0.95 # TD(lambda) coefficient (lam) for computing returns and advantages.

learning_epochs: 8 # The number of epochs to train for per update.
batch_size: 1024 # Batch size for training.

lr: 1e-4 # Learning rate for actor.
# lr_scheduler: # Learning rate scheduler type.
# lr_scheduler_kwargs: # Learning rate scheduler kwargs.
adam_eps: 1e-5 # Adam epsilon.
weight_decay: 1e-4 # Weight decay.

# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
use_gae: True

entropy_loss_scale: 0.01 # entropy loss scaling factor
value_loss_scale: 2.0 # value loss scaling factor

grad_norm_clip: 1.0 # clipping coefficient for the norm of the gradients
ratio_clip: 0.2 # clipping coefficient for computing the clipped surrogate objective
value_clip: 0.2 # clipping coefficient for computing the value loss (if clip_predicted_values is True)
clip_predicted_values: True # clip predicted values during value loss computation

kl_threshold: 0 # Initial coefficient for KL divergence.

# state_preprocessor: # State preprocessor type.
# state_preprocessor_kwargs: # State preprocessor kwargs.
# value_preprocessor: # Value preprocessor type.
# value_preprocessor_kwargs: # Value preprocessor kwargs.
# rewards_shaper: # Rewards shaper type.


# ========= Model parameters ==========
Model:
use_init: True
use_action_clip: False # If true, clip actions to the action space range.
use_action_out_tanh: True # If true, apply tanh to the output of the actor.
action_clip: 1.0 # clipping coefficient for the norm of the actions
action_scale: 1.0 # scaling action range from [-1, 1] after tanh to [-action_scale, action_scale]
use_log_std_clip: True # If true, clip log standard deviations to the range [-20, 2].
log_std_clip_max: 2.0 # clipping coefficient for the log standard deviations
log_std_clip_min: -20 # clipping coefficient for the log standard deviations

actor:
n_layer: 3
n_head: 1
n_embd: 128
dropout: 0.1
activation_function: relu
max_episode_steps: ${train.Trainer.max_episode_steps}






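As context for env_targets, scale, and max_seq_length above: in a Decision Transformer setup these typically define the scaled returns-to-go that condition action prediction at evaluation time. A minimal sketch of that bookkeeping, with illustrative names rather than the Rofunc trainer's API:

import torch

def initial_returns_to_go(env_target: float, scale: float, device: str = "cpu") -> torch.Tensor:
    # Start the evaluation episode from the scaled conditioning target, e.g. 3600 / 1000.
    return torch.tensor([[env_target / scale]], dtype=torch.float32, device=device)

def step_returns_to_go(returns_to_go: torch.Tensor, reward: float, scale: float) -> torch.Tensor:
    # Subtract the scaled reward obtained at this step and append the new target,
    # keeping the returns-to-go sequence aligned with the state/action sequences.
    next_rtg = returns_to_go[0, -1] - reward / scale
    return torch.cat([returns_to_go, next_rtg.reshape(1, 1)], dim=1)

rtg = initial_returns_to_go(3600, 1000.)
rtg = step_returns_to_go(rtg, reward=2.5, scale=1000.)  # now shape (1, 2)
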
2 changes: 1 addition & 1 deletion rofunc/config/utils.py
@@ -31,7 +31,7 @@ def get_config(config_path=None, config_name=None, args=None, debug=False, absl_
:param config_name: name of the config file (without .yaml)
:param args: custom args to rewrite some params in the config file
:param debug: if True, print the config
:param absl_config_path: absolute path to the config file (for external user)
:param absl_config_path: absolute path to the folder that contains the config file (for external users)
:return:
"""
# reset current hydra config if already parsed (but not passed in here)
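
Following the clarified docstring, absl_config_path should point at the directory that holds the YAML rather than at the file itself. A hedged usage sketch with hypothetical paths:

from rofunc.config.utils import get_config

# the folder /path/to/my_configs is assumed to contain config.yaml
cfg = get_config(config_name="config", absl_config_path="/path/to/my_configs")
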
42 changes: 20 additions & 22 deletions rofunc/learning/RofuncRL/agents/offline/dtrans_agent.py
@@ -23,7 +23,6 @@
import rofunc as rf
from rofunc.learning.RofuncRL.agents.base_agent import BaseAgent
from rofunc.learning.RofuncRL.models.actor_models import ActorDTrans
from rofunc.learning.RofuncRL.utils.memory import Memory


class DTransAgent(BaseAgent):
@@ -37,23 +36,21 @@ def __init__(self,
cfg: DictConfig,
observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]],
action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]],
memory: Optional[Union[Memory, Tuple[Memory]]] = None,
device: Optional[Union[str, torch.device]] = None,
experiment_dir: Optional[str] = None,
rofunc_logger: Optional[rf.logger.BeautyLogger] = None):
"""
:param cfg: Configurations
:param observation_space: Observation space
:param action_space: Action space
:param memory: Memory for storing transitions
:param device: Device on which the torch tensor is allocated
:param experiment_dir: Directory for storing experiment data
:param rofunc_logger: Rofunc logger
"""

super().__init__(cfg, observation_space, action_space, memory, device, experiment_dir, rofunc_logger)
super().__init__(cfg, observation_space, action_space, None, device, experiment_dir, rofunc_logger)

self.dtrans = ActorDTrans(cfg.Model, observation_space, action_space, device)
self.dtrans = ActorDTrans(cfg.Model, observation_space, action_space, self.se).to(self.device)
self.models = {"dtrans": self.dtrans}

# checkpoint models
@@ -68,7 +65,8 @@ def __init__(self,
self._lr = self.cfg.Agent.lr
self._adam_eps = self.cfg.Agent.adam_eps
self._weight_decay = self.cfg.Agent.weight_decay
self._max_length = self.cfg.Agent.max_length
self._max_seq_length = self.cfg.Trainer.max_seq_length


self._set_up()

@@ -94,25 +92,25 @@ def act(self, states, actions, rewards, returns_to_go, timesteps):
returns_to_go = returns_to_go.reshape(1, -1, 1)
timesteps = timesteps.reshape(1, -1)

if self._max_length is not None:
states = states[:, -self._max_length:]
actions = actions[:, -self._max_length:]
returns_to_go = returns_to_go[:, -self._max_length:]
timesteps = timesteps[:, -self._max_length:]
if self._max_seq_length is not None:
states = states[:, -self._max_seq_length:]
actions = actions[:, -self._max_seq_length:]
returns_to_go = returns_to_go[:, -self._max_seq_length:]
timesteps = timesteps[:, -self._max_seq_length:]

# pad all tokens to sequence length
attention_mask = torch.cat([torch.zeros(self._max_length - states.shape[1]), torch.ones(states.shape[1])])
attention_mask = torch.cat([torch.zeros(self._max_seq_length - states.shape[1]), torch.ones(states.shape[1])])
attention_mask = attention_mask.to(dtype=torch.long, device=states.device).reshape(1, -1)
states = torch.cat(
[torch.zeros((states.shape[0], self._max_length - states.shape[1], self.dtrans.state_dim),
[torch.zeros((states.shape[0], self._max_seq_length - states.shape[1], self.dtrans.state_dim),
device=states.device), states], dim=1).to(dtype=torch.float32)
actions = torch.cat(
[torch.zeros((actions.shape[0], self._max_length - actions.shape[1], self.dtrans.action_dim),
[torch.zeros((actions.shape[0], self._max_seq_length - actions.shape[1], self.dtrans.action_dim),
device=actions.device), actions], dim=1).to(dtype=torch.float32)
returns_to_go = torch.cat(
[torch.zeros((returns_to_go.shape[0], self._max_length - returns_to_go.shape[1], 1),
[torch.zeros((returns_to_go.shape[0], self._max_seq_length - returns_to_go.shape[1], 1),
device=returns_to_go.device), returns_to_go], dim=1).to(dtype=torch.float32)
timesteps = torch.cat([torch.zeros((timesteps.shape[0], self._max_length - timesteps.shape[1]),
timesteps = torch.cat([torch.zeros((timesteps.shape[0], self._max_seq_length - timesteps.shape[1]),
device=timesteps.device), timesteps], dim=1).to(dtype=torch.long)
else:
attention_mask = None
@@ -122,8 +120,8 @@ def act(self, states, actions, rewards, returns_to_go, timesteps):

return action_preds[0, -1]

def update_net(self):
states, actions, rewards, dones, rtg, timesteps, attention_mask = self.get_batch(self.batch_size)
def update_net(self, batch):
states, actions, rewards, dones, rtg, timesteps, attention_mask = batch
action_target = torch.clone(actions)

state_preds, action_preds, reward_preds = self.dtrans.forward(
@@ -139,12 +137,12 @@ def update_net(self):

self.optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), .25)
torch.nn.utils.clip_grad_norm_(self.dtrans.parameters(), .25)
self.optimizer.step()

with torch.no_grad():
self.diagnostics['training/action_error'] = torch.mean(
(action_preds - action_target) ** 2).detach().cpu().item()
# with torch.no_grad():
# self.diagnostics['training/action_error'] = torch.mean(
# (action_preds - action_target) ** 2).detach().cpu().item()

# update learning rate
if self._lr_scheduler is not None:
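
The left-padding in act() above keeps only the most recent max_seq_length tokens and masks out the padded positions so the GPT backbone ignores them. A stripped-down sketch of the same idea, with illustrative shapes and names (not the agent's API):

import torch

def left_pad(seq: torch.Tensor, target_len: int):
    # seq has shape (1, T, D); keep the last target_len steps, then left-pad with zeros.
    seq = seq[:, -target_len:]
    pad = target_len - seq.shape[1]
    mask = torch.cat([torch.zeros(pad), torch.ones(seq.shape[1])]).to(torch.long).reshape(1, -1)
    padded = torch.cat([torch.zeros(seq.shape[0], pad, seq.shape[2]), seq], dim=1)
    return padded, mask

states = torch.randn(1, 7, 11)  # e.g. 7 Hopper observations collected so far
padded_states, attention_mask = left_pad(states, target_len=20)
assert padded_states.shape == (1, 20, 11) and int(attention_mask.sum()) == 7
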
33 changes: 20 additions & 13 deletions rofunc/learning/RofuncRL/models/actor_models.py
@@ -260,8 +260,8 @@ def __init__(self, cfg: DictConfig,

self.cfg = cfg
self.action_dim = get_space_dim(action_space)
self.gpt2_hidden_size = cfg.actor.gpt2_hidden_size
self.max_ep_len = cfg.actor.max_ep_len
self.n_embd = cfg.actor.n_embd
self.max_ep_len = cfg.actor.max_episode_steps

# state encoder
self.state_encoder = state_encoder
@@ -272,22 +272,29 @@

gpt_config = transformers.GPT2Config(
vocab_size=1, # doesn't matter -- we don't use the vocab
n_embd=self.gpt2_hidden_size,
n_embd=self.n_embd,
n_layer=self.cfg.actor.n_layer,
n_head=self.cfg.actor.n_head,
n_inner=self.n_embd * 4,
activation_function=self.cfg.actor.activation_function,
resid_pdrop=self.cfg.actor.dropout,
attn_pdrop=self.cfg.actor.dropout,
n_positions=1024
)

self.embed_timestep = nn.Embedding(self.max_ep_len, self.gpt2_hidden_size)
self.embed_return = torch.nn.Linear(1, self.gpt2_hidden_size)
self.embed_state = torch.nn.Linear(self.state_dim, self.gpt2_hidden_size)
self.embed_action = torch.nn.Linear(self.action_dim, self.gpt2_hidden_size)
self.embed_ln = nn.LayerNorm(self.gpt2_hidden_size)
self.embed_timestep = nn.Embedding(self.max_ep_len, self.n_embd)
self.embed_return = torch.nn.Linear(1, self.n_embd)
self.embed_state = torch.nn.Linear(self.state_dim, self.n_embd)
self.embed_action = torch.nn.Linear(self.action_dim, self.n_embd)
self.embed_ln = nn.LayerNorm(self.n_embd)

self.backbone_net = transformers.GPT2Model(gpt_config)

# note: we don't predict states or returns for the paper
self.predict_state = torch.nn.Linear(self.gpt2_hidden_size, self.state_dim)
self.predict_action = nn.Sequential(*([nn.Linear(self.gpt2_hidden_size, self.action_dim)] +
self.predict_state = torch.nn.Linear(self.n_embd, self.state_dim)
self.predict_action = nn.Sequential(*([nn.Linear(self.n_embd, self.action_dim)] +
([nn.Tanh()] if self.cfg.use_action_out_tanh else [])))
self.predict_return = torch.nn.Linear(self.gpt2_hidden_size, 1)
self.predict_return = torch.nn.Linear(self.n_embd, 1)

def forward(self, states, actions, rewards, returns_to_go, timesteps, attention_mask=None):
batch_size, seq_length = states.shape[0], states.shape[1]
@@ -313,7 +320,7 @@ def forward(self, states, actions, rewards, returns_to_go, timesteps, attention_
# this makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...)
# which works nice in an autoregressive sense since states predict actions
stacked_inputs = torch.stack((returns_embeddings, state_embeddings, action_embeddings), dim=1
).permute(0, 2, 1, 3).reshape(batch_size, 3 * seq_length, self.gpt2_hidden_size)
).permute(0, 2, 1, 3).reshape(batch_size, 3 * seq_length, self.n_embd)
stacked_inputs = self.embed_ln(stacked_inputs)

# to make the attention mask fit the stacked inputs, have to stack it as well
@@ -327,7 +334,7 @@ def forward(self, states, actions, rewards, returns_to_go, timesteps, attention_

# reshape x so that the second dimension corresponds to the original
# returns (0), states (1), or actions (2); i.e. x[:,1,t] is the token for s_t
x = x.reshape(batch_size, seq_length, 3, self.gpt2_hidden_size).permute(0, 2, 1, 3)
x = x.reshape(batch_size, seq_length, 3, self.n_embd).permute(0, 2, 1, 3)

# get predictions
return_preds = self.predict_return(x[:, 2]) # predict next return given state and action
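
The stack/permute/reshape in forward() interleaves return, state, and action embeddings so each timestep contributes three consecutive tokens (R_t, s_t, a_t) to the GPT-2 backbone. A small self-contained sketch of that reshaping, independent of the model:

import torch

batch, seq_len, n_embd = 2, 4, 8
returns_emb = torch.randn(batch, seq_len, n_embd)
state_emb = torch.randn(batch, seq_len, n_embd)
action_emb = torch.randn(batch, seq_len, n_embd)

# (batch, 3, seq_len, n_embd) -> (batch, seq_len, 3, n_embd) -> (batch, 3 * seq_len, n_embd)
stacked = torch.stack((returns_emb, state_emb, action_emb), dim=1)
tokens = stacked.permute(0, 2, 1, 3).reshape(batch, 3 * seq_len, n_embd)

# the first three tokens are (R_0, s_0, a_0), the next three (R_1, s_1, a_1), and so on
assert torch.equal(tokens[:, 0], returns_emb[:, 0])
assert torch.equal(tokens[:, 1], state_emb[:, 0])
assert torch.equal(tokens[:, 2], action_emb[:, 0])
assert torch.equal(tokens[:, 3], returns_emb[:, 1])
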
2 changes: 2 additions & 0 deletions rofunc/learning/RofuncRL/trainers/__init__.py
@@ -4,6 +4,7 @@
from .a2c_trainer import A2CTrainer
from .amp_trainer import AMPTrainer
from .ase_trainer import ASETrainer
from .dtrans_trainer import DTransTrainer

trainer_map = {
"ppo": PPOTrainer,
@@ -12,4 +13,5 @@
"a2c": A2CTrainer,
"amp": AMPTrainer,
"ase": ASETrainer,
"dtrans": DTransTrainer,
}
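
With DTransTrainer registered above, the --agent dtrans flag in the D4RL example resolves through this map. A minimal illustration of the lookup, assuming rofunc is installed:

from rofunc.learning.RofuncRL.trainers import trainer_map

trainer_cls = trainer_map["dtrans"]  # -> DTransTrainer
print(trainer_cls.__name__)
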