diff --git a/docs/source/overview/reinforcement-learning/rl_frameworks.rst b/docs/source/overview/reinforcement-learning/rl_frameworks.rst
index 34d47c17cdc..5f9d25e06e0 100644
--- a/docs/source/overview/reinforcement-learning/rl_frameworks.rst
+++ b/docs/source/overview/reinforcement-learning/rl_frameworks.rst
@@ -71,18 +71,26 @@ Training Performance
 --------------------
 
 We performed training with each RL library on the same ``Isaac-Humanoid-v0`` environment
-with ``--headless`` on a single RTX PRO 6000 GPU using 4096 environments
-and logged the total training time for 65.5M steps for each RL library.
+with ``--headless`` on a single NVIDIA GeForce RTX 4090 and logged the total training time
+for 65.5M steps (4096 environments x 32 rollout steps x 500 iterations).
 
 +--------------------+-----------------+
 | RL Library         | Time in seconds |
 +====================+=================+
-| RL-Games           | 207             |
+| RL-Games           | 201             |
 +--------------------+-----------------+
-| SKRL               | 208             |
+| SKRL               | 201             |
 +--------------------+-----------------+
-| RSL RL             | 199             |
+| RSL RL             | 198             |
 +--------------------+-----------------+
-| Stable-Baselines3  | 322             |
+| Stable-Baselines3  | 287             |
 +--------------------+-----------------+
+
+Training commands (check for the ``Training time: XXX seconds`` line in the terminal output):
+
+.. code:: bash
+
+   python scripts/reinforcement_learning/rl_games/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
+   python scripts/reinforcement_learning/skrl/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
+   python scripts/reinforcement_learning/rsl_rl/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
+   python scripts/reinforcement_learning/sb3/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
 
diff --git a/scripts/reinforcement_learning/rl_games/train.py b/scripts/reinforcement_learning/rl_games/train.py
index 19a7e1ee943..882d216b01c 100644
--- a/scripts/reinforcement_learning/rl_games/train.py
+++ b/scripts/reinforcement_learning/rl_games/train.py
@@ -67,6 +67,7 @@
 import math
 import os
 import random
+import time
 from datetime import datetime
 
 from rl_games.common import env_configurations, vecenv
@@ -201,6 +202,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for rl-games
     env = RlGamesVecEnvWrapper(env, rl_device, clip_obs, clip_actions, obs_groups, concate_obs_groups)
 
@@ -250,6 +253,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
     else:
         runner.run({"train": True, "play": False, "sigma": train_sigma})
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
 
diff --git a/scripts/reinforcement_learning/rsl_rl/train.py b/scripts/reinforcement_learning/rsl_rl/train.py
index 01d99d02d99..888b8d86a61 100644
--- a/scripts/reinforcement_learning/rsl_rl/train.py
+++ b/scripts/reinforcement_learning/rsl_rl/train.py
@@ -78,6 +78,7 @@
 import gymnasium as gym
 import logging
 import os
+import time
 import torch
 from datetime import datetime
 
@@ -187,6 +188,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for rsl-rl
     env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions)
 
@@ -212,6 +215,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
     # run training
     runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
 
diff --git a/scripts/reinforcement_learning/sb3/train.py b/scripts/reinforcement_learning/sb3/train.py
index 1e04374253e..1d97a74fe94 100644
--- a/scripts/reinforcement_learning/sb3/train.py
+++ b/scripts/reinforcement_learning/sb3/train.py
@@ -80,6 +80,7 @@ def cleanup_pbar(*args):
 import numpy as np
 import os
 import random
+import time
 from datetime import datetime
 
 from stable_baselines3 import PPO
@@ -176,6 +177,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for stable baselines
     env = Sb3VecEnvWrapper(env, fast_variant=not args_cli.keep_all_info)
 
@@ -223,6 +226,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print("Saving normalization")
         env.save(os.path.join(log_dir, "model_vecnormalize.pkl"))
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
 
diff --git a/scripts/reinforcement_learning/skrl/train.py b/scripts/reinforcement_learning/skrl/train.py
index 183d50e61f8..f255d4af1a5 100644
--- a/scripts/reinforcement_learning/skrl/train.py
+++ b/scripts/reinforcement_learning/skrl/train.py
@@ -78,6 +78,7 @@
 import logging
 import os
 import random
+import time
 from datetime import datetime
 
 import skrl
@@ -214,6 +215,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for skrl
     env = SkrlVecEnvWrapper(env, ml_framework=args_cli.ml_framework)  # same as: `wrap_env(env, wrapper="auto")`
 
@@ -229,6 +232,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
     # run training
     runner.run()
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
 
diff --git a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rl_games_ppo_cfg.yaml b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rl_games_ppo_cfg.yaml
index 8774abaca1c..c756670aef2 100644
--- a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rl_games_ppo_cfg.yaml
+++ b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rl_games_ppo_cfg.yaml
@@ -3,6 +3,14 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+# ========================================= IMPORTANT NOTICE =========================================
+#
+# This file defines the agent configuration used to generate the "Training Performance" table in
+# https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+# Ensure that the configurations for the other RL libraries are updated if this one is modified.
+#
+# ====================================================================================================
+
 params:
   seed: 42
 
@@ -50,13 +58,13 @@ params:
     device_name: 'cuda:0'
     multi_gpu: False
     ppo: True
-    mixed_precision: True
+    mixed_precision: False
     normalize_input: True
     normalize_value: True
     value_bootstrap: True
     num_actors: -1
     reward_shaper:
-      scale_value: 0.6
+      scale_value: 1.0
     normalize_advantage: True
     gamma: 0.99
     tau: 0.95
@@ -72,7 +80,7 @@ params:
     truncate_grads: True
     e_clip: 0.2
     horizon_length: 32
-    minibatch_size: 32768
+    minibatch_size: 32768  # num_envs * horizon_length / num_mini_batches
     mini_epochs: 5
     critic_coef: 4
     clip_value: True
diff --git a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rsl_rl_ppo_cfg.py b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rsl_rl_ppo_cfg.py
index 663012f94f0..c5f77400cf6 100644
--- a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rsl_rl_ppo_cfg.py
+++ b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rsl_rl_ppo_cfg.py
@@ -3,6 +3,17 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+"""
+========================================= IMPORTANT NOTICE =========================================
+
+This file defines the agent configuration used to generate the "Training Performance" table in
+https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+Ensure that the configurations for the other RL libraries are updated if this one is modified.
+
+====================================================================================================
+"""
+
+
 from isaaclab.utils import configclass
 
 from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlPpoActorCriticCfg, RslRlPpoAlgorithmCfg
@@ -12,18 +23,18 @@ class HumanoidPPORunnerCfg(RslRlOnPolicyRunnerCfg):
     num_steps_per_env = 32
     max_iterations = 1000
-    save_interval = 50
+    save_interval = 100
     experiment_name = "humanoid"
     policy = RslRlPpoActorCriticCfg(
         init_noise_std=1.0,
-        actor_obs_normalization=False,
-        critic_obs_normalization=False,
+        actor_obs_normalization=True,
+        critic_obs_normalization=True,
         actor_hidden_dims=[400, 200, 100],
         critic_hidden_dims=[400, 200, 100],
         activation="elu",
     )
     algorithm = RslRlPpoAlgorithmCfg(
-        value_loss_coef=1.0,
+        value_loss_coef=2.0,
         use_clipped_value_loss=True,
         clip_param=0.2,
         entropy_coef=0.0,
diff --git a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/sb3_ppo_cfg.yaml b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/sb3_ppo_cfg.yaml
index 73e4e87c6e4..6d8f3d98d4e 100644
--- a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/sb3_ppo_cfg.yaml
+++ b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/sb3_ppo_cfg.yaml
@@ -3,7 +3,14 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-# Adapted from rsl_rl config
+# ========================================= IMPORTANT NOTICE =========================================
+#
+# This file defines the agent configuration used to generate the "Training Performance" table in
+# https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+# Ensure that the configurations for the other RL libraries are updated if this one is modified.
+#
+# ====================================================================================================
+
 seed: 42
 policy: "MlpPolicy"
 n_timesteps: !!float 5e7
@@ -18,7 +25,7 @@ clip_range: 0.2
 n_epochs: 5
 gae_lambda: 0.95
 max_grad_norm: 1.0
-vf_coef: 0.5
+vf_coef: 2.0
 policy_kwargs:
   activation_fn: 'nn.ELU'
   net_arch: [400, 200, 100]
diff --git a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/skrl_ppo_cfg.yaml b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/skrl_ppo_cfg.yaml
index e9f3913a029..ecfa82513d8 100644
--- a/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/skrl_ppo_cfg.yaml
+++ b/source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/skrl_ppo_cfg.yaml
@@ -3,6 +3,14 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+# ========================================= IMPORTANT NOTICE =========================================
+#
+# This file defines the agent configuration used to generate the "Training Performance" table in
+# https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+# Ensure that the configurations for the other RL libraries are updated if this one is modified.
+#
+# ====================================================================================================
+
 
 seed: 42
@@ -67,14 +75,13 @@ agent:
   entropy_loss_scale: 0.0
   value_loss_scale: 2.0
   kl_threshold: 0.0
-  rewards_shaper_scale: 0.6
   time_limit_bootstrap: False
   # logging and checkpoint
   experiment:
     directory: "humanoid"
     experiment_name: ""
-    write_interval: auto
-    checkpoint_interval: auto
+    write_interval: 32
+    checkpoint_interval: 3200
 
 
 # Sequential trainer
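
The figures quoted in the documentation hunk can be cross-checked from values that all appear in this patch: 4096 environments, 32 rollout steps per iteration, 500 iterations, the rl_games minibatch size of 32768, and the reported wall-clock times. A minimal Python sketch of that arithmetic (illustrative only, not part of the diff):

    # Cross-check of the documented benchmark figures; every constant below appears in the patch above.
    num_envs = 4096        # environments used for the benchmark
    rollout_steps = 32     # horizon_length (rl_games) / num_steps_per_env (rsl_rl)
    iterations = 500       # --max_iterations 500 in the training commands

    total_steps = num_envs * rollout_steps * iterations
    print(f"total environment steps: {total_steps:,}")  # 65,536,000, i.e. the 65.5M quoted in the docs

    # Mini-batch relation stated in the rl_games YAML comment:
    # minibatch_size = num_envs * horizon_length / num_mini_batches
    print(f"implied rl_games mini-batches per epoch: {num_envs * rollout_steps // 32768}")  # 4

    # Throughput implied by the reported training times (seconds); derived, not measured.
    reported = {"RL-Games": 201, "SKRL": 201, "RSL RL": 198, "Stable-Baselines3": 287}
    for library, seconds in reported.items():
        print(f"{library}: {total_steps / seconds:,.0f} env steps/s")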