From 1cdf5aa3e79b682e1314369820d7cab8286880ef Mon Sep 17 00:00:00 2001 From: Seonjin Na Date: Thu, 18 Dec 2025 16:53:15 -0800 Subject: [PATCH 1/3] feat: Add GB200 perf recipes for llama3-8b, qwen3-30ba3,qwen3-32b --- .../grpo-llama3.1-8b-instruct-2n4g.yaml | 58 +++++++++++++++++++ .../performance/grpo-qwen3-30ba3b-4n4g.yaml | 45 ++++++++++++++ .../llm/performance/grpo-qwen3-32b-4n4g.yaml | 42 ++++++++++++++ .../grpo-llama3.1-8b-instruct-2n4g.sh | 39 +++++++++++++ .../llm/performance/grpo-qwen3-30ba3b-4n4g.sh | 40 +++++++++++++ .../llm/performance/grpo-qwen3-32b-4n4g.sh | 40 +++++++++++++ tests/test_suites/performance.txt | 11 +++- 7 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml new file mode 100644 index 0000000000..a99f7c1498 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml @@ -0,0 +1,58 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_num_steps: 500 +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_micro_batch_size: 1 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + 
make_sequence_length_divisible_by: 1 + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + converter_type: LlamaForCausalLM + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false + activation_checkpointing: true + defer_fp32_logits: true + optimizer: + lr: 5.0e-07 + min_lr: 5.0e-08 + weight_decay: 0.0 + use_precision_aware_optimizer: true + scheduler: + lr_warmup_iters: 2 + lr_warmup_init: 5.0e-08 + fp8_cfg: + enabled: false + generation: + max_new_tokens: 4096 + stop_token_ids: + - 128009 + vllm_cfg: + max_model_len: 4096 + tensor_parallel_size: 1 +data: + max_input_seq_length: 4096 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-llama3.1-8b-instruct-2n4g +cluster: + gpus_per_node: 4 + num_nodes: 2 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml new file mode 100644 index 0000000000..21b9746f4b --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml @@ -0,0 +1,45 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-30ba3b-4n4g +policy: + model_name: Qwen/Qwen3-30B-A3B + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + dtensor_cfg: + enabled: false + optimizer: null + scheduler: null + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false + optimizer: + lr: 3.0e-07 + min_lr: 3.0e-08 + scheduler: + lr_warmup_iters: 50 + lr_warmup_init: 3.0e-08 + env_vars: + PYTORCH_CUDA_ALLOC_CONF: 
expandable_segments:False + generation: + vllm_cfg: + tensor_parallel_size: 1 +logger: + log_dir: logs/grpo-qwen3-30ba3b-4n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-30ba3b-4n4g +cluster: + gpus_per_node: 4 + num_nodes: 4 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml new file mode 100644 index 0000000000..9b98877b18 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -0,0 +1,42 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-32b-4n4g +policy: + model_name: Qwen/Qwen3-32B + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + dtensor_cfg: + enabled: false + optimizer: null + scheduler: null + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + sequence_parallel: true + optimizer: + lr: 3.0e-07 + min_lr: 3.0e-08 + scheduler: + lr_warmup_iters: 2 + lr_warmup_init: 3.0e-08 + generation: + vllm_cfg: + tensor_parallel_size: 1 +logger: + log_dir: logs/grpo-qwen3-32b-4n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-32b-4n4g +cluster: + gpus_per_node: 4 + num_nodes: 4 + diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== 
+NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh new file mode 100755 index 0000000000..2a56609ffd --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ 
+ logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh new file mode 100755 index 0000000000..2a56609ffd --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 
'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt index bf714b0e74..7468fab4d3 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance.txt @@ -2,6 +2,7 @@ # GRPO # ######## +# H100 (8 GPUs/node) tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh @@ -14,4 +15,12 @@ tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh -tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh \ No newline at end of file +<<<<<<< HEAD +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh +======= + +# GB200 (4 GPUs/node) +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh +>>>>>>> 878206e8 (feat: Add GB200 perf recipes for llama3-8b, qwen3-30ba3,qwen3-32b) From 9c2bc1c965e62dfb4623e00a9bda11fb4c218632 Mon Sep 17 00:00:00 2001 From: Seonjin Na Date: Thu, 18 Dec 2025 17:12:31 -0800 Subject: [PATCH 2/3] feat: Add GB200 perf recipes for llama3-8b, qwen3-30ba3,qwen3-32b --- tests/test_suites/performance.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt index 7468fab4d3..33c0c99686 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance.txt @@ -15,12 +15,9 @@ tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh 
-<<<<<<< HEAD tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh -======= # GB200 (4 GPUs/node) tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh ->>>>>>> 878206e8 (feat: Add GB200 perf recipes for llama3-8b, qwen3-30ba3,qwen3-32b) From 70c1c395d47d6e38c8d868ab0d59d488ea47d5df Mon Sep 17 00:00:00 2001 From: Seonjin Na Date: Fri, 19 Dec 2025 14:27:48 -0800 Subject: [PATCH 3/3] feat: Add Async 1-off GB200 perf recipes for llama3-8b, qwen3-30ba3b, qwen3-32b --- ...-llama3.1-8b-instruct-2n4g-async-1off.yaml | 32 +++++++++++++++ .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml | 33 +++++++++++++++ .../llm/performance/grpo-qwen3-32b-4n4g.yaml | 2 +- .../grpo-qwen3-32b-8n4g-async-1off.yaml | 33 +++++++++++++++ ...po-llama3.1-8b-instruct-2n4g-async-1off.sh | 39 ++++++++++++++++++ .../grpo-qwen3-30ba3b-8n4g-async-1off.sh | 40 +++++++++++++++++++ .../grpo-qwen3-32b-8n4g-async-1off.sh | 39 ++++++++++++++++++ tests/test_suites/performance.txt | 3 ++ 8 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml new file mode 100644 index 0000000000..d906eda2b4 --- /dev/null +++ 
b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml @@ -0,0 +1,32 @@ +defaults: ./grpo-llama3.1-8b-instruct-2n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false + generation: + colocated: + enabled: false + resources: + num_nodes: 1 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n4g-async-1off + wandb: + name: grpo-llama3.1-8b-instruct-2n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 2 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml new file mode 100644 index 0000000000..a9837c87f2 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml @@ -0,0 +1,33 @@ +defaults: ./grpo-qwen3-30ba3b-4n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false + generation: + colocated: + enabled: false + resources: + num_nodes: 4 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-qwen3-30ba3b-8n4g-async-1off + wandb: + name: grpo-qwen3-30ba3b-8n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 8 diff --git 
a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml index 9b98877b18..2e441cdb5f 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -17,7 +17,7 @@ policy: megatron_cfg: enabled: true empty_unused_memory_level: 1 - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 sequence_parallel: true optimizer: diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml new file mode 100644 index 0000000000..4f8a0a03bb --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml @@ -0,0 +1,33 @@ +defaults: ./grpo-qwen3-32b-4n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: true + generation: + colocated: + enabled: false + resources: + num_nodes: 4 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-qwen3-32b-8n4g-async-1off + wandb: + name: grpo-qwen3-32b-8n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 8 + diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null 
&& pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh new file mode 100755 index 0000000000..8350d128e8 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + 
logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh new file mode 100755 index 0000000000..35d58c98f7 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | 
select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt index 1199421349..2bc3e13efd 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance.txt @@ -37,5 +37,8 @@ tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh ## ASYNC 1-off +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh