From c35d0f4591b6ff5921d70cc7c93d041630932b00 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Thu, 18 Dec 2025 16:04:58 -0800
Subject: [PATCH 1/9] Perf recipe for v0.5

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 ...oonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml |  7 ------
 .../performance/grpo-deepseek-v3-32n4g.yaml   | 21 ++++++++++++++++++
 .../grpo-deepseek-v3-64n4g-async-1off.yaml    | 22 +++++++++++++++++++
 ...grpo-deepseek-v3-64n8g-fp8-async-1off.yaml | 21 ++++++++++++++++++
 ...-llama3.1-8b-instruct-2n8g-async-1off.yaml |  2 ++
 ...ma3.1-8b-instruct-2n8g-fp8-async-1off.yaml | 20 +++++++++++++++++
 .../performance/grpo-qwen3-235b-16n4g.yaml    | 20 +++++++++++++++++
 .../grpo-qwen3-235b-32n4g-async-1off.yaml     | 20 +++++++++++++++++
 8 files changed, 126 insertions(+), 7 deletions(-)
 create mode 100644 examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml

diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
index 19c86c8a04..98fbed9812 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
@@ -16,8 +16,6 @@ policy:
   max_total_sequence_length: 8192
   dtensor_cfg:
     enabled: false
-  sequence_packing:
-    algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   optimizer: null
   megatron_cfg:
@@ -45,11 +43,6 @@ policy:
       precision: fp8
       use_deep_gemm: true
       gpu_memory_utilization: 0.5
-      expert_parallel_size: 4
-      quantization_ignored_layer_kws: [
-        a_proj,
-        b_proj
-      ]
 logger:
   monitor_gpus: false
   wandb:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
new file mode 100644
index 0000000000..812fdfae9d
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
@@ -0,0 +1,21 @@
+defaults: ./grpo-deepseek-v3-32n8g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-32n4g
+policy:
+  sequence_packing:
+    enabled: false
+  megatron_cfg:
+    pipeline_model_parallel_size: 8
+    expert_model_parallel_size: 16
+    num_layers_in_first_pipeline_stage: 7
+    num_layers_in_last_pipeline_stage: 6
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 16
+logger:
+  log_dir: logs/grpo-deepseek-v3-32n4g
+  wandb:
+    name: grpo-deepseek-v3-32n4g
+cluster:
+  gpus_per_node: 4
+  num_nodes: 32
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
new file mode 100644
index 0000000000..c019e34962
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
@@ -0,0 +1,22 @@
+defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-64n4g-async-1off
+policy:
+  sequence_packing:
+    enabled: false
+  megatron_cfg:
+    pipeline_model_parallel_size: 8
+    expert_model_parallel_size: 16
+    num_layers_in_first_pipeline_stage: 7
+    num_layers_in_last_pipeline_stage: 6
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 16
+      gpu_memory_utilization: 0.8
+logger:
+  log_dir: logs/grpo-deepseek-v3-64n4g-async-32T32G-1off
+  wandb:
+    name: grpo-deepseek-v3-64n4g-async-32T32G-1off
+cluster:
+  gpus_per_node: 4
+  num_nodes: 64
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
new file mode 100644
index 0000000000..b47aa65fb6
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
@@ -0,0 +1,21 @@
+defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off
+policy:
+  megatron_cfg:
+    fp8_cfg:
+      enabled: true
+      fp8: "e4m3"
+      fp8_recipe: "blockwise"
+      fp8_param: false
+    env_vars:
+      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 16
+      precision: "fp8"
+      use_deep_gemm: true
+logger:
+  log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off
+  wandb:
+    name: grpo-deepseek-v3-64n8g-fp8-async-1off
\ No newline at end of file
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
index b6d7ed441d..c0263f68fb 100644
--- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
@@ -9,6 +9,8 @@ loss_fn:
 checkpointing:
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off
 policy:
+  megatron_cfg:
+    pipeline_model_parallel_size: 1
   generation:
     colocated:
       enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
new file mode 100644
index 0000000000..b32786f7d7
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml
@@ -0,0 +1,20 @@
+defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
+policy:
+  megatron_cfg:
+    fp8_cfg:
+      enabled: true
+      fp8: "e4m3"
+      fp8_recipe: "blockwise"
+      fp8_param: false
+    env_vars:
+      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
+  generation:
+    vllm_cfg:
+      precision: "fp8"
+      use_deep_gemm: true
+logger:
+  log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
+  wandb:
+    name: grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
new file mode 100644
index 0000000000..0b1120b64a
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
@@ -0,0 +1,20 @@
+defaults: ./grpo-qwen3-235b-16n8g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-235b-16n4g
+policy:
+  sequence_packing:
+    enabled: false
+  megatron_cfg:
+    pipeline_model_parallel_size: 4
+    num_layers_in_first_pipeline_stage: 23
+    num_layers_in_last_pipeline_stage: 23
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 8
+logger:
+  log_dir: logs/grpo-qwen3-235b-16n4g
+  wandb:
+    name: grpo-qwen3-235b-16n4g
+cluster:
+  gpus_per_node: 4
+  num_nodes: 16
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
new file mode 100644
index 0000000000..ae8f4bb25b
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
@@ -0,0 +1,20 @@
+defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off
+policy:
+  sequence_packing:
+    enabled: false
+  megatron_cfg:
+    pipeline_model_parallel_size: 4
+    num_layers_in_first_pipeline_stage: 23
+    num_layers_in_last_pipeline_stage: 23
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 8
+logger:
+  log_dir: logs/grpo-qwen3-235b-32n4g-async-1off
+  wandb:
+    name: grpo-qwen3-235b-32n4g-async-1off
+cluster:
+  gpus_per_node: 4
+  num_nodes: 32

From 7f128544d4d1f17e57cdd0f8eb4f17095b9b2558 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Fri, 19 Dec 2025 10:11:35 -0800
Subject: [PATCH 2/9] Add tests to performance.txt

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 .../llm/performance/grpo-deepseek-v3-32n4g.sh | 45 +++++++++++++++++++
 .../grpo-deepseek-v3-64n4g-async-1off.sh      | 45 +++++++++++++++++++
 .../grpo-deepseek-v3-64n8g-fp8-async-1off.sh  | 45 +++++++++++++++++++
 ...lama3.1-8b-instruct-2n8g-fp8-async-1off.sh | 39 ++++++++++++++++
 .../llm/performance/grpo-qwen3-235b-16n4g.sh  | 40 +++++++++++++++++
 .../grpo-qwen3-235b-32n4g-async-1off.sh       | 40 +++++++++++++++++
 tests/test_suites/performance.txt             | 23 +++++++++-
 7 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100755 tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh

diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
new file mode 100755
index 0000000000..738b38dd5b
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
+export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=32
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo_math.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$MODEL_NAME \
+    policy.tokenizer.name=$MODEL_NAME \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
new file mode 100755
index 0000000000..14138486e1
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
+export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=64
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo_math.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$MODEL_NAME \
+    policy.tokenizer.name=$MODEL_NAME \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
new file mode 100755
index 0000000000..14138486e1
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
+export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=64
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo_math.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$MODEL_NAME \
+    policy.tokenizer.name=$MODEL_NAME \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
new file mode 100755
index 0000000000..e7636f3e93
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=2
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo_math.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
new file mode 100755
index 0000000000..0f9bf9289f
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=16
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo_math.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
new file mode 100755
index 0000000000..f7dac553af
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=32
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo_math.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt
index bf714b0e74..e96ba7e110 100644
--- a/tests/test_suites/performance.txt
+++ b/tests/test_suites/performance.txt
@@ -2,16 +2,37 @@
 # GRPO #
 ########
 
+# H100 BF16
+
+## SYNC
 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
 
+## ASYNC 1-off
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
 
-tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
\ No newline at end of file
+## ASYNC many-off
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
+
+# H100 FP8
+
+## ASYNC 1-off
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
+tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
+
+# GB200 BF16
+
+## SYNC
+tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
+
+## ASYNC 1-off
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh

From 7977eaa73c1174e867bea3791cce6fc969abb9f5 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Fri, 19 Dec 2025 10:11:51 -0800
Subject: [PATCH 3/9] Restore quantization_ignored_layer_kws in moonlight16b fp8 recipe

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 .../llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml      | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
index 98fbed9812..27108c55c7 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml
@@ -43,6 +43,10 @@ policy:
       precision: fp8
       use_deep_gemm: true
       gpu_memory_utilization: 0.5
+      quantization_ignored_layer_kws: [
+        a_proj,
+        b_proj
+      ]
 logger:
   monitor_gpus: false
   wandb:

From fedf77081fb4e4f95ee22a6ee7f3389cae54ce74 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Fri, 19 Dec 2025 10:28:36 -0800
Subject: [PATCH 4/9] fix deepseek fp8 recipe

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 .../performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml   | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
index b47aa65fb6..a9be31b136 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
@@ -8,6 +8,7 @@ policy:
       fp8: "e4m3"
       fp8_recipe: "blockwise"
       fp8_param: false
+    moe_router_dtype: fp32
     env_vars:
       NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
   generation:
@@ -15,6 +16,10 @@ policy:
       tensor_parallel_size: 16
       precision: "fp8"
       use_deep_gemm: true
+      quantization_ignored_layer_kws: [
+        a_proj,
+        b_proj
+      ]
 logger:
   log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off
   wandb:

From 0a3c512c0fb467a60ad540628681e8b3c6b73726 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Fri, 19 Dec 2025 15:41:58 -0800
Subject: [PATCH 5/9] Fix GB200 and FP8 perf recipe configs

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 .../llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml   | 3 +++
 .../performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml   | 4 +++-
 .../recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml       | 2 --
 .../llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml    | 5 +++--
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
index c019e34962..bf9a30a5d3 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml
@@ -10,6 +10,9 @@ policy:
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
   generation:
+    colocated:
+      resources:
+        gpus_per_node: 4
     vllm_cfg:
       tensor_parallel_size: 16
       gpu_memory_utilization: 0.8
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
index a9be31b136..7f6b5ae86b 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml
@@ -20,7 +20,9 @@ policy:
         a_proj,
         b_proj
       ]
+    vllm_kwargs:
+      max_num_seqs: 32
 logger:
   log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off
   wandb:
-    name: grpo-deepseek-v3-64n8g-fp8-async-1off
\ No newline at end of file
+    name: grpo-deepseek-v3-64n8g-fp8-async-1off
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
index 0b1120b64a..1640deda09 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml
@@ -2,8 +2,6 @@ defaults: ./grpo-qwen3-235b-16n8g.yaml
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-16n4g
 policy:
-  sequence_packing:
-    enabled: false
   megatron_cfg:
     pipeline_model_parallel_size: 4
     num_layers_in_first_pipeline_stage: 23
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
index ae8f4bb25b..f55b383686 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml
@@ -2,13 +2,14 @@ defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml
 checkpointing:
   checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off
 policy:
-  sequence_packing:
-    enabled: false
   megatron_cfg:
     pipeline_model_parallel_size: 4
     num_layers_in_first_pipeline_stage: 23
     num_layers_in_last_pipeline_stage: 23
   generation:
+    colocated:
+      resources:
+        gpus_per_node: 4
     vllm_cfg:
       tensor_parallel_size: 8
 logger:

From 477dddab7211ed0610921c99f18b3ba1adee7973 Mon Sep 17 00:00:00 2001
From: Seonjin <sna@nvidia.com>
Date: Fri, 19 Dec 2025 16:26:26 -0800
Subject: [PATCH 6/9] feat: GB200 Perf recipes for Qwen3-30B-A3B, Qwen3-32B,
 LLaMA3.1-8B (#1666)

---
 ...-llama3.1-8b-instruct-2n4g-async-1off.yaml | 32 ++++++++++
 .../grpo-llama3.1-8b-instruct-2n4g.yaml       | 58 +++++++++++++++++++
 .../performance/grpo-qwen3-30ba3b-4n4g.yaml   | 45 ++++++++++++++
 .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml    | 33 +++++++++++
 .../llm/performance/grpo-qwen3-32b-4n4g.yaml  | 42 ++++++++++++++
 .../grpo-qwen3-32b-8n4g-async-1off.yaml       | 33 +++++++++++
 ...po-llama3.1-8b-instruct-2n4g-async-1off.sh | 39 +++++++++++++
 .../grpo-llama3.1-8b-instruct-2n4g.sh         | 39 +++++++++++++
 .../llm/performance/grpo-qwen3-30ba3b-4n4g.sh | 40 +++++++++++++
 .../grpo-qwen3-30ba3b-8n4g-async-1off.sh      | 40 +++++++++++++
 .../llm/performance/grpo-qwen3-32b-4n4g.sh    | 40 +++++++++++++
 .../grpo-qwen3-32b-8n4g-async-1off.sh         | 39 +++++++++++++
 tests/test_suites/performance.txt             |  6 ++
 13 files changed, 486 insertions(+)
 create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml
 create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
 create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
 create mode 100644 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
 create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
 create mode 100644 tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh

diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
new file mode 100644
index 0000000000..d906eda2b4
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml
@@ -0,0 +1,32 @@
+defaults: ./grpo-llama3.1-8b-instruct-2n4g.yaml
+grpo:
+  async_grpo:
+    enabled: true
+    max_trajectory_age_steps: 1
+    in_flight_weight_updates: true
+loss_fn:
+  use_importance_sampling_correction: true
+checkpointing:
+  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    sequence_parallel: false
+  generation:
+    colocated:
+      enabled: false
+      resources:
+        num_nodes: 1
+        gpus_per_node: 4
+    vllm_cfg:
+      async_engine: true
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.8
+logger:
+  log_dir: logs/grpo-llama3.1-8b-instruct-2n4g-async-1off
+  wandb:
+    name: grpo-llama3.1-8b-instruct-2n4g-async-1off
+cluster:
+  gpus_per_node: 4
+  num_nodes: 2
diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml
new file mode 100644
index 0000000000..a99f7c1498
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml
@@ -0,0 +1,58 @@
+defaults: ../../../grpo_math_1B.yaml
+grpo:
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 32
+  max_num_steps: 500
+loss_fn:
+  use_importance_sampling_correction: true
+checkpointing:
+  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g
+policy:
+  model_name: meta-llama/Llama-3.1-8B-Instruct
+  tokenizer:
+    name: meta-llama/Llama-3.1-8B-Instruct
+  train_micro_batch_size: 1
+  logprob_batch_size: 2
+  max_total_sequence_length: 4096
+  make_sequence_length_divisible_by: 1
+  dtensor_cfg:
+    enabled: false
+  megatron_cfg:
+    enabled: true
+    empty_unused_memory_level: 1
+    converter_type: LlamaForCausalLM
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    sequence_parallel: false
+    activation_checkpointing: true
+    defer_fp32_logits: true
+    optimizer:
+      lr: 5.0e-07
+      min_lr: 5.0e-08
+      weight_decay: 0.0
+      use_precision_aware_optimizer: true
+    scheduler:
+      lr_warmup_iters: 2
+      lr_warmup_init: 5.0e-08
+    fp8_cfg:
+      enabled: false
+  generation:
+    max_new_tokens: 4096
+    stop_token_ids:
+    - 128009
+    vllm_cfg:
+      max_model_len: 4096
+      tensor_parallel_size: 1
+data:
+  max_input_seq_length: 4096
+logger:
+  log_dir: logs/grpo-llama3.1-8b-instruct-2n4g
+  wandb_enabled: true
+  tensorboard_enabled: true
+  wandb:
+    project: nemo-rl
+    name: grpo-llama3.1-8b-instruct-2n4g
+cluster:
+  gpus_per_node: 4
+  num_nodes: 2
+
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml
new file mode 100644
index 0000000000..21b9746f4b
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml
@@ -0,0 +1,45 @@
+defaults: ../../../grpo_math_1B.yaml
+grpo:
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 32
+checkpointing:
+  enabled: false
+  checkpoint_dir: results/grpo-qwen3-30ba3b-4n4g
+policy:
+  model_name: Qwen/Qwen3-30B-A3B
+  train_micro_batch_size: 1
+  max_total_sequence_length: 4096
+  dtensor_cfg:
+    enabled: false
+  optimizer: null
+  scheduler: null
+  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_cfg:
+    enabled: true
+    empty_unused_memory_level: 1
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 16
+    sequence_parallel: false
+    optimizer:
+      lr: 3.0e-07
+      min_lr: 3.0e-08
+    scheduler:
+      lr_warmup_iters: 50
+      lr_warmup_init: 3.0e-08
+    env_vars:
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 1
+logger:
+  log_dir: logs/grpo-qwen3-30ba3b-4n4g
+  wandb_enabled: true
+  tensorboard_enabled: true
+  wandb:
+    project: nemo-rl
+    name: grpo-qwen3-30ba3b-4n4g
+cluster:
+  gpus_per_node: 4
+  num_nodes: 4
+
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
new file mode 100644
index 0000000000..a9837c87f2
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
@@ -0,0 +1,33 @@
+defaults: ./grpo-qwen3-30ba3b-4n4g.yaml
+grpo:
+  async_grpo:
+    enabled: true
+    max_trajectory_age_steps: 1
+    in_flight_weight_updates: true
+loss_fn:
+  use_importance_sampling_correction: true
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 16
+    sequence_parallel: false
+  generation:
+    colocated:
+      enabled: false
+      resources:
+        num_nodes: 4
+        gpus_per_node: 4
+    vllm_cfg:
+      async_engine: true
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.8
+logger:
+  log_dir: logs/grpo-qwen3-30ba3b-8n4g-async-1off
+  wandb:
+    name: grpo-qwen3-30ba3b-8n4g-async-1off
+cluster:
+  gpus_per_node: 4
+  num_nodes: 8
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml
new file mode 100644
index 0000000000..2e441cdb5f
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml
@@ -0,0 +1,42 @@
+defaults: ../../../grpo_math_1B.yaml
+grpo:
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 32
+checkpointing:
+  enabled: false
+  checkpoint_dir: results/grpo-qwen3-32b-4n4g
+policy:
+  model_name: Qwen/Qwen3-32B
+  train_micro_batch_size: 1
+  max_total_sequence_length: 4096
+  dtensor_cfg:
+    enabled: false
+  optimizer: null
+  scheduler: null
+  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  megatron_cfg:
+    enabled: true
+    empty_unused_memory_level: 1
+    tensor_model_parallel_size: 2
+    pipeline_model_parallel_size: 1
+    sequence_parallel: true
+    optimizer:
+      lr: 3.0e-07
+      min_lr: 3.0e-08
+    scheduler:
+      lr_warmup_iters: 2
+      lr_warmup_init: 3.0e-08
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 1
+logger:
+  log_dir: logs/grpo-qwen3-32b-4n4g
+  wandb_enabled: true
+  tensorboard_enabled: true
+  wandb:
+    project: nemo-rl
+    name: grpo-qwen3-32b-4n4g
+cluster:
+  gpus_per_node: 4
+  num_nodes: 4
+
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
new file mode 100644
index 0000000000..4f8a0a03bb
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml
@@ -0,0 +1,33 @@
+defaults: ./grpo-qwen3-32b-4n4g.yaml
+grpo:
+  async_grpo:
+    enabled: true
+    max_trajectory_age_steps: 1
+    in_flight_weight_updates: true
+loss_fn:
+  use_importance_sampling_correction: true
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 2
+    pipeline_model_parallel_size: 1
+    sequence_parallel: true
+  generation:
+    colocated:
+      enabled: false
+      resources:
+        num_nodes: 4
+        gpus_per_node: 4
+    vllm_cfg:
+      async_engine: true
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.8
+logger:
+  log_dir: logs/grpo-qwen3-32b-8n4g-async-1off
+  wandb:
+    name: grpo-qwen3-32b-8n4g-async-1off
+cluster:
+  gpus_per_node: 4
+  num_nodes: 8
+
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
new file mode 100755
index 0000000000..e7636f3e93
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=2
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; "$@" forwards caller-supplied overrides without re-splitting them
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
new file mode 100755
index 0000000000..e7636f3e93
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=2
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; "$@" forwards caller-supplied overrides without re-splitting them
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
new file mode 100755
index 0000000000..2a56609ffd
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=4
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; "$@" forwards caller-supplied overrides without re-splitting them
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
+
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
new file mode 100644
index 0000000000..8350d128e8
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=8
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; "$@" forwards caller-supplied overrides without re-splitting them
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
+
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
new file mode 100755
index 0000000000..2a56609ffd
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=4
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; "$@" forwards caller-supplied overrides without re-splitting them
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
+
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
new file mode 100644
index 0000000000..35d58c98f7
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=8
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; "$@" forwards caller-supplied overrides without re-splitting them
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt
index e96ba7e110..2bc3e13efd 100644
--- a/tests/test_suites/performance.txt
+++ b/tests/test_suites/performance.txt
@@ -30,9 +30,15 @@ tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.
 # GB200 BF16
 
 ## SYNC
+tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
 
 ## ASYNC 1-off
+tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh

From dd196a642ee71def344ba93b9fd5f1b8c7942387 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@login-lyris02.lyris.clusters.nvidia.com>
Date: Fri, 19 Dec 2025 16:28:34 -0800
Subject: [PATCH 7/9] Separate gb200 and h100 perf test

Signed-off-by: Guyue Huang <guyueh@login-lyris02.lyris.clusters.nvidia.com>
---
 tests/test_suites/performance_gb200.txt       | 19 +++++++++++++++++++
 .../{performance.txt => performance_h100.txt} | 15 ---------------
 2 files changed, 19 insertions(+), 15 deletions(-)
 create mode 100644 tests/test_suites/performance_gb200.txt
 rename tests/test_suites/{performance.txt => performance_h100.txt} (58%)

diff --git a/tests/test_suites/performance_gb200.txt b/tests/test_suites/performance_gb200.txt
new file mode 100644
index 0000000000..d958386001
--- /dev/null
+++ b/tests/test_suites/performance_gb200.txt
@@ -0,0 +1,19 @@
+########
+# GRPO #
+########
+
+# GB200 BF16
+
+## SYNC
+tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
+tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
+
+## ASYNC 1-off
+tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance_h100.txt
similarity index 58%
rename from tests/test_suites/performance.txt
rename to tests/test_suites/performance_h100.txt
index 2bc3e13efd..9e3eb208ce 100644
--- a/tests/test_suites/performance.txt
+++ b/tests/test_suites/performance_h100.txt
@@ -27,18 +27,3 @@ tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh
 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh
 
-# GB200 BF16
-
-## SYNC
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
-tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
-tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
-tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
-
-## ASYNC 1-off
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh

From 22dc2ba6cae56278339dc4f4d598d3f0888d8431 Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Fri, 19 Dec 2025 17:18:42 -0800
Subject: [PATCH 8/9] Fix unit test

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 tests/unit/test_recipes_and_test_suites.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 435742ec4a..101f4b21e5 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -28,7 +28,8 @@
 
 nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt")
 release_test_suite_path = os.path.join(test_suites_dir, "release.txt")
-performance_test_suite_path = os.path.join(test_suites_dir, "performance.txt")
+h100_performance_test_suite_path = os.path.join(test_suites_dir, "performance_h100.txt")
+gb200_performance_test_suite_path = os.path.join(test_suites_dir, "performance_gb200.txt")
 
 # Relative to project root
 ALGO_MAPPING_TO_BASE_YAML = {
@@ -72,7 +73,12 @@ def release_test_suite():
 @pytest.fixture
 def performance_test_suite():
     performance_suite = []
-    with open(performance_test_suite_path, "r") as f:
+    with open(h100_performance_test_suite_path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#"):
+                performance_suite.append(line)
+    with open(gb200_performance_test_suite_path, "r") as f:
         for line in f:
             line = line.strip()
             if line and not line.startswith("#"):
@@ -104,12 +110,14 @@ def all_recipe_yaml_rel_paths():
     [
         nightly_test_suite_path,
         release_test_suite_path,
-        performance_test_suite_path,
+        h100_performance_test_suite_path,
+        gb200_performance_test_suite_path,
     ],
     ids=[
         "nightly_test_suite",
         "release_test_suite",
-        "performance_test_suite",
+        "h100_performance_test_suite",
+        "gb200_performance_test_suite",
     ],
 )
 def test_test_suites_exist(test_suite_path):

From f75a469fc64e191375d87d4c5aeaee0b74ec74aa Mon Sep 17 00:00:00 2001
From: Guyue Huang <guyueh@nvidia.com>
Date: Fri, 19 Dec 2025 20:27:20 -0800
Subject: [PATCH 9/9] Fix lint and unit test

Signed-off-by: Guyue Huang <guyueh@nvidia.com>
---
 .../llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh      | 0
 .../llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh         | 0
 tests/unit/test_recipes_and_test_suites.py                    | 4 +++-
 3 files changed, 3 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
 mode change 100644 => 100755 tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh

diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
old mode 100644
new mode 100755
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
old mode 100644
new mode 100755
diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 101f4b21e5..ade6d49d87 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -29,7 +29,9 @@
 nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt")
 release_test_suite_path = os.path.join(test_suites_dir, "release.txt")
 h100_performance_test_suite_path = os.path.join(test_suites_dir, "performance_h100.txt")
-gb200_performance_test_suite_path = os.path.join(test_suites_dir, "performance_gb200.txt")
+gb200_performance_test_suite_path = os.path.join(
+    test_suites_dir, "performance_gb200.txt"
+)
 
 # Relative to project root
 ALGO_MAPPING_TO_BASE_YAML = {