From 227438368f85f77f615d7faadd56de921b0bee93 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Fri, 20 Feb 2026 11:23:28 -0800
Subject: [PATCH] Fix adv estimator configs

Previously there were cases where the grpo configs and adv_estimator configs did not match

Signed-off-by: Yi-Fu Wu
---
 examples/configs/grpo_math_1B.yaml | 4 ++--
 examples/configs/vlm_grpo_3B.yaml | 4 ++--
 examples/configs/vlm_grpo_3B_megatron.yaml | 4 ++--
 .../grpo_workplace_assistant_nemotron_nano_v2_9b.yaml | 4 ++--
 research/template_project/configs/grpo_math_1B.yaml | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 11aebb9e84..740f9ad24b 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -28,8 +28,8 @@ grpo:
   # Options: "grpo" (default) or "reinforce_plus_plus"
   adv_estimator:
     name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
-    normalize_rewards: true
-    use_leave_one_out_baseline: false
+    normalize_rewards: ${grpo.normalize_rewards}
+    use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
     minus_baseline: true # Reinforce++-baseline specific: subtract per-prompt mean baseline
   reward_scaling:
     enabled: false
diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml
index a6522d6c8e..4cad631c85 100644
--- a/examples/configs/vlm_grpo_3B.yaml
+++ b/examples/configs/vlm_grpo_3B.yaml
@@ -26,8 +26,8 @@ grpo:
   # Options: "grpo" (default) or "reinforce_plus_plus"
   adv_estimator:
     name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
-    normalize_rewards: true
-    use_leave_one_out_baseline: false
+    normalize_rewards: ${grpo.normalize_rewards}
+    use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
     minus_baseline: true # Reinforce++-baseline specific: subtract per-prompt mean baseline
   reward_scaling:
     enabled: false
diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml
index a38b6e15a8..336d97d79b 100644
--- a/examples/configs/vlm_grpo_3B_megatron.yaml
+++ b/examples/configs/vlm_grpo_3B_megatron.yaml
@@ -24,8 +24,8 @@ grpo:
   # Options: "grpo" (default) or "reinforce_plus_plus"
   adv_estimator:
     name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
-    normalize_rewards: true
-    use_leave_one_out_baseline: false
+    normalize_rewards: ${grpo.normalize_rewards}
+    use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
     minus_baseline: true # Reinforce++-baseline specific: subtract per-prompt mean baseline
   reward_scaling:
     enabled: false
diff --git a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
index 87bc70d365..a923f842b7 100644
--- a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
+++ b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
@@ -25,8 +25,8 @@ grpo:
   # Options: "grpo" (default) or "reinforce_plus_plus"
   adv_estimator:
     name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
-    normalize_rewards: true
-    use_leave_one_out_baseline: false
+    normalize_rewards: ${grpo.normalize_rewards}
+    use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
     minus_baseline: true # Reinforce++-baseline specific: subtract per-prompt mean baseline
   reward_scaling:
     enabled: false
diff --git a/research/template_project/configs/grpo_math_1B.yaml b/research/template_project/configs/grpo_math_1B.yaml
index 758a1def74..58c452838c 100644
--- a/research/template_project/configs/grpo_math_1B.yaml
+++ b/research/template_project/configs/grpo_math_1B.yaml
@@ -18,8 +18,8 @@ grpo:
   # Options: "grpo" (default) or "reinforce_plus_plus"
   adv_estimator:
     name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
-    normalize_rewards: true
-    use_leave_one_out_baseline: false
+    normalize_rewards: ${grpo.normalize_rewards}
+    use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
     minus_baseline: true # Reinforce++-baseline specific: subtract per-prompt mean baseline
   async_grpo:
     enabled: false # Set to true to enable async training mode