From f7d4f23623039879c4af161e42b54d472f5e8236 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Tue, 6 Jan 2026 08:13:36 +0000
Subject: [PATCH 1/5] fix: mcore generation config restored in nightly test

Signed-off-by: Terry Kong
---
 examples/configs/grpo_math_1B.yaml          | 9 +++++++++
 examples/configs/grpo_math_1B_megatron.yaml | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 1dd9639472..fafeab31d1 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -216,6 +216,15 @@ policy:
     top_k: null
     stop_token_ids: null
     stop_strings: null
+    mcore_generation_config:
+      buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
+      buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
+      num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
+      block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
+      use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
+      enable_chunked_prefill: true # Split long prefills into chunks for better memory management
+      unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+      max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step
     vllm_cfg:
       async_engine: false
       precision: ${policy.precision}
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index 95d85f74c7..b7bf992f67 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -150,7 +150,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true # Split long prefills into chunks for better memory management
       unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: 16384 # Maximum number of tokens to use in a single step
+      max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step

     vllm_cfg:
       tensor_parallel_size: 1

From 45d0ad391f2affe006a09249b3260ae70b04388f Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Tue, 6 Jan 2026 20:12:42 +0000
Subject: [PATCH 2/5] fix

Signed-off-by: Terry Kong
---
 examples/configs/grpo_math_1B.yaml            |  2 +-
 examples/configs/grpo_math_1B_megatron.yaml   |  2 +-
 .../policy/workers/megatron_policy_worker.py  | 53 +++++++++++++------
 3 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index fafeab31d1..c9a222a3b4 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -224,7 +224,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true # Split long prefills into chunks for better memory management
       unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step
+      max_tokens: 16834 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
     vllm_cfg:
       async_engine: false
       precision: ${policy.precision}
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index b7bf992f67..4b97604c1c 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -150,7 +150,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true # Split long prefills into chunks for better memory management
       unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step
+      max_tokens: 16834 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens

     vllm_cfg:
       tensor_parallel_size: 1
diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py
index 66767822e6..291d2b78cb 100644
--- a/nemo_rl/models/policy/workers/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py
@@ -19,7 +19,7 @@
 from collections import defaultdict
 from contextlib import AbstractContextManager, contextmanager, nullcontext
 from functools import partial
-from typing import Any, Iterator, Optional, TypeVar, cast
+from typing import Any, Iterator, Optional, TypedDict, TypeVar, cast

 import ray
 import torch
@@ -145,6 +145,27 @@
 TokenizerType = TypeVar("TokenizerType", bound=PreTrainedTokenizerBase)


+class MegatronGenerationConfig(TypedDict):
+    # Total GPU memory (in GB) allocated for KV cache buffers
+    buffer_size_gb: int
+    # Fraction of buffer reserved for guaranteed active requests
+    buffer_guaranteed_fraction: float
+    # Number of CUDA graphs to pre-compile for different batch sizes
+    num_cuda_graphs: int
+    # Size of each KV cache block in tokens (affects memory granularity)
+    block_size_tokens: int
+    # Enable CUDA graphs for prefill/context processing
+    use_cuda_graphs_for_non_decode_steps: bool
+    # Split long prefills into chunks for better memory management
+    enable_chunked_prefill: bool
+    # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+    unified_memory_level: int
+    # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens.
+    # Can cause OOM if set too high so should be tuned with buffer_size_gb if OOMing. If set too
+    # low, then will only do 512 tokens at a time, which can be slow.
+    max_tokens: int
+
+
 def broadcast_object_across_pp_ranks(obj):
     """Broadcast an object across pipeline parallel ranks.

@@ -1820,22 +1841,22 @@ def generate(
         )
         from megatron.core.inference.sampling_params import SamplingParams

-        mcore_generation_config = self.cfg["generation"]["mcore_generation_config"]
-        buffer_size_gb = mcore_generation_config.get("buffer_size_gb", 20)
-
-        num_cuda_graphs = mcore_generation_config.get("num_cuda_graphs", 16)
-        block_size_tokens = mcore_generation_config.get("block_size_tokens", 256)
-        use_cuda_graphs_for_non_decode_steps = mcore_generation_config.get(
-            "use_cuda_graphs_for_non_decode_steps", True
-        )
-        enable_chunked_prefill = mcore_generation_config.get(
-            "enable_chunked_prefill", True
+        mcore_generation_config = cast(
+            MegatronGenerationConfig, self.cfg["generation"]["mcore_generation_config"]
         )
-        unified_memory_level = mcore_generation_config.get("unified_memory_level", 0)
-        buffer_guaranteed_fraction = mcore_generation_config.get(
-            "buffer_guaranteed_fraction", 0.1
-        )
-        max_tokens = mcore_generation_config.get("max_tokens", 16384)
+        buffer_size_gb = mcore_generation_config["buffer_size_gb"]
+
+        num_cuda_graphs = mcore_generation_config["num_cuda_graphs"]
+        block_size_tokens = mcore_generation_config["block_size_tokens"]
+        use_cuda_graphs_for_non_decode_steps = mcore_generation_config[
+            "use_cuda_graphs_for_non_decode_steps"
+        ]
+        enable_chunked_prefill = mcore_generation_config["enable_chunked_prefill"]
+        unified_memory_level = mcore_generation_config["unified_memory_level"]
+        buffer_guaranteed_fraction = mcore_generation_config[
+            "buffer_guaranteed_fraction"
+        ]
+        max_tokens = mcore_generation_config["max_tokens"]

         model_config = self.model.config
         model_config.cuda_graph_impl = "local"

From 1fce97884dfa36dd70fb45c0ec4a825f74147ece Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 7 Jan 2026 06:48:09 +0000
Subject: [PATCH 3/5] 16384

Signed-off-by: Terry Kong
---
 examples/configs/grpo_math_1B.yaml          | 2 +-
 examples/configs/grpo_math_1B_megatron.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index c9a222a3b4..ec8c2c5ecc 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -224,7 +224,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true # Split long prefills into chunks for better memory management
       unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: 16834 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
     vllm_cfg:
       async_engine: false
       precision: ${policy.precision}
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index 4b97604c1c..1a14b8ce64 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -150,7 +150,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true # Split long prefills into chunks for better memory management
       unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: 16834 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens

     vllm_cfg:
       tensor_parallel_size: 1

From 868a898e212316da8d69ed489df80ca4e9ff0a9e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 8 Jan 2026 01:58:35 +0000
Subject: [PATCH 4/5] 17.5 step time

Signed-off-by: Terry Kong
---
 .../llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
index 4fa8068017..d0ef64efac 100755
--- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
+++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
@@ -34,11 +34,12 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    # total_step_time observed around ~16, so 17.5 for buffer
     uv run tests/check_metrics.py $JSON_METRICS \
         'median(data["train/token_mult_prob_error"]) < 1.1' \
         'data["train/token_mult_prob_error"]["500"] < 1.1' \
         'data["train/reward"]["500"] > 0.1' \
-        'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'
+        'mean(data["timing/train/total_step_time"], -6, -1) < 17.5'

     # Clean up checkpoint directory after successful run to save space.
     rm -rf "$CKPT_DIR"

From adc02f3c3837942e4aaf9acbdb01306f44b1af9e Mon Sep 17 00:00:00 2001
From: Yuki Huang
Date: Wed, 7 Jan 2026 21:24:22 -0800
Subject: [PATCH 5/5] fix unit test

Signed-off-by: Yuki Huang
---
 tests/unit/models/policy/test_megatron_worker.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 565995985c..5f09460cfb 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -90,6 +90,10 @@ def create_megatron_test_config(
                 "buffer_size_gb": 20,
                 "buffer_guaranteed_fraction": 0.1,
                 "num_cuda_graphs": 16,
+                "block_size_tokens": 256,
+                "use_cuda_graphs_for_non_decode_steps": True,
+                "enable_chunked_prefill": True,
+                "unified_memory_level": 0,
                 "max_tokens": 16384,
             },
             "colocated": {