From 14e9db787a7852546cb6a87c06ae75a3ba80fa13 Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Tue, 9 Dec 2025 15:30:44 -0800 Subject: [PATCH 01/17] Update nemo-rl to latest Signed-off-by: Sadegh Mahdavi --- dockerfiles/Dockerfile.nemo-rl | 75 +++++++-- .../training/nemo_rl/configs/grpo.yaml | 148 ++++++++++-------- nemo_skills/training/nemo_rl/configs/sft.yaml | 34 ++-- nemo_skills/utils.py | 7 +- 4 files changed, 165 insertions(+), 99 deletions(-) diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl index b339b14eea..ee00953949 100644 --- a/dockerfiles/Dockerfile.nemo-rl +++ b/dockerfiles/Dockerfile.nemo-rl @@ -2,7 +2,16 @@ # TODO: from next update try to re-use their dockerfile as is as they support specifying the commit ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 + +FROM scratch AS nemo-rl + +ARG NRL_GIT_REF=main +ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / + + FROM ${BASE_IMAGE} AS base +# An environment variable to indicate that we are in a container. +ENV NRL_CONTAINER=1 # It is more convenient for users to run as root USER root @@ -34,7 +43,7 @@ rm -rf /var/lib/apt/lists/* EOF # Install uv and python -ARG UV_VERSION=0.7.2 +ARG UV_VERSION=0.9.7 ARG PYTHON_VERSION=3.12 ENV PATH="/root/.local/bin:$PATH" RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \ @@ -43,36 +52,44 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \ # Disable usage stats by default for users who are sensitive to sharing usage. # Users are encouraged to enable if the wish. ENV RAY_USAGE_STATS_ENABLED=0 +# After ray>=2.47, this feature is enabled by default which creates uv venvs for any py_executable starting with `uv run`. 
+# There are severe contention and performance issues with this enabled considering our dependencies are so large and occasionally +# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task. +ENV RAY_ENABLE_UV_RUN_RUNTIME_ENV=0 ENV NEMO_RL_VENV_DIR=/opt/ray_venvs FROM base AS hermetic -ARG NEMO_RL_COMMIT -ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-85eeb8d059b0249cace427dd5dec9573107be224} - -RUN git clone https://github.com/NVIDIA-NeMo/RL.git /opt/NeMo-RL && cd /opt/NeMo-RL && git checkout ${NEMO_RL_COMMIT} && git submodule update --init --recursive - WORKDIR /opt/NeMo-RL # Variables to control the build of TE. If there are issues with parallelization, consider # setting these to 1. ARG MAX_JOBS ARG NVTE_BUILD_THREADS_PER_JOB +# Only use for custom vllm installs. Learn more at https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/use-custom-vllm.md +ARG BUILD_CUSTOM_VLLM ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv ENV UV_LINK_MODE=copy -# This step is to warm the uv cache with flash-attn without invalidating it due to COPY layers -# This layer has to be manually updated -RUN <<"EOF" bash -exu -uv venv ${UV_PROJECT_ENVIRONMENT} +# Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set) +ENV TORCH_CUDA_ARCH_LIST="9.0 10.0" -VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode symlink setuptools torch==2.7.0 psutil ninja --torch-backend=cu128 -VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode symlink flash-attn==2.7.4.post1 --no-build-isolation -EOF +# First copy only the dependency files +COPY --from=nemo-rl pyproject.toml uv.lock ./ +# Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist. 
+COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/ +COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh +COPY --from=nemo-rl --link research/ ./research/ +COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ RUN <<"EOF" bash -exu +uv venv --seed +if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then + bash tools/build-custom-vllm.sh + source 3rdparty/vllm/nemo-rl.env +fi # uv sync has a more reliable resolver than simple uv pip install which can fail # Sync each training + inference backend one at a time (since they may conflict) @@ -83,19 +100,45 @@ RUN <<"EOF" bash -exu uv sync --link-mode symlink --locked --no-install-project uv sync --link-mode symlink --locked --extra vllm --no-install-project uv sync --link-mode symlink --locked --extra mcore --no-install-project +uv sync --link-mode symlink --locked --extra automodel --no-install-project uv sync --link-mode symlink --locked --all-groups --no-install-project EOF -RUN VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode=symlink /opt/NeMo-RL/3rdparty/Megatron-LM-workspace/Megatron-LM ENV PATH="/opt/nemo_rl_venv/bin:$PATH" ENV NEMO_RL_VENV_DIR=/opt/ray_venvs +WORKDIR /opt/NeMo-RL +FROM hermetic AS release + +ARG NEMO_RL_COMMIT +ARG NVIDIA_BUILD_ID +ARG NVIDIA_BUILD_REF +ARG RC_DATE=00.00 +ARG TARGETARCH +ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9} +ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-} +ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-} +LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}" +LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}" -WORKDIR /opt/NeMo-RL ENV NEMO_RL_VENV_DIR=/opt/ray_venvs -# Copy in source and prefetch all virtual environments +# Copy in source from build context (defaults to cloned repo, can be overridden) +# Exclude pyproject.toml and uv.lock since those may be altered by build-custom-vllm.sh +COPY --from=nemo-rl --exclude=pyproject.toml --exclude=uv.lock . 
/opt/NeMo-RL +# Unshallow the repo to get the full history (in the case it was from the scratch layer). +# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history), +# so do a quick check before trying to unshallow. +RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py +# Generate container fingerprint for frozen environment support +# Store outside /opt/NeMo-RL to avoid being overwritten by user mounts +RUN python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint + +# NOTICES.txt file points to where the OSS source code is archived +RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo-rl/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \ + echo "For further inquiries or assistance, contact us at oss-requests@nvidia.com" >> NOTICES.txt + RUN git clone https://github.com/NVIDIA-NeMo/Skills.git /opt/NeMo-Skills && cd /opt/NeMo-Skills && uv pip install . 
diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml index 53428ca873..187c0ff686 100644 --- a/nemo_skills/training/nemo_rl/configs/grpo.yaml +++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml @@ -1,5 +1,4 @@ -# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/ab1b638a499308caea022648daaf6994d390cbde/examples/configs/grpo_math_1B.yaml - +# Copied and edited from https://github.com/NVIDIA-NeMo/RL/blob/64ab08df3edf25131959fc474b44ed5e36a1600b/examples/configs/grpo_math_1B.yaml # GRPO Algorithm Configuration grpo: num_prompts_per_step: 32 @@ -9,7 +8,7 @@ grpo: max_num_steps: 1000000 normalize_rewards: true use_leave_one_out_baseline: true - val_period: 0 + val_period: 0 # disabled val_at_start: false overlong_filtering: false max_val_samples: 256 @@ -34,9 +33,16 @@ grpo: enabled: false # Set to true to enable async training mode # Max age (in training steps) for trajectories used in training max_trajectory_age_steps: 1 + in_flight_weight_updates: false # Set to true to enable in-flight weight updates + recompute_kv_cache_after_weight_updates: false # Set to true to recompute kv cache after in-flight-weight-updates loss_fn: reference_policy_kl_penalty: 0.01 + # Can be set to k1, k2, k3 + # For more details, see http://joschu.net/blog/kl-approx.html + reference_policy_kl_type: "k3" + kl_input_clamp_value: 20.0 + kl_output_clamp_value: 10.0 ratio_clip_min: 0.2 ratio_clip_max: 0.2 ratio_clip_c: null @@ -48,28 +54,31 @@ loss_fn: truncated_importance_sampling_ratio: null sequence_level_importance_ratios: false token_level_loss: true + force_on_policy_ratio: false # Set to true to force ratio=1.0 (requires train_global_batch_size == num_prompts_per_step * num_generations_per_prompt) checkpointing: enabled: true checkpoint_dir: "results/grpo" - metric_name: "val_reward" + metric_name: "val:reward" # one of "val:" or "train:" followed by the metric name higher_is_better: true keep_top_k: 50 save_period: 10 
checkpoint_must_save_by: null + model_save_format: "safetensors" + save_consolidated: false policy: model_name: ??? tokenizer: name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true + hf_config_overrides: {} train_global_batch_size: 512 train_micro_batch_size: 4 generation_batch_size: 32 # Only used when generating using HF backend logprob_batch_size: 4 max_total_sequence_length: 512 precision: "bfloat16" - fsdp_offload_enabled: false activation_checkpointing_enabled: false refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB tensor_model_parallel_size: 1 @@ -78,8 +87,11 @@ policy: lr: 1e-6 weight_decay: 0.01 min_lr: 1e-6 + logprob_chunk_size: null + offload_optimizer_for_logprob: false # Only useful for non-colocated generation since colocated generation will always offload optimizer to cuda before refit dtensor_cfg: + _v2: true enabled: true cpu_offload: False sequence_parallel: false @@ -88,58 +100,9 @@ policy: context_parallel_size: ${policy.context_parallel_size} custom_parallel_plan: null - # dynamic_batching improves performance by ensuring logprob and training microbatches - # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length - # responses are sorted by sequence length and bucketed into microbatches with a total - # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the - # training and logprob stages respectively. - # We disable dynamic batching for Megatron as it is incompatible with Pipeline parallelism. - # Instead, we use sequence packing. 
- dynamic_batching: - enabled: False - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 - - sequence_packing: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 - #If this value is set to null, it will be automatically assigned in the code. - make_sequence_length_divisible_by: null - max_grad_norm: 1.0 - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: ${policy.lr} - weight_decay: ${policy.weight_decay} - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0 - end_factor: 1.0 - total_iters: 1 # must be >=1, here it keeps LR constant - - name: "torch.optim.lr_scheduler.CosineAnnealingLR" - kwargs: - T_max: ${grpo.max_num_steps} # total training steps - eta_min: ${policy.lr} # set min_lr = initial_lr -> constant schedule - - milestones: [0] # required to avoid config errors - - - megatron_cfg: enabled: true - empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs + empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
activation_checkpointing: false converter_type: "Qwen2ForCausalLM" tensor_model_parallel_size: ${policy.tensor_model_parallel_size} @@ -156,9 +119,12 @@ policy: moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo moe_permute_fusion: false - bias_activation_fusion: True #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True + # gives ~25% training perf speedup with sequence packing and apply_rope_fusion + bias_activation_fusion: True + defer_fp32_logits: False + moe_per_layer_logging: False optimizer: optimizer: "adam" @@ -204,7 +170,53 @@ policy: use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" + fp8_cfg: null + + env_vars: null + + # See docs/design-docs/sequence-packing-and-dynamic-batching.md + # for more details on dynamic batching and sequence packing. + dynamic_batching: + enabled: False + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + sequence_length_round: 64 + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} + max_grad_norm: 1.0 + + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: ${policy.lr} + weight_decay: ${policy.weight_decay} + betas: [0.9, 0.999] + eps: 1e-8 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False + scheduler: + - name: 
"torch.optim.lr_scheduler.LinearLR" + kwargs: + start_factor: 0.1 + end_factor: 1.0 + total_iters: 10 + - name: "torch.optim.lr_scheduler.ConstantLR" + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: [10] generation: backend: "vllm" @@ -215,18 +227,24 @@ policy: stop_token_ids: null stop_strings: null vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. + async_engine: false precision: ${policy.precision} + kv_cache_dtype: "auto" tensor_parallel_size: 1 pipeline_parallel_size: 1 expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} - enable_expert_parallel: false - enforce_eager: True # Set as True to avoid vllm bug - # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit - # For Gemma models, we need to use "auto" due to a vllm bug - load_format: dummy + # when enforce_eager is False, it is optional to set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy, + # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile + # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998 + enforce_eager: False + use_deep_gemm: False + num_last_layers_in_bf16: 0 + num_first_layers_in_bf16: 0 + enable_vllm_metrics_logger: true # Set to true to enable vLLM internal metrics logger, turn off for better performance + vllm_metrics_logger_interval: 0.5 # Interval in seconds to collect vLLM logger metrics + vllm_kwargs: {} colocated: # true: generation shares training GPUs # false: uses dedicated generation resources @@ -258,9 +276,9 @@ logger: num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal wandb_enabled: false tensorboard_enabled: false - mlflow_enabled: false + 
mlflow_enabled: false # Disable MLflow logging swanlab_enabled: false # Disable SwanLab logging - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: project: "grpo-dev" name: "grpo-dev-logger" @@ -275,5 +293,3 @@ logger: cluster: gpus_per_node: 1 num_nodes: 1 - -checkpoint_must_save_by: null diff --git a/nemo_skills/training/nemo_rl/configs/sft.yaml b/nemo_skills/training/nemo_rl/configs/sft.yaml index 20cc35ff8d..6781241939 100644 --- a/nemo_skills/training/nemo_rl/configs/sft.yaml +++ b/nemo_skills/training/nemo_rl/configs/sft.yaml @@ -16,7 +16,7 @@ sft: checkpointing: enabled: true checkpoint_dir: "results/sft" - metric_name: "val_loss" + metric_name: "val:val_loss" # one of "val:" or "train:" followed by the metric name higher_is_better: false keep_top_k: 50 save_period: 100 @@ -33,8 +33,7 @@ policy: train_micro_batch_size: 1 max_total_sequence_length: 4096 precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false + offload_optimizer_for_logprob: false tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 context_parallel_size: 1 @@ -43,9 +42,9 @@ policy: weight_decay: 0.01 min_lr: 1e-6 - dtensor_cfg: enabled: true + env_vars: {} cpu_offload: False sequence_parallel: ${policy.sequence_parallel} activation_checkpointing: false @@ -53,9 +52,10 @@ policy: context_parallel_size: ${policy.context_parallel_size} custom_parallel_plan: null - megatron_cfg: enabled: false + env_vars: {} + empty_unused_memory_level: 1 activation_checkpointing: false tensor_model_parallel_size: ${policy.tensor_model_parallel_size} expert_tensor_parallel_size: 1 @@ -72,10 +72,12 @@ policy: moe_router_bias_update_rate: 1e-3 moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + # gives ~25% training perf speedup with sequence packing and apply_rope_fusion 
bias_activation_fusion: True - apply_rope_fusion: True # Only used if position_embedding_type=rope layernorm_epsilon: 1e-6 - empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs + defer_fp32_logits: False + moe_per_layer_logging: False optimizer: optimizer: "adam" @@ -98,7 +100,7 @@ policy: use_distributed_optimizer: true use_precision_aware_optimizer: true - # clip_grad: ${policy.max_grad_norm} + clip_grad: ${policy.max_grad_norm} # optimizer cpu offload optimizer_cpu_offload: false @@ -117,22 +119,24 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" + use_custom_fsdp: false dynamic_batching: enabled: false - + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 sequence_packing: enabled: True train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} algorithm: "modified_first_fit_decreasing" sequence_length_round: 64 - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - #If this value is set to null, it will be automatically assigned in the code. 
- make_sequence_length_divisible_by: null - max_grad_norm: null + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} + max_grad_norm: 0.0 # Zero means no clipping optimizer: name: "torch.optim.AdamW" @@ -155,7 +159,7 @@ policy: - name: "torch.optim.lr_scheduler.CosineAnnealingLR" kwargs: T_max: ${sft.max_num_steps} # total training steps - eta_min: ${policy.lr} # set min_lr = initial_lr -> constant schedule + eta_min: ${policy.min_lr} # LR decays to policy.min_lr; the schedule is constant only when policy.min_lr equals policy.lr - milestones: [0] # required to avoid config errors diff --git a/nemo_skills/utils.py b/nemo_skills/utils.py index 18e6f63c73..db1956e9a0 100644 --- a/nemo_skills/utils.py +++ b/nemo_skills/utils.py @@ -25,8 +25,6 @@ from pathlib import Path from typing import Any, Callable, List, Optional, Union -import fire -from fire import decorators as fire_decorators from rich.logging import RichHandler # isort: off @@ -507,6 +505,11 @@ def check_no_extra_args_fire(): RuntimeError: If the function name is not found in the calling context. ValueError: If extra arguments are found that are not accepted by the function. """ + + # Need to import here since nemo-rl async GRPO data processing imports this file and does not have fire installed on its VLLM uv venv. 
+ import fire + from fire import decorators as fire_decorators + args = sys.argv[1:] # Extract the function name and its arguments from the command-line arguments function_name = args[0] From ee566dc21cf905a2f1aa294dbffc5c8bdf5f665a Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Tue, 9 Dec 2025 18:07:46 -0800 Subject: [PATCH 02/17] Update start_grpo Signed-off-by: Sadegh Mahdavi --- nemo_skills/training/nemo_rl/start_grpo.py | 73 +++++++++++++++++----- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/nemo_skills/training/nemo_rl/start_grpo.py b/nemo_skills/training/nemo_rl/start_grpo.py index afe7d3f7be..82e34b2dda 100644 --- a/nemo_skills/training/nemo_rl/start_grpo.py +++ b/nemo_skills/training/nemo_rl/start_grpo.py @@ -328,20 +328,65 @@ def main() -> None: master_config, ) = setup(config, tokenizer, dataset, val_dataset) - grpo_train( - policy, - policy_generation, - dataloader, - val_dataloader, - tokenizer, - loss_fn, - task_to_env, - val_task_to_env, - logger, - checkpointer, - grpo_state, - master_config, - ) + # Check if async mode is enabled + if "async_grpo" in config["grpo"] and config["grpo"]["async_grpo"]["enabled"]: + # Async GRPO does not support dynamic sampling, reward scaling, or reward shaping (DAPO features) + unsupported_features = [ + "use_dynamic_sampling", + "reward_scaling", + "reward_shaping", + ] + + for feature in unsupported_features: + if feature not in config["grpo"]: + continue + + if feature == "use_dynamic_sampling": + if config["grpo"][feature]: + raise NotImplementedError(f"{feature} is not supported with async GRPO") + else: + if config["grpo"][feature]["enabled"]: + raise NotImplementedError(f"{feature} is not supported with async GRPO") + + from nemo_rl.algorithms.grpo import async_grpo_train + + print("🚀 Running async GRPO training") + + async_config = config["grpo"]["async_grpo"] + # Run async GRPO training + async_grpo_train( + policy=policy, + policy_generation=policy_generation, + dataloader=dataloader, 
+ val_dataloader=val_dataloader, + tokenizer=tokenizer, + loss_fn=loss_fn, + task_to_env=task_to_env, + val_task_to_env=val_task_to_env, + logger=logger, + checkpointer=checkpointer, + grpo_save_state=grpo_state, + master_config=master_config, + max_trajectory_age_steps=async_config["max_trajectory_age_steps"], + ) + else: + print("🚀 Running synchronous GRPO training") + + # Run standard GRPO training + grpo_train( + policy, + policy_generation, + dataloader, + val_dataloader, + tokenizer, + loss_fn, + task_to_env, + val_task_to_env, + logger, + checkpointer, + grpo_state, + master_config, + ) if __name__ == "__main__": From 33cf653db2545e7c697cacb3f2c07ad382136540 Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Wed, 10 Dec 2025 11:17:38 -0800 Subject: [PATCH 03/17] legacy configs for nemo-rl Signed-off-by: Sadegh Mahdavi --- .../nemo_rl/configs/grpo-legacy-85eeb8d.yaml | 279 ++++++++++++++++++ .../nemo_rl/configs/sft-legacy-85eeb8d.yaml | 197 +++++++++++++ 2 files changed, 476 insertions(+) create mode 100644 nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml create mode 100644 nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml diff --git a/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml new file mode 100644 index 0000000000..53428ca873 --- /dev/null +++ b/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml @@ -0,0 +1,279 @@ +# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/ab1b638a499308caea022648daaf6994d390cbde/examples/configs/grpo_math_1B.yaml + +# GRPO Algorithm Configuration +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 # for multi-turn rollouts. 
Math Environments just have 1 turn (answering the question) + max_num_epochs: 1 + max_num_steps: 1000000 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 0 + val_at_start: false + overlong_filtering: false + max_val_samples: 256 + val_batch_size: 256 + seed: 42 + use_dynamic_sampling: false + dynamic_sampling_max_gen_batches: 10 + batch_multiplier: 1 + reward_shaping: + enabled: false + overlong_buffer_length: 128 + overlong_buffer_penalty: 1 + max_response_length: ${policy.max_total_sequence_length} + reward_scaling: + enabled: false + source_min: 0.0 + source_max: 1.0 + target_min: 0.0 + target_max: 1.0 + + async_grpo: + enabled: false # Set to true to enable async training mode + # Max age (in training steps) for trajectories used in training + max_trajectory_age_steps: 1 + +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + ratio_clip_c: null + # (default off) loss formulation improvements (docs/guides/grpo.md#loss) + use_on_policy_kl_approximation: false + # Async GRPO requires importance sampling correction enabled + # Set to true when async_grpo.enabled is true + use_importance_sampling_correction: false + truncated_importance_sampling_ratio: null + sequence_level_importance_ratios: false + token_level_loss: true + +checkpointing: + enabled: true + checkpoint_dir: "results/grpo" + metric_name: "val_reward" + higher_is_better: true + keep_top_k: 50 + save_period: 10 + checkpoint_must_save_by: null + +policy: + model_name: ??? 
+ tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 # Only used when generating using HF backend + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: "bfloat16" + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + lr: 1e-6 + weight_decay: 0.01 + min_lr: 1e-6 + + dtensor_cfg: + enabled: true + cpu_offload: False + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: ${policy.tensor_model_parallel_size} + context_parallel_size: ${policy.context_parallel_size} + custom_parallel_plan: null + + # dynamic_batching improves performance by ensuring logprob and training microbatches + # have a sufficient number of tokens to maximize GPU utilization. Specifically, variable length + # responses are sorted by sequence length and bucketed into microbatches whose total + # number of tokens is approximately 'train_mb_tokens' and 'logprob_mb_tokens' for the + # training and logprob stages respectively. + # We disable dynamic batching for Megatron as it is incompatible with Pipeline parallelism. + # Instead, we use sequence packing. 
+ dynamic_batching: + enabled: False + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + sequence_length_round: 64 + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + #If this value is set to null, it will be automatically assigned in the code. + make_sequence_length_divisible_by: null + max_grad_norm: 1.0 + + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: ${policy.lr} + weight_decay: ${policy.weight_decay} + betas: [0.9, 0.999] + eps: 1e-8 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False + + scheduler: + - name: "torch.optim.lr_scheduler.LinearLR" + kwargs: + start_factor: 1.0 + end_factor: 1.0 + total_iters: 1 # must be >=1, here it keeps LR constant + - name: "torch.optim.lr_scheduler.CosineAnnealingLR" + kwargs: + T_max: ${grpo.max_num_steps} # total training steps + eta_min: ${policy.lr} # set min_lr = initial_lr -> constant schedule + - milestones: [0] # required to avoid config errors + + + + megatron_cfg: + enabled: true + empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs + activation_checkpointing: false + converter_type: "Qwen2ForCausalLM" + tensor_model_parallel_size: ${policy.tensor_model_parallel_size} + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size} + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: ${policy.context_parallel_size} + 
pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false + bias_activation_fusion: True + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + + optimizer: + optimizer: "adam" + lr: ${policy.lr} + min_lr: ${policy.min_lr} + weight_decay: ${policy.weight_decay} + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + # optimizer cpu offload + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "cosine" + lr_decay_iters: ${grpo.max_num_steps} + lr_warmup_iters: 0 + lr_warmup_init: 1.0e-6 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + + + generation: + backend: "vllm" + max_new_tokens: ${policy.max_total_sequence_length} + temperature: 1.0 + top_p: 1.0 + top_k: null + stop_token_ids: null + stop_strings: null + vllm_cfg: + async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. 
+ precision: ${policy.precision} + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP + gpu_memory_utilization: 0.6 + max_model_len: ${policy.max_total_sequence_length} + enable_expert_parallel: false + enforce_eager: True # Set as True to avoid vllm bug + # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit + # For Gemma models, we need to use "auto" due to a vllm bug + load_format: dummy + colocated: + # true: generation shares training GPUs + # false: uses dedicated generation resources + enabled: true + # only relevant when enabled is false + resources: + gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 + num_nodes: null # Decides number of nodes to be dedicated to generation + +data: + max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len + shuffle: true + prompt: + prompt_config: ??? 
+ examples_type: null + config_dir: null + template_dir: null + train_data_path: null + val_data_path: null + num_workers: 10 + +env: + math: + env_cls: nemo_skills.training.nemo_rl.environments.math_environment.MathEnvironment + num_workers: 8 + +logger: + log_dir: "logs" # Base directory for all logs + num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + wandb_enabled: false + tensorboard_enabled: false + mlflow_enabled: false + swanlab_enabled: false # Disable SwanLab logging + monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "grpo-dev" + name: "grpo-dev-logger" + tensorboard: {} + mlflow: + experiment_name: "grpo-dev" + run_name: "grpo-dev-logger" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 1 + num_nodes: 1 + +checkpoint_must_save_by: null diff --git a/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml new file mode 100644 index 0000000000..20cc35ff8d --- /dev/null +++ b/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml @@ -0,0 +1,197 @@ +# SFT Algorithm Configuration +sft: + ## total number of steps to train will equal + ## min((max_num_epochs * len(train_dataloader)), max_num_steps) + # setting both to big values by default, so only one needs to be set + max_num_epochs: 100000000 + max_num_steps: 100000000 + + val_period: 0 + val_batches: 1 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: false + seed: 42 + +checkpointing: + enabled: true + checkpoint_dir: "results/sft" + metric_name: "val_loss" + higher_is_better: false + keep_top_k: 50 + save_period: 100 + checkpoint_must_save_by: null + + +policy: + model_name: ??? 
+ tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + chat_template: "infer_from_data" ## Can be: null (passthrough), "default" (tokenizer's default), "infer_from_data" (auto-detect from data), or custom jinja2 template + chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + precision: "bfloat16" + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + sequence_parallel: false + lr: 1e-6 + weight_decay: 0.01 + min_lr: 1e-6 + + + dtensor_cfg: + enabled: true + cpu_offload: False + sequence_parallel: ${policy.sequence_parallel} + activation_checkpointing: false + tensor_parallel_size: ${policy.tensor_model_parallel_size} + context_parallel_size: ${policy.context_parallel_size} + custom_parallel_plan: null + + + megatron_cfg: + enabled: false + activation_checkpointing: false + tensor_model_parallel_size: ${policy.tensor_model_parallel_size} + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size} + context_parallel_size: ${policy.context_parallel_size} + pipeline_dtype: ${policy.precision} + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + sequence_parallel: ${policy.sequence_parallel} + freeze_moe_router: false + moe_router_dtype: null + moe_router_load_balancing_type: "aux_loss" + moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false + #gives ~20% training perf speedup with sequence packing + bias_activation_fusion: True + apply_rope_fusion: True # Only used if position_embedding_type=rope + layernorm_epsilon: 1e-6 + empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while 
setting it to 2 to minimize the peak memory usage to avoid OOMs + + optimizer: + optimizer: "adam" + lr: ${policy.lr} + min_lr: ${policy.min_lr} + weight_decay: ${policy.weight_decay} + bf16: true # must be true to avoid checkpoint load error + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + # clip_grad: ${policy.max_grad_norm} + + # optimizer cpu offload + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "cosine" + lr_decay_iters: ${sft.max_num_steps} + lr_warmup_iters: 0 + lr_warmup_init: 1.0e-6 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + data_parallel_sharding_strategy: "optim_grads_params" + + dynamic_batching: + enabled: false + + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + #If this value is set to null, it will be automatically assigned in the code. 
+ make_sequence_length_divisible_by: null + max_grad_norm: null + + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: ${policy.lr} + weight_decay: ${policy.weight_decay} + betas: [0.9, 0.98] + eps: 1e-8 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False + + scheduler: + - name: "torch.optim.lr_scheduler.LinearLR" + kwargs: + start_factor: 1.0 + end_factor: 1.0 + total_iters: 1 # must be >=1, here it keeps LR constant + - name: "torch.optim.lr_scheduler.CosineAnnealingLR" + kwargs: + T_max: ${sft.max_num_steps} # total training steps + eta_min: ${policy.lr} # set min_lr = initial_lr -> constant schedule + - milestones: [0] # required to avoid config errors + + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: prompt_response_dataset + add_bos: false + add_eos: false + add_generation_prompt: false + input_key: input + output_key: output + force_reprocess: false + shuffle: true + num_workers: 10 + +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: false + swanlab_enabled: false # Disable SwanLab logging + monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard + num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + wandb: + project: "sft-dev" + name: "sft-dev-${data.dataset_name}" + tensorboard: + log_dir: "tb_logs-sft-dev-${data.dataset_name}" + mlflow: + experiment_name: "sft-dev" + run_name: "sft-dev-${data.dataset_name}" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + + +cluster: + gpus_per_node: 1 + num_nodes: 1 From f00cc8e704c9225b7725ea51b13ca5595bde5a4b Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Wed, 10 Dec 2025 14:45:12 
-0800 Subject: [PATCH 04/17] update dockerfile Signed-off-by: Sadegh Mahdavi --- dockerfiles/Dockerfile.nemo-rl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl index ee00953949..27d9b00ce9 100644 --- a/dockerfiles/Dockerfile.nemo-rl +++ b/dockerfiles/Dockerfile.nemo-rl @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1 # copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/main/docker/Dockerfile # TODO: from next update try to re-use their dockerfile as is as they support specifying the commit @@ -5,8 +6,8 @@ ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 FROM scratch AS nemo-rl -ARG NRL_GIT_REF=main -ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / +ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9} +ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NEMO_RL_COMMIT} / FROM ${BASE_IMAGE} AS base @@ -111,12 +112,10 @@ WORKDIR /opt/NeMo-RL FROM hermetic AS release -ARG NEMO_RL_COMMIT ARG NVIDIA_BUILD_ID ARG NVIDIA_BUILD_REF ARG RC_DATE=00.00 ARG TARGETARCH -ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9} ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-} ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-} LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}" From cf9c856b549cb883b5611153160262655a5a72df Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Thu, 15 Jan 2026 12:01:59 -0800 Subject: [PATCH 05/17] update nemo-rl to latest commit Signed-off-by: Sadegh Mahdavi --- dockerfiles/Dockerfile.nemo-rl | 10 +++++-- .../training/nemo_rl/configs/grpo.yaml | 26 +++++++++++----- nemo_skills/training/nemo_rl/configs/sft.yaml | 30 +++++++++++++++++++ 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl index 27d9b00ce9..9b66bf7f44 100644 --- a/dockerfiles/Dockerfile.nemo-rl +++ 
b/dockerfiles/Dockerfile.nemo-rl @@ -6,7 +6,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 FROM scratch AS nemo-rl -ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9} +ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-e95efb912a6909b5da91ffeb197debe91fd480d8} ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NEMO_RL_COMMIT} / @@ -38,6 +38,8 @@ apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos apt update apt install -y nsight-systems-cli +# To fix CVE-2025-68973 +apt install -y --only-upgrade gnupg apt-get clean rm -rf /var/lib/apt/lists/* @@ -85,7 +87,7 @@ COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh COPY --from=nemo-rl --link research/ ./research/ COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ -RUN <<"EOF" bash -exu +RUN --mount=type=ssh <<"EOF" bash -exu uv venv --seed if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then bash tools/build-custom-vllm.sh @@ -103,6 +105,10 @@ uv sync --link-mode symlink --locked --extra vllm --no-install-project uv sync --link-mode symlink --locked --extra mcore --no-install-project uv sync --link-mode symlink --locked --extra automodel --no-install-project uv sync --link-mode symlink --locked --all-groups --no-install-project + +# Remove the aiohttp in this uv cache dir to fully address CVE GHSA-mqqc-3gqh-h2x8 +# The ray install will include the older aiohttp version in its cache +find /root/.cache/uv -type d -path "*ray/_private/runtime_env/agent/thirdparty_files/aiohttp*" -exec rm -rf {} + EOF ENV PATH="/opt/nemo_rl_venv/bin:$PATH" diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml index 187c0ff686..5993e0430d 100644 --- a/nemo_skills/training/nemo_rl/configs/grpo.yaml +++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml @@ -1,4 +1,4 @@ -# Copied and edited from 
https://github.com/NVIDIA-NeMo/RL/blob/64ab08df3edf25131959fc474b44ed5e36a1600b/examples/configs/grpo_math_1B.yaml +# Copied and edited from https://github.com/NVIDIA-NeMo/RL/blob/e95efb912a6909b5da91ffeb197debe91fd480d8/examples/configs/grpo_math_1B.yaml # GRPO Algorithm Configuration grpo: num_prompts_per_step: 32 @@ -79,8 +79,6 @@ policy: logprob_batch_size: 4 max_total_sequence_length: 512 precision: "bfloat16" - activation_checkpointing_enabled: false - refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 context_parallel_size: 1 @@ -157,10 +155,10 @@ policy: start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} weight_decay_incr_style: "constant" - lr_decay_style: "cosine" - lr_decay_iters: ${grpo.max_num_steps} - lr_warmup_iters: 0 - lr_warmup_init: 1.0e-6 + lr_decay_style: "constant" + lr_decay_iters: 1000 + lr_warmup_iters: 13 + lr_warmup_init: 5.0e-7 distributed_data_parallel_config: grad_reduce_in_fp32: false @@ -226,7 +224,16 @@ policy: top_k: null stop_token_ids: null stop_strings: null - vllm_cfg: + mcore_generation_config: # When using megatron for generation + buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers + buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests + num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes + block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity) + use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing + enable_chunked_prefill: true # Split long prefills into chunks for better memory management + unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging) + max_tokens: 16384 # Maximum number of tokens to use in a single step. 
Analogous to vllm's max_num_batched_tokens + vllm_cfg: # When using vllm for generation async_engine: false precision: ${policy.precision} kv_cache_dtype: "auto" @@ -282,6 +289,9 @@ logger: wandb: project: "grpo-dev" name: "grpo-dev-logger" + swanlab: + project: "grpo-dev" + name: "grpo-dev-logger" tensorboard: {} mlflow: experiment_name: "grpo-dev" diff --git a/nemo_skills/training/nemo_rl/configs/sft.yaml b/nemo_skills/training/nemo_rl/configs/sft.yaml index 6781241939..3a8ed6158e 100644 --- a/nemo_skills/training/nemo_rl/configs/sft.yaml +++ b/nemo_skills/training/nemo_rl/configs/sft.yaml @@ -43,6 +43,7 @@ policy: min_lr: 1e-6 dtensor_cfg: + _v2: true enabled: true env_vars: {} cpu_offload: False @@ -52,6 +53,19 @@ policy: context_parallel_size: ${policy.context_parallel_size} custom_parallel_plan: null + # LoRA (Low-Rank Adaptation) Configuration + lora_cfg: + enabled: False # Set to True to enable LoRA fine-tuning + target_modules: [] # List of module names to apply LoRA (empty list with match_all_linear=true applies to all linear layers) + exclude_modules: [] # List of module names to exclude from LoRA + match_all_linear: true # If True, applies LoRA to all linear layers (overrides target_modules) + dim: 8 # LoRA rank (r): lower rank = fewer parameters but less capacity. Typical values: 4, 8, 16, 32, 64 + alpha: 32 # LoRA scaling factor: effective learning rate multiplier = alpha/dim. Typical values: 16, 32, 64 + dropout: 0.0 # Dropout probability applied to LoRA layers (0.0 = no dropout) + dropout_position: "post" # Where to apply dropout: "pre" (before LoRA) or "post" (after LoRA) + lora_A_init: "xavier" # Initialization method for LoRA A matrix: "xavier" or "uniform" + use_triton: true # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). 
Disable when tensor_parallel_size > 1 + megatron_cfg: enabled: false env_vars: {} @@ -79,6 +93,19 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False + peft: + enabled: false + target_modules: [] + exclude_modules: [] + dim: 8 + alpha: 32 + dropout: 0.0 + dropout_position: "post" + lora_A_init_method: "xavier" + lora_B_init_method: "zero" + a2a_experimental: false + lora_dtype: None + optimizer: optimizer: "adam" lr: ${policy.lr} @@ -186,6 +213,9 @@ logger: wandb: project: "sft-dev" name: "sft-dev-${data.dataset_name}" + swanlab: + project: "sft-dev" + name: "sft-dev-${data.dataset_name}" tensorboard: log_dir: "tb_logs-sft-dev-${data.dataset_name}" mlflow: From 53a9e93250fbe4ce42b552ad682c4c5144ee3eea Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Thu, 15 Jan 2026 12:04:13 -0800 Subject: [PATCH 06/17] add one more comment Signed-off-by: Sadegh Mahdavi --- nemo_skills/training/nemo_rl/configs/grpo.yaml | 2 +- nemo_skills/training/nemo_rl/configs/sft.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml index 5993e0430d..c6e30dd305 100644 --- a/nemo_skills/training/nemo_rl/configs/grpo.yaml +++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml @@ -190,7 +190,7 @@ policy: # makes the training sequence length divisible by the tensor parallel size # this is useful for sequence parallel training make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 + max_grad_norm: 1.0 # megatron: Zero means no clipping, FSDP: null means no clipping optimizer: name: "torch.optim.AdamW" diff --git a/nemo_skills/training/nemo_rl/configs/sft.yaml b/nemo_skills/training/nemo_rl/configs/sft.yaml index 3a8ed6158e..0c6d470e82 100644 --- a/nemo_skills/training/nemo_rl/configs/sft.yaml +++ b/nemo_skills/training/nemo_rl/configs/sft.yaml @@ -163,7 +163,7 @@ policy: # makes the training sequence length divisible by the 
tensor parallel size # this is useful for sequence parallel training make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 0.0 # Zero means no clipping + max_grad_norm: 0.0 # megatron: Zero means no clipping, FSDP: null means no clipping optimizer: name: "torch.optim.AdamW" From 95b50a1f4312bfd2a8baa6c081c5f6a615c57949 Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Tue, 27 Jan 2026 14:20:35 -0800 Subject: [PATCH 07/17] Remove legacy and rollback grpo configs Signed-off-by: Sadegh Mahdavi --- .../nemo_rl/configs/grpo-legacy-85eeb8d.yaml | 279 ------------------ .../training/nemo_rl/configs/grpo.yaml | 8 +- .../nemo_rl/configs/sft-legacy-85eeb8d.yaml | 197 ------------- 3 files changed, 4 insertions(+), 480 deletions(-) delete mode 100644 nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml delete mode 100644 nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml diff --git a/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml deleted file mode 100644 index 53428ca873..0000000000 --- a/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml +++ /dev/null @@ -1,279 +0,0 @@ -# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/ab1b638a499308caea022648daaf6994d390cbde/examples/configs/grpo_math_1B.yaml - -# GRPO Algorithm Configuration -grpo: - num_prompts_per_step: 32 - num_generations_per_prompt: 16 - max_rollout_turns: 1 # for multi-turn rollouts. 
Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 0 - val_at_start: false - overlong_filtering: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - use_dynamic_sampling: false - dynamic_sampling_max_gen_batches: 10 - batch_multiplier: 1 - reward_shaping: - enabled: false - overlong_buffer_length: 128 - overlong_buffer_penalty: 1 - max_response_length: ${policy.max_total_sequence_length} - reward_scaling: - enabled: false - source_min: 0.0 - source_max: 1.0 - target_min: 0.0 - target_max: 1.0 - - async_grpo: - enabled: false # Set to true to enable async training mode - # Max age (in training steps) for trajectories used in training - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - # Async GRPO requires importance sampling correction enabled - # Set to true when async_grpo.enabled is true - use_importance_sampling_correction: false - truncated_importance_sampling_ratio: null - sequence_level_importance_ratios: false - token_level_loss: true - -checkpointing: - enabled: true - checkpoint_dir: "results/grpo" - metric_name: "val_reward" - higher_is_better: true - keep_top_k: 50 - save_period: 10 - checkpoint_must_save_by: null - -policy: - model_name: ??? 
- tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true - train_global_batch_size: 512 - train_micro_batch_size: 4 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 - max_total_sequence_length: 512 - precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false - refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - lr: 1e-6 - weight_decay: 0.01 - min_lr: 1e-6 - - dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: ${policy.tensor_model_parallel_size} - context_parallel_size: ${policy.context_parallel_size} - custom_parallel_plan: null - - # dynamic_batching improves performance by ensuring logprob and training microbatches - # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length - # responses are sorted by sequence length and bucketed into microbatches with a total - # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the - # training and logprob stages respectively. - # We disable dynamic batching for Megatron as it is incompatible with Pipeline parallelism. - # Instead, we use sequence packing. 
- dynamic_batching: - enabled: False - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 - - sequence_packing: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 - #If this value is set to null, it will be automatically assigned in the code. - make_sequence_length_divisible_by: null - max_grad_norm: 1.0 - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: ${policy.lr} - weight_decay: ${policy.weight_decay} - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0 - end_factor: 1.0 - total_iters: 1 # must be >=1, here it keeps LR constant - - name: "torch.optim.lr_scheduler.CosineAnnealingLR" - kwargs: - T_max: ${grpo.max_num_steps} # total training steps - eta_min: ${policy.lr} # set min_lr = initial_lr -> constant schedule - - milestones: [0] # required to avoid config errors - - - - megatron_cfg: - enabled: true - empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs - activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: ${policy.tensor_model_parallel_size} - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: ${policy.context_parallel_size} - 
pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - bias_activation_fusion: True - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: ${policy.lr} - min_lr: ${policy.min_lr} - weight_decay: ${policy.weight_decay} - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "cosine" - lr_decay_iters: ${grpo.max_num_steps} - lr_warmup_iters: 0 - lr_warmup_init: 1.0e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - - - generation: - backend: "vllm" - max_new_tokens: ${policy.max_total_sequence_length} - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. 
- precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enable_expert_parallel: false - enforce_eager: True # Set as True to avoid vllm bug - # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit - # For Gemma models, we need to use "auto" due to a vllm bug - load_format: dummy - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - -data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - shuffle: true - prompt: - prompt_config: ??? 
- examples_type: null - config_dir: null - template_dir: null - train_data_path: null - val_data_path: null - num_workers: 10 - -env: - math: - env_cls: nemo_skills.training.nemo_rl.environments.math_environment.MathEnvironment - num_workers: 8 - -logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: false - mlflow_enabled: false - swanlab_enabled: false # Disable SwanLab logging - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - mlflow: - experiment_name: "grpo-dev" - run_name: "grpo-dev-logger" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - -cluster: - gpus_per_node: 1 - num_nodes: 1 - -checkpoint_must_save_by: null diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml index c6e30dd305..e48e704997 100644 --- a/nemo_skills/training/nemo_rl/configs/grpo.yaml +++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml @@ -155,7 +155,7 @@ policy: start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} weight_decay_incr_style: "constant" - lr_decay_style: "constant" + lr_decay_style: "cosine" # This is equivalent to constant unless min_lr is set to smaller value lr_decay_iters: 1000 lr_warmup_iters: 13 lr_warmup_init: 5.0e-7 @@ -210,10 +210,10 @@ policy: start_factor: 0.1 end_factor: 1.0 total_iters: 10 - - name: "torch.optim.lr_scheduler.ConstantLR" + - name: "torch.optim.lr_scheduler.CosineAnnealingLR" kwargs: - factor: 1.0 - total_iters: 10000000000 + T_max: ${grpo.max_num_steps} + eta_min: ${policy.min_lr} - milestones: [10] generation: diff --git 
a/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml deleted file mode 100644 index 20cc35ff8d..0000000000 --- a/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml +++ /dev/null @@ -1,197 +0,0 @@ -# SFT Algorithm Configuration -sft: - ## total number of steps to train will equal - ## min((max_num_epochs * len(train_dataloader)), max_num_steps) - # setting both to big values by default, so only one needs to be set - max_num_epochs: 100000000 - max_num_steps: 100000000 - - val_period: 0 - val_batches: 1 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: false - seed: 42 - -checkpointing: - enabled: true - checkpoint_dir: "results/sft" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 50 - save_period: 100 - checkpoint_must_save_by: null - - -policy: - model_name: ??? - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - chat_template: "infer_from_data" ## Can be: null (passthrough), "default" (tokenizer's default), "infer_from_data" (auto-detect from data), or custom jinja2 template - chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 4096 - precision: "bfloat16" - fsdp_offload_enabled: false - activation_checkpointing_enabled: false - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - sequence_parallel: false - lr: 1e-6 - weight_decay: 0.01 - min_lr: 1e-6 - - - dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: ${policy.sequence_parallel} - activation_checkpointing: false - tensor_parallel_size: ${policy.tensor_model_parallel_size} - context_parallel_size: ${policy.context_parallel_size} - custom_parallel_plan: null - - - megatron_cfg: - enabled: false - activation_checkpointing: 
false - tensor_model_parallel_size: ${policy.tensor_model_parallel_size} - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size} - context_parallel_size: ${policy.context_parallel_size} - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: ${policy.sequence_parallel} - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - bias_activation_fusion: True - apply_rope_fusion: True # Only used if position_embedding_type=rope - layernorm_epsilon: 1e-6 - empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs - - optimizer: - optimizer: "adam" - lr: ${policy.lr} - min_lr: ${policy.min_lr} - weight_decay: ${policy.weight_decay} - bf16: true # must be true to avoid checkpoint load error - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - # clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "cosine" - lr_decay_iters: ${sft.max_num_steps} - lr_warmup_iters: 0 - lr_warmup_init: 1.0e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: 
"optim_grads_params" - - dynamic_batching: - enabled: false - - - sequence_packing: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - #If this value is set to null, it will be automatically assigned in the code. - make_sequence_length_divisible_by: null - max_grad_norm: null - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: ${policy.lr} - weight_decay: ${policy.weight_decay} - betas: [0.9, 0.98] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0 - end_factor: 1.0 - total_iters: 1 # must be >=1, here it keeps LR constant - - name: "torch.optim.lr_scheduler.CosineAnnealingLR" - kwargs: - T_max: ${sft.max_num_steps} # total training steps - eta_min: ${policy.lr} # set min_lr = initial_lr -> constant schedule - - milestones: [0] # required to avoid config errors - - -data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: prompt_response_dataset - add_bos: false - add_eos: false - add_generation_prompt: false - input_key: input - output_key: output - force_reprocess: false - shuffle: true - num_workers: 10 - -logger: - log_dir: "logs" # Base directory for all logs - wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: true - mlflow_enabled: false - swanlab_enabled: false # Disable SwanLab logging - monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb: - project: "sft-dev" - name: "sft-dev-${data.dataset_name}" - tensorboard: - log_dir: 
"tb_logs-sft-dev-${data.dataset_name}" - mlflow: - experiment_name: "sft-dev" - run_name: "sft-dev-${data.dataset_name}" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - - -cluster: - gpus_per_node: 1 - num_nodes: 1 From 502df62f80cba18f930058af3ff0f683dd8a00a5 Mon Sep 17 00:00:00 2001 From: Sadegh Mahdavi Date: Tue, 27 Jan 2026 14:21:29 -0800 Subject: [PATCH 08/17] Remove legacy and rollback grpo configs Signed-off-by: Sadegh Mahdavi --- nemo_skills/training/nemo_rl/configs/grpo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml index e48e704997..9ec2af3358 100644 --- a/nemo_skills/training/nemo_rl/configs/grpo.yaml +++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml @@ -156,7 +156,7 @@ policy: end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} weight_decay_incr_style: "constant" lr_decay_style: "cosine" # This is equivalent to constant unless min_lr is set to smaller value - lr_decay_iters: 1000 + lr_decay_iters: ${grpo.max_num_steps} lr_warmup_iters: 13 lr_warmup_init: 5.0e-7 From 02f5e89a94263f1ea480443e638e9f833c99242b Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 17:35:28 -0800 Subject: [PATCH 09/17] Update conversion script Signed-off-by: Igor Gitman --- .../training/nemo_rl/convert_dcp_to_hf.py | 110 ++++++++++++++++-- 1 file changed, 101 insertions(+), 9 deletions(-) diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py index c54e5842b9..da28b51cf2 100644 --- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py +++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py @@ -16,11 +16,12 @@ # and added logic to figure out max step automatically import argparse +import glob import os import re +import shutil import yaml 
-from nemo_rl.utils.native_checkpoint import convert_dcp_to_hf def parse_args(): @@ -80,6 +81,84 @@ def find_max_step_folder(training_folder, step_override=None): return os.path.join(training_folder, f"step_{chosen_step}") +def is_safetensors_checkpoint(weights_path): + """Check if checkpoint is in the new safetensors format (has model/.hf_metadata/).""" + hf_metadata_path = os.path.join(weights_path, "model", ".hf_metadata") + return os.path.isdir(hf_metadata_path) + + +def convert_safetensors_to_hf(weights_path, hf_ckpt_path, tokenizer_path, hf_overrides=None): + """Convert safetensors checkpoint to HF format by reorganizing files.""" + model_dir = os.path.join(weights_path, "model") + hf_metadata_dir = os.path.join(model_dir, ".hf_metadata") + + os.makedirs(hf_ckpt_path, exist_ok=True) + + # Copy config.json from .hf_metadata + config_src = os.path.join(hf_metadata_dir, "config.json") + if os.path.exists(config_src): + shutil.copy2(config_src, os.path.join(hf_ckpt_path, "config.json")) + + # Copy generation_config.json if exists + gen_config_src = os.path.join(hf_metadata_dir, "generation_config.json") + if os.path.exists(gen_config_src): + shutil.copy2(gen_config_src, os.path.join(hf_ckpt_path, "generation_config.json")) + + # Find and copy safetensors files + safetensors_files = glob.glob(os.path.join(model_dir, "*.safetensors")) + if len(safetensors_files) == 1: + # Single shard - rename to model.safetensors + shutil.copy2(safetensors_files[0], os.path.join(hf_ckpt_path, "model.safetensors")) + else: + # Multiple shards - copy with standard naming and create index + import json + + weight_map = {} + for i, src_file in enumerate(sorted(safetensors_files), 1): + dst_name = f"model-{i:05d}-of-{len(safetensors_files):05d}.safetensors" + shutil.copy2(src_file, os.path.join(hf_ckpt_path, dst_name)) + + # Read keys from safetensors file to build weight_map + from safetensors import safe_open + + with safe_open(src_file, framework="pt") as f: + for key in f.keys(): + 
weight_map[key] = dst_name + + # Write index file + index = {"metadata": {}, "weight_map": weight_map} + with open(os.path.join(hf_ckpt_path, "model.safetensors.index.json"), "w") as f: + json.dump(index, f, indent=2) + + # Copy tokenizer files from the original model + tokenizer_files = [ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + "added_tokens.json", + "chat_template.jinja", + ] + for fname in tokenizer_files: + src = os.path.join(tokenizer_path, fname) + if os.path.exists(src): + shutil.copy2(src, os.path.join(hf_ckpt_path, fname)) + + # Apply hf_overrides to config.json if provided + if hf_overrides: + import json + + config_path = os.path.join(hf_ckpt_path, "config.json") + with open(config_path, "r") as f: + config = json.load(f) + config.update(hf_overrides) + with open(config_path, "w") as f: + json.dump(config, f, indent=2) + + return hf_ckpt_path + + def main(): """Main entry point.""" args = parse_args() @@ -122,14 +201,27 @@ def main(): if args.max_position_embeddings: hf_overrides["max_position_embeddings"] = args.max_position_embeddings - hf_ckpt = convert_dcp_to_hf( - dcp_ckpt_path=dcp_ckpt_path, - hf_ckpt_path=args.hf_ckpt_path, - model_name_or_path=model_name_or_path, - tokenizer_name_or_path=tokenizer_name_or_path, - overwrite=True, - hf_overrides=hf_overrides, - ) + # Check if checkpoint is in the new safetensors format + if is_safetensors_checkpoint(dcp_ckpt_path): + print("Detected safetensors checkpoint format, using direct conversion...") + hf_ckpt = convert_safetensors_to_hf( + weights_path=dcp_ckpt_path, + hf_ckpt_path=args.hf_ckpt_path, + tokenizer_path=tokenizer_name_or_path, + hf_overrides=hf_overrides if hf_overrides else None, + ) + else: + print("Detected DCP checkpoint format, using DCP conversion...") + from nemo_rl.utils.native_checkpoint import convert_dcp_to_hf + + hf_ckpt = convert_dcp_to_hf( + dcp_ckpt_path=dcp_ckpt_path, + hf_ckpt_path=args.hf_ckpt_path, + 
model_name_or_path=model_name_or_path, + tokenizer_name_or_path=tokenizer_name_or_path, + overwrite=True, + hf_overrides=hf_overrides, + ) print(f"Saved HF checkpoint to: {hf_ckpt}") From 0ee9c7763ebd6cdd3a18c0a92cb0fc1748529a71 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 18:20:26 -0800 Subject: [PATCH 10/17] Adjust test for warmup Signed-off-by: Igor Gitman --- tests/gpu-tests/test_train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/gpu-tests/test_train.py b/tests/gpu-tests/test_train.py index fa45c1c113..1a0b5efdad 100644 --- a/tests/gpu-tests/test_train.py +++ b/tests/gpu-tests/test_train.py @@ -147,10 +147,11 @@ def test_grpo_nemo_rl(backend): grpo_nemo_rl( ctx=wrap_arguments( "++data.prompt.prompt_config=qwen/math-cot " - "++grpo.max_num_steps=5 " + "++grpo.lr_warmup_steps=2 " "++grpo.num_prompts_per_step=2 " "++policy.max_total_sequence_length=256 " "++policy.dtensor_cfg.tensor_parallel_size=1 " + "++policy.megatron_cfg.scheduler.lr_warmup_iters=2 " "++checkpointing.save_period=2 " "++policy.train_global_batch_size=2 " "++policy.train_micro_batch_size=1 " From 6dcbd91678fb799be89f8c5e0f6f1786360cb133 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 20:58:47 -0800 Subject: [PATCH 11/17] Switch to a proper conversion script Signed-off-by: Igor Gitman --- .../training/nemo_rl/convert_dcp_to_hf.py | 85 +++------- .../nemo_rl/offline_hf_consolidation.py | 146 ++++++++++++++++++ 2 files changed, 171 insertions(+), 60 deletions(-) create mode 100644 nemo_skills/training/nemo_rl/offline_hf_consolidation.py diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py index da28b51cf2..b2522e22ac 100644 --- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py +++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py @@ -16,10 +16,11 @@ # and added logic to figure out max step automatically import argparse -import glob +import json import os import re 
-import shutil +import subprocess +import sys import yaml @@ -87,68 +88,32 @@ def is_safetensors_checkpoint(weights_path): return os.path.isdir(hf_metadata_path) -def convert_safetensors_to_hf(weights_path, hf_ckpt_path, tokenizer_path, hf_overrides=None): - """Convert safetensors checkpoint to HF format by reorganizing files.""" +def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrides=None): + """Convert safetensors checkpoint to HF format using offline_hf_consolidation.py.""" model_dir = os.path.join(weights_path, "model") - hf_metadata_dir = os.path.join(model_dir, ".hf_metadata") - os.makedirs(hf_ckpt_path, exist_ok=True) - - # Copy config.json from .hf_metadata - config_src = os.path.join(hf_metadata_dir, "config.json") - if os.path.exists(config_src): - shutil.copy2(config_src, os.path.join(hf_ckpt_path, "config.json")) - - # Copy generation_config.json if exists - gen_config_src = os.path.join(hf_metadata_dir, "generation_config.json") - if os.path.exists(gen_config_src): - shutil.copy2(gen_config_src, os.path.join(hf_ckpt_path, "generation_config.json")) - - # Find and copy safetensors files - safetensors_files = glob.glob(os.path.join(model_dir, "*.safetensors")) - if len(safetensors_files) == 1: - # Single shard - rename to model.safetensors - shutil.copy2(safetensors_files[0], os.path.join(hf_ckpt_path, "model.safetensors")) - else: - # Multiple shards - copy with standard naming and create index - import json - - weight_map = {} - for i, src_file in enumerate(sorted(safetensors_files), 1): - dst_name = f"model-{i:05d}-of-{len(safetensors_files):05d}.safetensors" - shutil.copy2(src_file, os.path.join(hf_ckpt_path, dst_name)) - - # Read keys from safetensors file to build weight_map - from safetensors import safe_open - - with safe_open(src_file, framework="pt") as f: - for key in f.keys(): - weight_map[key] = dst_name - - # Write index file - index = {"metadata": {}, "weight_map": weight_map} - with 
open(os.path.join(hf_ckpt_path, "model.safetensors.index.json"), "w") as f: - json.dump(index, f, indent=2) - - # Copy tokenizer files from the original model - tokenizer_files = [ - "tokenizer.json", - "tokenizer_config.json", - "special_tokens_map.json", - "vocab.json", - "merges.txt", - "added_tokens.json", - "chat_template.jinja", + # Get the path to the consolidation script (same directory as this script) + script_dir = os.path.dirname(os.path.abspath(__file__)) + consolidation_script = os.path.join(script_dir, "offline_hf_consolidation.py") + + # Run the consolidation script + # Reference: https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/offline_hf_consolidation.py + cmd = [ + sys.executable, + consolidation_script, + "--model-name", + model_name, + "--input-dir", + model_dir, + "--output-dir", + hf_ckpt_path, ] - for fname in tokenizer_files: - src = os.path.join(tokenizer_path, fname) - if os.path.exists(src): - shutil.copy2(src, os.path.join(hf_ckpt_path, fname)) + + print(f"Running consolidation: {' '.join(cmd)}") + subprocess.run(cmd, check=True) # Apply hf_overrides to config.json if provided if hf_overrides: - import json - config_path = os.path.join(hf_ckpt_path, "config.json") with open(config_path, "r") as f: config = json.load(f) @@ -203,11 +168,11 @@ def main(): # Check if checkpoint is in the new safetensors format if is_safetensors_checkpoint(dcp_ckpt_path): - print("Detected safetensors checkpoint format, using direct conversion...") + print("Detected safetensors checkpoint format, using offline consolidation...") hf_ckpt = convert_safetensors_to_hf( weights_path=dcp_ckpt_path, hf_ckpt_path=args.hf_ckpt_path, - tokenizer_path=tokenizer_name_or_path, + model_name=model_name_or_path, hf_overrides=hf_overrides if hf_overrides else None, ) else: diff --git a/nemo_skills/training/nemo_rl/offline_hf_consolidation.py b/nemo_skills/training/nemo_rl/offline_hf_consolidation.py new file mode 100644 index 0000000000..5fc1330564 --- /dev/null +++ 
b/nemo_skills/training/nemo_rl/offline_hf_consolidation.py @@ -0,0 +1,146 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script can be used to consolidate sharded HF safetensors checkpoints +# to the consolidated format. + +# Example model directory structure: +# model/ +# ├── shard-00001-model-00001-of-00001.safetensors +# └── shard-00002-model-00001-of-00001.safetensors +# ... + +# This script works on both single and multiple workers: +# Example usage on 2 GPUs: +# torchrun --nproc-per-node=2 tools/offline_hf_consolidation.py --model-name meta-llama/Llama-3.2-1B --input-dir checkpoints/epoch_0_step_19/model/ --output-dir checkpoints/epoch_0_step_19/model/consolidated/ +# +# Example usage on 1 GPU: +# python tools/offline_hf_consolidation.py --model-name meta-llama/Llama-3.2-1B --input-dir checkpoints/epoch_0_step_19/model/ --output-dir checkpoints/epoch_0_step_19/model/consolidated/ + +# copied from https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/offline_hf_consolidation.py + +import argparse +import json +import os +import shutil + +import torch +import torch.distributed as dist +from nemo_automodel.components.checkpoint._backports.consolidate_hf_safetensors import ( + consolidate_safetensors_files_on_every_rank, +) +from nemo_automodel.components.distributed.init_utils import ( + get_rank_safe, + get_world_size_safe, + initialize_distributed, +) + + +def 
copy_metadata_files(input_dir, output_dir): + """ + Copy the metadata files over from the input directory to the output directory. + """ + for item_name in os.listdir(input_dir): + if item_name == "fqn_to_file_index_mapping.json": + continue # this is saved by the consolidation step + src_path = os.path.join(input_dir, item_name) + dst_path = os.path.join(output_dir, item_name) + shutil.move(src_path, dst_path) + shutil.rmtree(input_dir, ignore_errors=True) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Consolidate sharded HF safetensors checkpoints into consolidated files, " + "preserving original sharding layout where possible." + ) + ) + + parser.add_argument( + "--model-name", + "-m", + required=True, + help=( + "Hugging Face repo id (e.g. meta-llama/Llama-3.2-1B) or absolute path to a HF snapshot directory. " + "Used as reference to copy metadata and derive FQN->file index mapping." + ), + ) + parser.add_argument( + "--input-dir", + "-i", + required=True, + help="Directory containing sharded safetensors files to consolidate.", + ) + parser.add_argument( + "--output-dir", + "-o", + required=True, + help="Directory where consolidated safetensors and metadata will be written.", + ) + parser.add_argument( + "--num-threads", + type=int, + default=5, + help="Number of threads for writing consolidated data (default: 5).", + ) + parser.add_argument( + "--backend", + choices=["auto", "nccl", "gloo"], + default="auto", + help="Distributed backend to initialize (default: auto).", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + backend = args.backend + if backend == "auto": + backend = "nccl" if torch.cuda.device_count() > 0 else "gloo" + initialize_distributed(backend) + + os.makedirs(args.output_dir, exist_ok=True) + + if not os.path.exists(args.input_dir): + raise FileNotFoundError("Could not locate the input directory. 
Pass an absolute path to the input directory.") + + hf_metadata_dir = os.path.join(args.input_dir, ".hf_metadata") + + if not os.path.exists(hf_metadata_dir) and not os.path.isdir(hf_metadata_dir): + raise FileNotFoundError("Expected to find the .hf_metadata directory in the input directory.") + + with open(os.path.join(hf_metadata_dir, "fqn_to_file_index_mapping.json"), "r") as f: + fqn_to_index_mapping = json.load(f) + + consolidate_safetensors_files_on_every_rank( + args.input_dir, + args.output_dir, + fqn_to_index_mapping, + num_threads=args.num_threads, + ) + + if get_world_size_safe() > 1: + dist.barrier() + + if get_rank_safe() == 0: + copy_metadata_files(hf_metadata_dir, args.output_dir) + + if get_world_size_safe() > 1: + dist.barrier() + + +if __name__ == "__main__": + main() From 4f4881d999d9b0aa1a32200f46a66c25e4017e62 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 21:15:02 -0800 Subject: [PATCH 12/17] Remove unused parameter Signed-off-by: Igor Gitman --- tests/gpu-tests/test_train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/gpu-tests/test_train.py b/tests/gpu-tests/test_train.py index 1a0b5efdad..845bc11eff 100644 --- a/tests/gpu-tests/test_train.py +++ b/tests/gpu-tests/test_train.py @@ -147,7 +147,6 @@ def test_grpo_nemo_rl(backend): grpo_nemo_rl( ctx=wrap_arguments( "++data.prompt.prompt_config=qwen/math-cot " - "++grpo.lr_warmup_steps=2 " "++grpo.num_prompts_per_step=2 " "++policy.max_total_sequence_length=256 " "++policy.dtensor_cfg.tensor_parallel_size=1 " From 265228eb84d068ae3cb9da2b32fef69cbdd28266 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 21:37:29 -0800 Subject: [PATCH 13/17] Fix for import Signed-off-by: Igor Gitman --- nemo_skills/training/nemo_rl/convert_dcp_to_hf.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py index b2522e22ac..44b2996947 100644 --- 
a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py +++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py @@ -20,7 +20,6 @@ import os import re import subprocess -import sys import yaml @@ -96,10 +95,15 @@ def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrid script_dir = os.path.dirname(os.path.abspath(__file__)) consolidation_script = os.path.join(script_dir, "offline_hf_consolidation.py") - # Run the consolidation script + # Run the consolidation script using uv with the automodel extra to get nemo_automodel # Reference: https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/offline_hf_consolidation.py cmd = [ - sys.executable, + "uv", + "run", + "--active", + "--extra", + "automodel", + "python", consolidation_script, "--model-name", model_name, From 4192d30389f749e60481182ba9560f0b7c4c7785 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 22:06:53 -0800 Subject: [PATCH 14/17] Add extra automodel Signed-off-by: Igor Gitman --- nemo_skills/pipeline/nemo_rl/grpo.py | 2 +- nemo_skills/pipeline/nemo_rl/sft.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_skills/pipeline/nemo_rl/grpo.py b/nemo_skills/pipeline/nemo_rl/grpo.py index 1cbd26c888..eeeff72474 100644 --- a/nemo_skills/pipeline/nemo_rl/grpo.py +++ b/nemo_skills/pipeline/nemo_rl/grpo.py @@ -192,7 +192,7 @@ def get_training_cmd( def get_checkpoint_convert_cmd(output_dir, final_hf_path, step, backend, max_position_embeddings=None): cmd = "export PYTHONPATH=$PYTHONPATH:/nemo_run/code && export UV_PROJECT=/opt/NeMo-RL && cd /nemo_run/code && " if backend == "fsdp": - cmd += "uv run --active python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf " + cmd += "uv run --extra automodel python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf " elif backend == "megatron": cmd += "uv run --extra mcore python -m nemo_skills.training.nemo_rl.convert_megatron_to_hf " else: diff --git a/nemo_skills/pipeline/nemo_rl/sft.py 
b/nemo_skills/pipeline/nemo_rl/sft.py index 87d12fbc8e..69a3e98408 100644 --- a/nemo_skills/pipeline/nemo_rl/sft.py +++ b/nemo_skills/pipeline/nemo_rl/sft.py @@ -174,7 +174,7 @@ def get_training_cmd( def get_checkpoint_convert_cmd(output_dir, final_hf_path, step, backend, max_position_embeddings=None): cmd = "export PYTHONPATH=$PYTHONPATH:/nemo_run/code && export UV_PROJECT=/opt/NeMo-RL && cd /nemo_run/code && " if backend == "fsdp": - cmd += "uv run --active python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf " + cmd += "uv run --extra automodel python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf " elif backend == "megatron": cmd += "uv run --extra mcore python -m nemo_skills.training.nemo_rl.convert_megatron_to_hf " else: From 9804714300d9e0c0242b24254b8c3be9664d1ff0 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 27 Jan 2026 22:32:02 -0800 Subject: [PATCH 15/17] Add copy for tokenizer files Signed-off-by: Igor Gitman --- .../training/nemo_rl/convert_dcp_to_hf.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py index 44b2996947..77cc21c6fe 100644 --- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py +++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py @@ -19,6 +19,7 @@ import json import os import re +import shutil import subprocess import yaml @@ -87,7 +88,37 @@ def is_safetensors_checkpoint(weights_path): return os.path.isdir(hf_metadata_path) -def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrides=None): +def copy_tokenizer_files(model_name, hf_ckpt_path): + """Download and copy tokenizer files from HuggingFace to the HF checkpoint directory. 
+ + Args: + model_name: HuggingFace model name to download tokenizer from + hf_ckpt_path: Path to the HF checkpoint directory + """ + from huggingface_hub import hf_hub_download, list_repo_files + + # Common tokenizer files that need to be copied + tokenizer_files = [ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + "tokenizer.model", # For SentencePiece-based tokenizers + "added_tokens.json", + ] + + print(f"Downloading tokenizer files from {model_name}...") + repo_files = list_repo_files(model_name) + for filename in tokenizer_files: + if filename in repo_files: + downloaded_path = hf_hub_download(model_name, filename) + dst_path = os.path.join(hf_ckpt_path, filename) + shutil.copy2(downloaded_path, dst_path) + print(f"Copied {filename}") + + +def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_name, hf_overrides=None): """Convert safetensors checkpoint to HF format using offline_hf_consolidation.py.""" model_dir = os.path.join(weights_path, "model") @@ -116,6 +147,9 @@ def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrid print(f"Running consolidation: {' '.join(cmd)}") subprocess.run(cmd, check=True) + # Copy tokenizer files (not handled by offline consolidation) + copy_tokenizer_files(tokenizer_name, hf_ckpt_path) + # Apply hf_overrides to config.json if provided if hf_overrides: config_path = os.path.join(hf_ckpt_path, "config.json") @@ -177,6 +211,7 @@ def main(): weights_path=dcp_ckpt_path, hf_ckpt_path=args.hf_ckpt_path, model_name=model_name_or_path, + tokenizer_name=tokenizer_name_or_path, hf_overrides=hf_overrides if hf_overrides else None, ) else: From 3f66a6fba571ef630fd6578590fae6de6f5b58f1 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 28 Jan 2026 21:57:27 -0800 Subject: [PATCH 16/17] Fix tokenizer files logic Signed-off-by: Igor Gitman --- .../training/nemo_rl/convert_dcp_to_hf.py | 33 ++++++++----------- 1 file changed, 14 
insertions(+), 19 deletions(-) diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py index 77cc21c6fe..3405bf2a39 100644 --- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py +++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py @@ -88,37 +88,30 @@ def is_safetensors_checkpoint(weights_path): return os.path.isdir(hf_metadata_path) -def copy_tokenizer_files(model_name, hf_ckpt_path): - """Download and copy tokenizer files from HuggingFace to the HF checkpoint directory. +def copy_tokenizer_files(tokenizer_path, hf_ckpt_path): + """Copy tokenizer files from the original model to the HF checkpoint directory. Args: - model_name: HuggingFace model name to download tokenizer from + tokenizer_path: Path to directory containing tokenizer files hf_ckpt_path: Path to the HF checkpoint directory """ - from huggingface_hub import hf_hub_download, list_repo_files - - # Common tokenizer files that need to be copied tokenizer_files = [ "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.json", "merges.txt", - "tokenizer.model", # For SentencePiece-based tokenizers "added_tokens.json", + "chat_template.jinja", ] - - print(f"Downloading tokenizer files from {model_name}...") - repo_files = list_repo_files(model_name) - for filename in tokenizer_files: - if filename in repo_files: - downloaded_path = hf_hub_download(model_name, filename) - dst_path = os.path.join(hf_ckpt_path, filename) - shutil.copy2(downloaded_path, dst_path) - print(f"Copied {filename}") + for fname in tokenizer_files: + src = os.path.join(tokenizer_path, fname) + if os.path.exists(src): + shutil.copy2(src, os.path.join(hf_ckpt_path, fname)) + print(f"Copied {fname}") -def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_name, hf_overrides=None): +def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_path, hf_overrides=None): """Convert safetensors checkpoint to HF format 
using offline_hf_consolidation.py.""" model_dir = os.path.join(weights_path, "model") @@ -148,7 +141,9 @@ def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_ subprocess.run(cmd, check=True) # Copy tokenizer files (not handled by offline consolidation) - copy_tokenizer_files(tokenizer_name, hf_ckpt_path) + # TODO: this will fail if config["policy"]["model_name"] isn't a path, but that's not common and we should + # anyway remove this logic when it's properly handled in nemo-rl + copy_tokenizer_files(tokenizer_path, hf_ckpt_path) # Apply hf_overrides to config.json if provided if hf_overrides: @@ -211,7 +206,7 @@ def main(): weights_path=dcp_ckpt_path, hf_ckpt_path=args.hf_ckpt_path, model_name=model_name_or_path, - tokenizer_name=tokenizer_name_or_path, + tokenizer_path=tokenizer_name_or_path, hf_overrides=hf_overrides if hf_overrides else None, ) else: From a9a80f5876dd6b86ee629c4b1c2b18ba7bf60d5b Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Thu, 29 Jan 2026 10:37:01 -0800 Subject: [PATCH 17/17] Use 10 samples for bfcl in gpu ci Signed-off-by: Igor Gitman --- tests/gpu-tests/test_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py index 1988dd961e..15e5789f2a 100644 --- a/tests/gpu-tests/test_eval.py +++ b/tests/gpu-tests/test_eval.py @@ -179,7 +179,8 @@ def test_aaa_prepare_and_eval_all_datasets(): # It also needs a special eval arg # TODO: after summarize results works natively with eval groups, we can merge these # TODO: enable bfcl_v4 after figuring out why it's broken in this setup - bfcl_eval_args = "++eval_config.partial_eval=true ++model_name=Qwen/Qwen3-1.7B-FC" + # setting 10 samples as bfcl is brittle when using only 2 + bfcl_eval_args = "++eval_config.partial_eval=true ++model_name=Qwen/Qwen3-1.7B-FC ++max_samples=10" eval( ctx=wrap_arguments(f"{common_ctx} {bfcl_eval_args}"), output_dir=output_dir,