From 14e9db787a7852546cb6a87c06ae75a3ba80fa13 Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Tue, 9 Dec 2025 15:30:44 -0800
Subject: [PATCH 01/17] Update nemo-rl to latest

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 dockerfiles/Dockerfile.nemo-rl                |  75 +++++++--
 .../training/nemo_rl/configs/grpo.yaml        | 148 ++++++++++--------
 nemo_skills/training/nemo_rl/configs/sft.yaml |  34 ++--
 nemo_skills/utils.py                          |   7 +-
 4 files changed, 165 insertions(+), 99 deletions(-)

diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl
index b339b14eea..ee00953949 100644
--- a/dockerfiles/Dockerfile.nemo-rl
+++ b/dockerfiles/Dockerfile.nemo-rl
@@ -2,7 +2,16 @@
 # TODO: from next update try to re-use their dockerfile as is as they support specifying the commit
 
 ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
+
+FROM scratch AS nemo-rl
+
+ARG NRL_GIT_REF=main
+ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /
+
+
 FROM ${BASE_IMAGE} AS base
+# An environment variable to indicate that we are in a container.
+ENV NRL_CONTAINER=1
 
 # It is more convenient for users to run as root
 USER root
@@ -34,7 +43,7 @@ rm -rf /var/lib/apt/lists/*
 EOF
 
 # Install uv and python
-ARG UV_VERSION=0.7.2
+ARG UV_VERSION=0.9.7
 ARG PYTHON_VERSION=3.12
 ENV PATH="/root/.local/bin:$PATH"
 RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
@@ -43,36 +52,44 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
 # Disable usage stats by default for users who are sensitive to sharing usage.
 # Users are encouraged to enable if the wish.
 ENV RAY_USAGE_STATS_ENABLED=0
+# After ray>=2.47, this feature is enabled by default which creates uv venvs for any py_executable starting with `uv run`.
+# There are severe contention and performance issues with this enabled, considering our dependencies are so large and occasionally
+# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
+ENV RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
 ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
 
 
 FROM base AS hermetic
 
-ARG NEMO_RL_COMMIT
-ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-85eeb8d059b0249cace427dd5dec9573107be224}
-
-RUN git clone https://github.com/NVIDIA-NeMo/RL.git /opt/NeMo-RL && cd /opt/NeMo-RL && git checkout ${NEMO_RL_COMMIT} && git submodule update --init --recursive
-
 WORKDIR /opt/NeMo-RL
 
 # Variables to control the build of TE. If there are issues with parallelization, consider
 # setting these to 1.
 ARG MAX_JOBS
 ARG NVTE_BUILD_THREADS_PER_JOB
+# Only use for custom vllm installs. Learn more at https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/use-custom-vllm.md
+ARG BUILD_CUSTOM_VLLM
 
 ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
 ENV UV_LINK_MODE=copy
 
-# This step is to warm the uv cache with flash-attn without invalidating it due to COPY layers
-# This layer has to be manually updated
-RUN <<"EOF" bash -exu
-uv venv ${UV_PROJECT_ENVIRONMENT}
+# Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set)
+ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
 
-VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode symlink setuptools torch==2.7.0 psutil ninja --torch-backend=cu128
-VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode symlink flash-attn==2.7.4.post1 --no-build-isolation
-EOF
+# First copy only the dependency files
+COPY --from=nemo-rl pyproject.toml uv.lock ./
+# Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist.
+COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/
+COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh
+COPY --from=nemo-rl --link research/ ./research/
+COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/
 
 RUN <<"EOF" bash -exu
+uv venv --seed
+if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then
+    bash tools/build-custom-vllm.sh
+    source 3rdparty/vllm/nemo-rl.env
+fi
 # uv sync has a more reliable resolver than simple uv pip install which can fail
 
 # Sync each training + inference backend one at a time (since they may conflict)
@@ -83,19 +100,45 @@ RUN <<"EOF" bash -exu
 uv sync --link-mode symlink --locked --no-install-project
 uv sync --link-mode symlink --locked --extra vllm --no-install-project
 uv sync --link-mode symlink --locked --extra mcore --no-install-project
+uv sync --link-mode symlink --locked --extra automodel --no-install-project
 uv sync --link-mode symlink --locked --all-groups --no-install-project
 EOF
 
-RUN VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode=symlink /opt/NeMo-RL/3rdparty/Megatron-LM-workspace/Megatron-LM
 ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
 ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
 
+WORKDIR /opt/NeMo-RL
 
+FROM hermetic AS release
+
+ARG NEMO_RL_COMMIT
+ARG NVIDIA_BUILD_ID
+ARG NVIDIA_BUILD_REF
+ARG RC_DATE=00.00
+ARG TARGETARCH
+ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9}
+ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>}
+ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
+LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
+LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
 
-WORKDIR /opt/NeMo-RL
 ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
 
-# Copy in source and prefetch all virtual environments
+# Copy in source from build context (defaults to cloned repo, can be overridden)
+# Exclude pyproject.toml and uv.lock since those may be altered by build-custom-vllm.sh
+COPY --from=nemo-rl --exclude=pyproject.toml --exclude=uv.lock . /opt/NeMo-RL
+# Unshallow the repo to get the full history (in the case it was from the scratch layer).
+# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
+# so do a quick check before trying to unshallow.
+RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
 RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py
 
+# Generate container fingerprint for frozen environment support
+# Store outside /opt/NeMo-RL to avoid being overwritten by user mounts
+RUN python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint
+
+# NOTICES.txt file points to where the OSS source code is archived
+RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo-rl/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \
+    echo "For further inquiries or assistance, contact us at oss-requests@nvidia.com" >> NOTICES.txt
+
 RUN git clone https://github.com/NVIDIA-NeMo/Skills.git /opt/NeMo-Skills && cd /opt/NeMo-Skills && uv pip install .
diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml
index 53428ca873..187c0ff686 100644
--- a/nemo_skills/training/nemo_rl/configs/grpo.yaml
+++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml
@@ -1,5 +1,4 @@
-# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/ab1b638a499308caea022648daaf6994d390cbde/examples/configs/grpo_math_1B.yaml
-
+# Copied and edited from https://github.com/NVIDIA-NeMo/RL/blob/64ab08df3edf25131959fc474b44ed5e36a1600b/examples/configs/grpo_math_1B.yaml
 # GRPO Algorithm Configuration
 grpo:
   num_prompts_per_step: 32
@@ -9,7 +8,7 @@ grpo:
   max_num_steps: 1000000
   normalize_rewards: true
   use_leave_one_out_baseline: true
-  val_period: 0
+  val_period: 0 # disabled
   val_at_start: false
   overlong_filtering: false
   max_val_samples: 256
@@ -34,9 +33,16 @@ grpo:
     enabled: false # Set to true to enable async training mode
     # Max age (in training steps) for trajectories used in training
     max_trajectory_age_steps: 1
+    in_flight_weight_updates: false # Set to true to enable in-flight weight updates
+    recompute_kv_cache_after_weight_updates: false # Set to true to recompute kv cache after in-flight-weight-updates
 
 loss_fn:
   reference_policy_kl_penalty: 0.01
+  # Can be set to k1, k2, k3
+  # For more details, see http://joschu.net/blog/kl-approx.html
+  reference_policy_kl_type: "k3"
+  kl_input_clamp_value: 20.0
+  kl_output_clamp_value: 10.0
   ratio_clip_min: 0.2
   ratio_clip_max: 0.2
   ratio_clip_c: null
@@ -48,28 +54,31 @@ loss_fn:
   truncated_importance_sampling_ratio: null
   sequence_level_importance_ratios: false
   token_level_loss: true
+  force_on_policy_ratio: false  # Set to true to force ratio=1.0 (requires train_global_batch_size == num_prompts_per_step * num_generations_per_prompt)
 
 checkpointing:
   enabled: true
   checkpoint_dir: "results/grpo"
-  metric_name: "val_reward"
+  metric_name: "val:reward" # one of "val:" or "train:" followed by the metric name
   higher_is_better: true
   keep_top_k: 50
   save_period: 10
   checkpoint_must_save_by: null
+  model_save_format: "safetensors"
+  save_consolidated: false
 
 policy:
   model_name: ???
   tokenizer:
     name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
     chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
+  hf_config_overrides: {}
   train_global_batch_size: 512
   train_micro_batch_size: 4
   generation_batch_size: 32 # Only used when generating using HF backend
   logprob_batch_size: 4
   max_total_sequence_length: 512
   precision: "bfloat16"
-  fsdp_offload_enabled: false
   activation_checkpointing_enabled: false
   refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB
   tensor_model_parallel_size: 1
@@ -78,8 +87,11 @@ policy:
   lr: 1e-6
   weight_decay: 0.01
   min_lr: 1e-6
+  logprob_chunk_size: null
+  offload_optimizer_for_logprob: false # Only useful for non-colocated generation since colocated generation will always offload optimizer to cuda before refit
 
   dtensor_cfg:
+    _v2: true
     enabled: true
     cpu_offload: False
     sequence_parallel: false
@@ -88,58 +100,9 @@ policy:
     context_parallel_size: ${policy.context_parallel_size}
     custom_parallel_plan: null
 
-  # dynamic_batching improves performance by ensuring logprob and training microbatches
-  # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length
-  # responses are sorted by sequence length and bucketed into microbatches with a total
-  # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the
-  # training and logprob stages respectively.
-  # We disable dynamic batching for Megatron as it is incompatible with Pipeline parallelism.
-  # Instead, we use sequence packing.
-  dynamic_batching:
-    enabled: False
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
-
-  sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-  #If this value is set to null, it will be automatically assigned in the code.
-  make_sequence_length_divisible_by: null
-  max_grad_norm: 1.0
-
-  optimizer:
-    name: "torch.optim.AdamW"
-    kwargs:
-      lr: ${policy.lr}
-      weight_decay: ${policy.weight_decay}
-      betas: [0.9, 0.999]
-      eps: 1e-8
-      # when using Dtensor, we need to set foreach
-      # and fused to False
-      foreach: False
-      fused: False
-
-  scheduler:
-  - name: "torch.optim.lr_scheduler.LinearLR"
-    kwargs:
-      start_factor: 1.0
-      end_factor: 1.0
-      total_iters: 1   # must be >=1, here it keeps LR constant
-  - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
-    kwargs:
-      T_max: ${grpo.max_num_steps}      # total training steps
-      eta_min: ${policy.lr}   # set min_lr = initial_lr -> constant schedule
-  - milestones: [0]  # required to avoid config errors
-
-
-
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs
+    empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
     converter_type: "Qwen2ForCausalLM"
     tensor_model_parallel_size: ${policy.tensor_model_parallel_size}
@@ -156,9 +119,12 @@ policy:
     moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
     moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
     moe_permute_fusion: false
-    bias_activation_fusion: True
     #gives ~20% training perf speedup with sequence packing
     apply_rope_fusion: True
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+    bias_activation_fusion: True
+    defer_fp32_logits: False
+    moe_per_layer_logging: False
 
     optimizer:
       optimizer: "adam"
@@ -204,7 +170,53 @@ policy:
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"
 
+    fp8_cfg: null
+
+    env_vars: null
+
+  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
+  # for more details on dynamic batching and sequence packing.
+  dynamic_batching:
+    enabled: False
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    sequence_length_round: 64
+
+  sequence_packing:
+    enabled: True
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+
+  # makes the training sequence length divisible by the tensor parallel size
+  # this is useful for sequence parallel training
+  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
+  max_grad_norm: 1.0
+
+  optimizer:
+    name: "torch.optim.AdamW"
+    kwargs:
+      lr: ${policy.lr}
+      weight_decay: ${policy.weight_decay}
+      betas: [0.9, 0.999]
+      eps: 1e-8
+      # when using Dtensor, we need to set foreach
+      # and fused to False
+      foreach: False
+      fused: False
 
+  scheduler:
+    - name: "torch.optim.lr_scheduler.LinearLR"
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1.0
+        total_iters: 10
+    - name: "torch.optim.lr_scheduler.ConstantLR"
+      kwargs:
+        factor: 1.0
+        total_iters: 10000000000
+    - milestones: [10]
 
   generation:
     backend: "vllm"
@@ -215,18 +227,24 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
-      async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447.
+      async_engine: false
       precision: ${policy.precision}
+      kv_cache_dtype: "auto"
       tensor_parallel_size: 1
       pipeline_parallel_size: 1
       expert_parallel_size: 1  # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP
       gpu_memory_utilization: 0.6
       max_model_len: ${policy.max_total_sequence_length}
-      enable_expert_parallel: false
-      enforce_eager: True # Set as True to avoid vllm bug
-      # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit
-      # For Gemma models, we need to use "auto" due to a vllm bug
-      load_format: dummy
+      # when enforce_eager is False, it is optional to set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy,
+      # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
+      # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
+      enforce_eager: False
+      use_deep_gemm: False
+      num_last_layers_in_bf16: 0
+      num_first_layers_in_bf16: 0
+      enable_vllm_metrics_logger: true # Set to true to enable vLLM internal metrics logger, turn off for better performance
+      vllm_metrics_logger_interval: 0.5 # Interval in seconds to collect vLLM logger metrics
+    vllm_kwargs: {}
     colocated:
       # true: generation shares training GPUs
       # false: uses dedicated generation resources
@@ -258,9 +276,9 @@ logger:
   num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
   wandb_enabled: false
   tensorboard_enabled: false
-  mlflow_enabled: false
+  mlflow_enabled: false  # Disable MLflow logging
   swanlab_enabled: false # Disable SwanLab logging
-  monitor_gpus: false  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
     project: "grpo-dev"
     name: "grpo-dev-logger"
@@ -275,5 +293,3 @@ logger:
 cluster:
   gpus_per_node: 1
   num_nodes: 1
-
-checkpoint_must_save_by: null
diff --git a/nemo_skills/training/nemo_rl/configs/sft.yaml b/nemo_skills/training/nemo_rl/configs/sft.yaml
index 20cc35ff8d..6781241939 100644
--- a/nemo_skills/training/nemo_rl/configs/sft.yaml
+++ b/nemo_skills/training/nemo_rl/configs/sft.yaml
@@ -16,7 +16,7 @@ sft:
 checkpointing:
   enabled: true
   checkpoint_dir: "results/sft"
-  metric_name: "val_loss"
+  metric_name: "val:val_loss" # one of "val:" or "train:" followed by the metric name
   higher_is_better: false
   keep_top_k: 50
   save_period: 100
@@ -33,8 +33,7 @@ policy:
   train_micro_batch_size: 1
   max_total_sequence_length: 4096
   precision: "bfloat16"
-  fsdp_offload_enabled: false
-  activation_checkpointing_enabled: false
+  offload_optimizer_for_logprob: false
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
   context_parallel_size: 1
@@ -43,9 +42,9 @@ policy:
   weight_decay: 0.01
   min_lr: 1e-6
 
-
   dtensor_cfg:
     enabled: true
+    env_vars: {}
     cpu_offload: False
     sequence_parallel: ${policy.sequence_parallel}
     activation_checkpointing: false
@@ -53,9 +52,10 @@ policy:
     context_parallel_size: ${policy.context_parallel_size}
     custom_parallel_plan: null
 
-
   megatron_cfg:
     enabled: false
+    env_vars: {}
+    empty_unused_memory_level: 1
     activation_checkpointing: false
     tensor_model_parallel_size: ${policy.tensor_model_parallel_size}
     expert_tensor_parallel_size: 1
@@ -72,10 +72,12 @@ policy:
     moe_router_bias_update_rate: 1e-3
     moe_permute_fusion: false
     #gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: True
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
     bias_activation_fusion: True
-    apply_rope_fusion: True  # Only used if position_embedding_type=rope
     layernorm_epsilon: 1e-6
-    empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs
+    defer_fp32_logits: False
+    moe_per_layer_logging: False
 
     optimizer:
       optimizer: "adam"
@@ -98,7 +100,7 @@ policy:
       use_distributed_optimizer: true
       use_precision_aware_optimizer: true
 
-      # clip_grad: ${policy.max_grad_norm}
+      clip_grad: ${policy.max_grad_norm}
 
       # optimizer cpu offload
       optimizer_cpu_offload: false
@@ -117,22 +119,24 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       data_parallel_sharding_strategy: "optim_grads_params"
+      use_custom_fsdp: false
 
   dynamic_batching:
     enabled: false
-
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    sequence_length_round: 64
 
   sequence_packing:
     enabled: True
     train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
     algorithm: "modified_first_fit_decreasing"
     sequence_length_round: 64
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-  #If this value is set to null, it will be automatically assigned in the code.
-  make_sequence_length_divisible_by: null
-  max_grad_norm: null
+
+  # makes the training sequence length divisible by the tensor parallel size
+  # this is useful for sequence parallel training
+  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
+  max_grad_norm: 0.0 # Zero means no clipping
 
   optimizer:
     name: "torch.optim.AdamW"
@@ -155,7 +159,7 @@ policy:
   - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
     kwargs:
       T_max: ${sft.max_num_steps}      # total training steps
-      eta_min: ${policy.lr}   # set min_lr = initial_lr -> constant schedule
+      eta_min: ${policy.min_lr}   # floor of the cosine decay; set policy.min_lr equal to policy.lr for a constant schedule
   - milestones: [0]  # required to avoid config errors
 
 
diff --git a/nemo_skills/utils.py b/nemo_skills/utils.py
index 18e6f63c73..db1956e9a0 100644
--- a/nemo_skills/utils.py
+++ b/nemo_skills/utils.py
@@ -25,8 +25,6 @@
 from pathlib import Path
 from typing import Any, Callable, List, Optional, Union
 
-import fire
-from fire import decorators as fire_decorators
 from rich.logging import RichHandler
 
 # isort: off
@@ -507,6 +505,11 @@ def check_no_extra_args_fire():
         RuntimeError: If the function name is not found in the calling context.
         ValueError: If extra arguments are found that are not accepted by the function.
     """
+
+    # Import lazily: nemo-rl async GRPO data processing imports this module from its vLLM uv venv, where fire is not installed.
+    import fire
+    from fire import decorators as fire_decorators
+
     args = sys.argv[1:]
     # Extract the function name and its arguments from the command-line arguments
     function_name = args[0]

From ee566dc21cf905a2f1aa294dbffc5c8bdf5f665a Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Tue, 9 Dec 2025 18:07:46 -0800
Subject: [PATCH 02/17] Update start_grpo

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 nemo_skills/training/nemo_rl/start_grpo.py | 73 +++++++++++++++++-----
 1 file changed, 59 insertions(+), 14 deletions(-)

diff --git a/nemo_skills/training/nemo_rl/start_grpo.py b/nemo_skills/training/nemo_rl/start_grpo.py
index afe7d3f7be..82e34b2dda 100644
--- a/nemo_skills/training/nemo_rl/start_grpo.py
+++ b/nemo_skills/training/nemo_rl/start_grpo.py
@@ -328,20 +328,65 @@ def main() -> None:
         master_config,
     ) = setup(config, tokenizer, dataset, val_dataset)
 
-    grpo_train(
-        policy,
-        policy_generation,
-        dataloader,
-        val_dataloader,
-        tokenizer,
-        loss_fn,
-        task_to_env,
-        val_task_to_env,
-        logger,
-        checkpointer,
-        grpo_state,
-        master_config,
-    )
+    # Check if async mode is enabled
+    if "async_grpo" in config["grpo"] and config["grpo"]["async_grpo"]["enabled"]:
+        # Async GRPO does not support dynamic sampling, reward scaling, or reward shaping (DAPO features)
+        unsupported_features = [
+            "use_dynamic_sampling",
+            "reward_scaling",
+            "reward_shaping",
+        ]
+
+        for feature in unsupported_features:
+            if feature not in config["grpo"]:
+                continue
+
+            if feature == "use_dynamic_sampling":
+                if config["grpo"][feature]:
+                    raise NotImplementedError(f"{feature} is not supported with async GRPO")
+            else:
+                if config["grpo"][feature]["enabled"]:
+                    raise NotImplementedError(f"{feature} is not supported with async GRPO")
+
+        from nemo_rl.algorithms.grpo import async_grpo_train
+
+        print("🚀 Running async GRPO training")
+
+        async_config = config["grpo"]["async_grpo"]
+        # Run async GRPO training
+        async_grpo_train(
+            policy=policy,
+            policy_generation=policy_generation,
+            dataloader=dataloader,
+            val_dataloader=val_dataloader,
+            tokenizer=tokenizer,
+            loss_fn=loss_fn,
+            task_to_env=task_to_env,
+            val_task_to_env=val_task_to_env,
+            logger=logger,
+            checkpointer=checkpointer,
+            grpo_save_state=grpo_state,
+            master_config=master_config,
+            max_trajectory_age_steps=async_config["max_trajectory_age_steps"],
+        )
+    else:
+        print("🚀 Running synchronous GRPO training")
+
+        # Run standard GRPO training
+        grpo_train(
+            policy,
+            policy_generation,
+            dataloader,
+            val_dataloader,
+            tokenizer,
+            loss_fn,
+            task_to_env,
+            val_task_to_env,
+            logger,
+            checkpointer,
+            grpo_state,
+            master_config,
+        )
 
 
 if __name__ == "__main__":

From 33cf653db2545e7c697cacb3f2c07ad382136540 Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Wed, 10 Dec 2025 11:17:38 -0800
Subject: [PATCH 03/17] legacy configs for nemo-rl

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 .../nemo_rl/configs/grpo-legacy-85eeb8d.yaml  | 279 ++++++++++++++++++
 .../nemo_rl/configs/sft-legacy-85eeb8d.yaml   | 197 +++++++++++++
 2 files changed, 476 insertions(+)
 create mode 100644 nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml
 create mode 100644 nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml

diff --git a/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml
new file mode 100644
index 0000000000..53428ca873
--- /dev/null
+++ b/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml
@@ -0,0 +1,279 @@
+# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/ab1b638a499308caea022648daaf6994d390cbde/examples/configs/grpo_math_1B.yaml
+
+# GRPO Algorithm Configuration
+grpo:
+  num_prompts_per_step: 32
+  num_generations_per_prompt: 16
+  max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
+  max_num_epochs: 1
+  max_num_steps: 1000000
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 0
+  val_at_start: false
+  overlong_filtering: false
+  max_val_samples: 256
+  val_batch_size: 256
+  seed: 42
+  use_dynamic_sampling: false
+  dynamic_sampling_max_gen_batches: 10
+  batch_multiplier: 1
+  reward_shaping:
+    enabled: false
+    overlong_buffer_length: 128
+    overlong_buffer_penalty: 1
+    max_response_length: ${policy.max_total_sequence_length}
+  reward_scaling:
+    enabled: false
+    source_min: 0.0
+    source_max: 1.0
+    target_min: 0.0
+    target_max: 1.0
+
+  async_grpo:
+    enabled: false # Set to true to enable async training mode
+    # Max age (in training steps) for trajectories used in training
+    max_trajectory_age_steps: 1
+
+loss_fn:
+  reference_policy_kl_penalty: 0.01
+  ratio_clip_min: 0.2
+  ratio_clip_max: 0.2
+  ratio_clip_c: null
+  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
+  use_on_policy_kl_approximation: false
+  # Async GRPO requires importance sampling correction enabled
+  # Set to true when async_grpo.enabled is true
+  use_importance_sampling_correction: false
+  truncated_importance_sampling_ratio: null
+  sequence_level_importance_ratios: false
+  token_level_loss: true
+
+checkpointing:
+  enabled: true
+  checkpoint_dir: "results/grpo"
+  metric_name: "val_reward"
+  higher_is_better: true
+  keep_top_k: 50
+  save_period: 10
+  checkpoint_must_save_by: null
+
+policy:
+  model_name: ???
+  tokenizer:
+    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
+  train_global_batch_size: 512
+  train_micro_batch_size: 4
+  generation_batch_size: 32 # Only used when generating using HF backend
+  logprob_batch_size: 4
+  max_total_sequence_length: 512
+  precision: "bfloat16"
+  fsdp_offload_enabled: false
+  activation_checkpointing_enabled: false
+  refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 1
+  lr: 1e-6
+  weight_decay: 0.01
+  min_lr: 1e-6
+
+  dtensor_cfg:
+    enabled: true
+    cpu_offload: False
+    sequence_parallel: false
+    activation_checkpointing: false
+    tensor_parallel_size: ${policy.tensor_model_parallel_size}
+    context_parallel_size: ${policy.context_parallel_size}
+    custom_parallel_plan: null
+
+  # dynamic_batching improves performance by ensuring logprob and training microbatches
+  # have a sufficient number of tokens to maximize GPU utilization. Specifically, variable-length
+  # responses are sorted by sequence length and bucketed into microbatches whose total
+  # token count is approximately 'train_mb_tokens' and 'logprob_mb_tokens' for the
+  # training and logprob stages respectively.
+  # We disable dynamic batching for Megatron as it is incompatible with Pipeline parallelism.
+  # Instead, we use sequence packing.
+  dynamic_batching:
+    enabled: False
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    sequence_length_round: 64
+
+  sequence_packing:
+    enabled: True
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+  #If this value is set to null, it will be automatically assigned in the code.
+  make_sequence_length_divisible_by: null
+  max_grad_norm: 1.0
+
+  optimizer:
+    name: "torch.optim.AdamW"
+    kwargs:
+      lr: ${policy.lr}
+      weight_decay: ${policy.weight_decay}
+      betas: [0.9, 0.999]
+      eps: 1e-8
+      # when using Dtensor, we need to set foreach
+      # and fused to False
+      foreach: False
+      fused: False
+
+  scheduler:
+  - name: "torch.optim.lr_scheduler.LinearLR"
+    kwargs:
+      start_factor: 1.0
+      end_factor: 1.0
+      total_iters: 1   # must be >=1, here it keeps LR constant
+  - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
+    kwargs:
+      T_max: ${grpo.max_num_steps}      # total training steps
+      eta_min: ${policy.lr}   # set min_lr = initial_lr -> constant schedule
+  - milestones: [0]  # required to avoid config errors
+
+
+
+  megatron_cfg:
+    enabled: true
+    empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs
+    activation_checkpointing: false
+    converter_type: "Qwen2ForCausalLM"
+    tensor_model_parallel_size: ${policy.tensor_model_parallel_size}
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size}
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    context_parallel_size: ${policy.context_parallel_size}
+    pipeline_dtype: ${policy.precision}
+    sequence_parallel: false
+    freeze_moe_router: true
+    moe_router_dtype: "fp64"
+    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
+    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
+    moe_permute_fusion: false
+    bias_activation_fusion: True
+    #gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: True
+
+    optimizer:
+      optimizer: "adam"
+      lr: ${policy.lr}
+      min_lr: ${policy.min_lr}
+      weight_decay: ${policy.weight_decay}
+      bf16: true
+      fp16: false
+      params_dtype: "float32"
+
+      #adam
+      adam_beta1: 0.9
+      adam_beta2: 0.999
+      adam_eps: 1e-8
+
+      #sgd
+      sgd_momentum: 0.9
+
+      #distributed optimizer
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
+      clip_grad: ${policy.max_grad_norm}
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "cosine"
+      lr_decay_iters: ${grpo.max_num_steps}
+      lr_warmup_iters: 0
+      lr_warmup_init: 1.0e-6
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      average_in_collective: true
+      use_custom_fsdp: false
+      data_parallel_sharding_strategy: "optim_grads_params"
+
+
+
+  generation:
+    backend: "vllm"
+    max_new_tokens: ${policy.max_total_sequence_length}
+    temperature: 1.0
+    top_p: 1.0
+    top_k: null
+    stop_token_ids: null
+    stop_strings: null
+    vllm_cfg:
+      async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447.
+      precision: ${policy.precision}
+      tensor_parallel_size: 1
+      pipeline_parallel_size: 1
+      expert_parallel_size: 1  # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP
+      gpu_memory_utilization: 0.6
+      max_model_len: ${policy.max_total_sequence_length}
+      enable_expert_parallel: false
+      enforce_eager: True # Set as True to avoid vllm bug
+      # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit
+      # For Gemma models, we need to use "auto" due to a vllm bug
+      load_format: dummy
+    colocated:
+      # true: generation shares training GPUs
+      # false: uses dedicated generation resources
+      enabled: true
+      # only relevant when enabled is false
+      resources:
+        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
+        num_nodes: null # Decides number of nodes to be dedicated to generation
+
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
+  shuffle: true
+  prompt:
+    prompt_config: ???
+    examples_type: null
+    config_dir: null
+    template_dir: null
+  train_data_path: null
+  val_data_path: null
+  num_workers: 10
+
+env:
+  math:
+    env_cls: nemo_skills.training.nemo_rl.environments.math_environment.MathEnvironment
+    num_workers: 8
+
+logger:
+  log_dir: "logs"  # Base directory for all logs
+  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
+  wandb_enabled: false
+  tensorboard_enabled: false
+  mlflow_enabled: false
+  swanlab_enabled: false # Disable SwanLab logging
+  monitor_gpus: false  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb:
+    project: "grpo-dev"
+    name: "grpo-dev-logger"
+  tensorboard: {}
+  mlflow:
+    experiment_name: "grpo-dev"
+    run_name: "grpo-dev-logger"
+  gpu_monitoring:
+    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
+
+cluster:
+  gpus_per_node: 1
+  num_nodes: 1
+
+checkpoint_must_save_by: null
diff --git a/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml
new file mode 100644
index 0000000000..20cc35ff8d
--- /dev/null
+++ b/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml
@@ -0,0 +1,197 @@
+# SFT Algorithm Configuration
+sft:
+  ## total number of steps to train will equal
+  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
+  # setting both to big values by default, so only one needs to be set
+  max_num_epochs: 100000000
+  max_num_steps: 100000000
+
+  val_period: 0
+  val_batches: 1
+  val_global_batch_size: 32
+  val_micro_batch_size: 1
+  val_at_start: false
+  seed: 42
+
+checkpointing:
+  enabled: true
+  checkpoint_dir: "results/sft"
+  metric_name: "val_loss"
+  higher_is_better: false
+  keep_top_k: 50
+  save_period: 100
+  checkpoint_must_save_by: null
+
+
+policy:
+  model_name: ???
+  tokenizer:
+    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+    chat_template: "infer_from_data"  ## Can be: null (passthrough), "default" (tokenizer's default), "infer_from_data" (auto-detect from data), or custom jinja2 template
+    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
+  train_global_batch_size: 32
+  train_micro_batch_size: 1
+  max_total_sequence_length: 4096
+  precision: "bfloat16"
+  fsdp_offload_enabled: false
+  activation_checkpointing_enabled: false
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 1
+  sequence_parallel: false
+  lr: 1e-6
+  weight_decay: 0.01
+  min_lr: 1e-6
+
+
+  dtensor_cfg:
+    enabled: true
+    cpu_offload: False
+    sequence_parallel: ${policy.sequence_parallel}
+    activation_checkpointing: false
+    tensor_parallel_size: ${policy.tensor_model_parallel_size}
+    context_parallel_size: ${policy.context_parallel_size}
+    custom_parallel_plan: null
+
+
+  megatron_cfg:
+    enabled: false
+    activation_checkpointing: false
+    tensor_model_parallel_size: ${policy.tensor_model_parallel_size}
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size}
+    context_parallel_size: ${policy.context_parallel_size}
+    pipeline_dtype: ${policy.precision}
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    sequence_parallel: ${policy.sequence_parallel}
+    freeze_moe_router: false
+    moe_router_dtype: null
+    moe_router_load_balancing_type: "aux_loss"
+    moe_router_bias_update_rate: 1e-3
+    moe_permute_fusion: false
+    #gives ~20% training perf speedup with sequence packing
+    bias_activation_fusion: True
+    apply_rope_fusion: True  # Only used if position_embedding_type=rope
+    layernorm_epsilon: 1e-6
+    empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs
+
+    optimizer:
+      optimizer: "adam"
+      lr: ${policy.lr}
+      min_lr: ${policy.min_lr}
+      weight_decay: ${policy.weight_decay}
+      bf16: true  # must be true to avoid checkpoint load error
+      fp16: false
+      params_dtype: "float32"
+
+      #adam
+      adam_beta1: 0.9
+      adam_beta2: 0.98
+      adam_eps: 1e-8
+
+      #sgd
+      sgd_momentum: 0.9
+
+      #distributed optimizer
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+
+      # clip_grad: ${policy.max_grad_norm}
+
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "cosine"
+      lr_decay_iters: ${sft.max_num_steps}
+      lr_warmup_iters: 0
+      lr_warmup_init: 1.0e-6
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      average_in_collective: true
+      data_parallel_sharding_strategy: "optim_grads_params"
+
+  dynamic_batching:
+    enabled: false
+
+
+  sequence_packing:
+    enabled: True
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+  #If this value is set to null, it will be automatically assigned in the code.
+  make_sequence_length_divisible_by: null
+  max_grad_norm: null
+
+  optimizer:
+    name: "torch.optim.AdamW"
+    kwargs:
+      lr: ${policy.lr}
+      weight_decay: ${policy.weight_decay}
+      betas: [0.9, 0.98]
+      eps: 1e-8
+      # when using Dtensor, we need to set foreach
+      # and fused to False
+      foreach: False
+      fused: False
+
+  scheduler:
+  - name: "torch.optim.lr_scheduler.LinearLR"
+    kwargs:
+      start_factor: 1.0
+      end_factor: 1.0
+      total_iters: 1   # must be >=1, here it keeps LR constant
+  - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
+    kwargs:
+      T_max: ${sft.max_num_steps}      # total training steps
+      eta_min: ${policy.lr}   # set min_lr = initial_lr -> constant schedule
+  - milestones: [0]  # required to avoid config errors
+
+
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length}
+  dataset_name: prompt_response_dataset
+  add_bos: false
+  add_eos: false
+  add_generation_prompt: false
+  input_key: input
+  output_key: output
+  force_reprocess: false
+  shuffle: true
+  num_workers: 10
+
+logger:
+  log_dir: "logs"  # Base directory for all logs
+  wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running
+  tensorboard_enabled: true
+  mlflow_enabled: false
+  swanlab_enabled: false # Disable SwanLab logging
+  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
+  wandb:
+    project: "sft-dev"
+    name: "sft-dev-${data.dataset_name}"
+  tensorboard:
+    log_dir: "tb_logs-sft-dev-${data.dataset_name}"
+  mlflow:
+    experiment_name: "sft-dev"
+    run_name: "sft-dev-${data.dataset_name}"
+  gpu_monitoring:
+    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
+
+
+cluster:
+  gpus_per_node: 1
+  num_nodes: 1

From f00cc8e704c9225b7725ea51b13ca5595bde5a4b Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Wed, 10 Dec 2025 14:45:12 -0800
Subject: [PATCH 04/17] update dockerfile

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 dockerfiles/Dockerfile.nemo-rl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl
index ee00953949..27d9b00ce9 100644
--- a/dockerfiles/Dockerfile.nemo-rl
+++ b/dockerfiles/Dockerfile.nemo-rl
@@ -1,3 +1,4 @@
+# syntax=docker/dockerfile:1
 # copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/main/docker/Dockerfile
 # TODO: from next update try to re-use their dockerfile as is as they support specifying the commit
 
@@ -5,8 +6,8 @@ ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
 
 FROM scratch AS nemo-rl
 
-ARG NRL_GIT_REF=main
-ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /
+ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9}
+ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NEMO_RL_COMMIT} /
 
 
 FROM ${BASE_IMAGE} AS base
@@ -111,12 +112,10 @@ WORKDIR /opt/NeMo-RL
 
 FROM hermetic AS release
 
-ARG NEMO_RL_COMMIT
 ARG NVIDIA_BUILD_ID
 ARG NVIDIA_BUILD_REF
 ARG RC_DATE=00.00
 ARG TARGETARCH
-ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9}
 ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>}
 ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
 LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"

From cf9c856b549cb883b5611153160262655a5a72df Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Thu, 15 Jan 2026 12:01:59 -0800
Subject: [PATCH 05/17] update nemo-rl to latest commit

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 dockerfiles/Dockerfile.nemo-rl                | 10 +++++--
 .../training/nemo_rl/configs/grpo.yaml        | 26 +++++++++++-----
 nemo_skills/training/nemo_rl/configs/sft.yaml | 30 +++++++++++++++++++
 3 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl
index 27d9b00ce9..9b66bf7f44 100644
--- a/dockerfiles/Dockerfile.nemo-rl
+++ b/dockerfiles/Dockerfile.nemo-rl
@@ -6,7 +6,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
 
 FROM scratch AS nemo-rl
 
-ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-09476838c92c27a7488afbe6febb6339d3d79be9}
+ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-e95efb912a6909b5da91ffeb197debe91fd480d8}
 ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NEMO_RL_COMMIT} /
 
 
@@ -38,6 +38,8 @@ apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos
 apt update
 apt install -y nsight-systems-cli
 
+# To fix CVE-2025-68973
+apt install -y --only-upgrade gnupg
 
 apt-get clean
 rm -rf /var/lib/apt/lists/*
@@ -85,7 +87,7 @@ COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh
 COPY --from=nemo-rl --link research/ ./research/
 COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/
 
-RUN <<"EOF" bash -exu
+RUN --mount=type=ssh <<"EOF" bash -exu
 uv venv --seed
 if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then
     bash tools/build-custom-vllm.sh
@@ -103,6 +105,10 @@ uv sync --link-mode symlink --locked --extra vllm --no-install-project
 uv sync --link-mode symlink --locked --extra mcore --no-install-project
 uv sync --link-mode symlink --locked --extra automodel --no-install-project
 uv sync --link-mode symlink --locked --all-groups --no-install-project
+
+# Remove the aiohttp in this uv cache dir to fully address CVE GHSA-mqqc-3gqh-h2x8
+# The ray install will include the older aiohttp version in its cache
+find /root/.cache/uv -type d -path "*ray/_private/runtime_env/agent/thirdparty_files/aiohttp*" -exec rm -rf {} +
 EOF
 
 ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml
index 187c0ff686..5993e0430d 100644
--- a/nemo_skills/training/nemo_rl/configs/grpo.yaml
+++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml
@@ -1,4 +1,4 @@
-# Copied and edited from https://github.com/NVIDIA-NeMo/RL/blob/64ab08df3edf25131959fc474b44ed5e36a1600b/examples/configs/grpo_math_1B.yaml
+# Copied and edited from https://github.com/NVIDIA-NeMo/RL/blob/e95efb912a6909b5da91ffeb197debe91fd480d8/examples/configs/grpo_math_1B.yaml
 # GRPO Algorithm Configuration
 grpo:
   num_prompts_per_step: 32
@@ -79,8 +79,6 @@ policy:
   logprob_batch_size: 4
   max_total_sequence_length: 512
   precision: "bfloat16"
-  activation_checkpointing_enabled: false
-  refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
   context_parallel_size: 1
@@ -157,10 +155,10 @@ policy:
       start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       weight_decay_incr_style: "constant"
-      lr_decay_style: "cosine"
-      lr_decay_iters: ${grpo.max_num_steps}
-      lr_warmup_iters: 0
-      lr_warmup_init: 1.0e-6
+      lr_decay_style: "constant"
+      lr_decay_iters: 1000
+      lr_warmup_iters: 13
+      lr_warmup_init: 5.0e-7
 
     distributed_data_parallel_config:
       grad_reduce_in_fp32: false
@@ -226,7 +224,16 @@ policy:
     top_k: null
     stop_token_ids: null
     stop_strings: null
-    vllm_cfg:
+    mcore_generation_config: # When using megatron for generation
+      buffer_size_gb: 20  # Total GPU memory (in GB) allocated for KV cache buffers
+      buffer_guaranteed_fraction: 0.1  # Fraction of buffer reserved for guaranteed active requests
+      num_cuda_graphs: 16  # Number of CUDA graphs to pre-compile for different batch sizes
+      block_size_tokens: 256  # Size of each KV cache block in tokens (affects memory granularity)
+      use_cuda_graphs_for_non_decode_steps: true  # Enable CUDA graphs for prefill/context processing
+      enable_chunked_prefill: true  # Split long prefills into chunks for better memory management
+      unified_memory_level: 0  # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+    vllm_cfg: # When using vllm for generation
       async_engine: false
       precision: ${policy.precision}
       kv_cache_dtype: "auto"
@@ -282,6 +289,9 @@ logger:
   wandb:
     project: "grpo-dev"
     name: "grpo-dev-logger"
+  swanlab:
+    project: "grpo-dev"
+    name: "grpo-dev-logger"
   tensorboard: {}
   mlflow:
     experiment_name: "grpo-dev"
diff --git a/nemo_skills/training/nemo_rl/configs/sft.yaml b/nemo_skills/training/nemo_rl/configs/sft.yaml
index 6781241939..3a8ed6158e 100644
--- a/nemo_skills/training/nemo_rl/configs/sft.yaml
+++ b/nemo_skills/training/nemo_rl/configs/sft.yaml
@@ -43,6 +43,7 @@ policy:
   min_lr: 1e-6
 
   dtensor_cfg:
+    _v2: true
     enabled: true
     env_vars: {}
     cpu_offload: False
@@ -52,6 +53,19 @@ policy:
     context_parallel_size: ${policy.context_parallel_size}
     custom_parallel_plan: null
 
+    # LoRA (Low-Rank Adaptation) Configuration
+    lora_cfg:
+      enabled: False  # Set to True to enable LoRA fine-tuning
+      target_modules: []  # List of module names to apply LoRA (empty list with match_all_linear=true applies to all linear layers)
+      exclude_modules: []  # List of module names to exclude from LoRA
+      match_all_linear: true  # If True, applies LoRA to all linear layers (overrides target_modules)
+      dim: 8  # LoRA rank (r): lower rank = fewer parameters but less capacity. Typical values: 4, 8, 16, 32, 64
+      alpha: 32  # LoRA scaling factor: effective learning rate multiplier = alpha/dim. Typical values: 16, 32, 64
+      dropout: 0.0  # Dropout probability applied to LoRA layers (0.0 = no dropout)
+      dropout_position: "post"  # Where to apply dropout: "pre" (before LoRA) or "post" (after LoRA)
+      lora_A_init: "xavier"  # Initialization method for LoRA A matrix: "xavier" or "uniform"
+      use_triton: true  # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). Disable when tensor_parallel_size > 1
+
   megatron_cfg:
     enabled: false
     env_vars: {}
@@ -79,6 +93,19 @@ policy:
     defer_fp32_logits: False
     moe_per_layer_logging: False
 
+    peft:
+      enabled: false
+      target_modules: []
+      exclude_modules: []
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init_method: "xavier"
+      lora_B_init_method: "zero"
+      a2a_experimental: false
+      lora_dtype: None
+
     optimizer:
       optimizer: "adam"
       lr: ${policy.lr}
@@ -186,6 +213,9 @@ logger:
   wandb:
     project: "sft-dev"
     name: "sft-dev-${data.dataset_name}"
+  swanlab:
+    project: "sft-dev"
+    name: "sft-dev-${data.dataset_name}"
   tensorboard:
     log_dir: "tb_logs-sft-dev-${data.dataset_name}"
   mlflow:

From 53a9e93250fbe4ce42b552ad682c4c5144ee3eea Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Thu, 15 Jan 2026 12:04:13 -0800
Subject: [PATCH 06/17] add one more comment

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 nemo_skills/training/nemo_rl/configs/grpo.yaml | 2 +-
 nemo_skills/training/nemo_rl/configs/sft.yaml  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml
index 5993e0430d..c6e30dd305 100644
--- a/nemo_skills/training/nemo_rl/configs/grpo.yaml
+++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml
@@ -190,7 +190,7 @@ policy:
   # makes the training sequence length divisible by the tensor parallel size
   # this is useful for sequence parallel training
   make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  max_grad_norm: 1.0
+  max_grad_norm: 1.0 # megatron: Zero means no clipping, FSDP: null means no clipping
 
   optimizer:
     name: "torch.optim.AdamW"
diff --git a/nemo_skills/training/nemo_rl/configs/sft.yaml b/nemo_skills/training/nemo_rl/configs/sft.yaml
index 3a8ed6158e..0c6d470e82 100644
--- a/nemo_skills/training/nemo_rl/configs/sft.yaml
+++ b/nemo_skills/training/nemo_rl/configs/sft.yaml
@@ -163,7 +163,7 @@ policy:
   # makes the training sequence length divisible by the tensor parallel size
   # this is useful for sequence parallel training
   make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  max_grad_norm: 0.0 # Zero means no clipping
+  max_grad_norm: 0.0 # megatron: Zero means no clipping, FSDP: null means no clipping
 
   optimizer:
     name: "torch.optim.AdamW"

From 95b50a1f4312bfd2a8baa6c081c5f6a615c57949 Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Tue, 27 Jan 2026 14:20:35 -0800
Subject: [PATCH 07/17] Remove legacy and rollback grpo configs

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 .../nemo_rl/configs/grpo-legacy-85eeb8d.yaml  | 279 ------------------
 .../training/nemo_rl/configs/grpo.yaml        |   8 +-
 .../nemo_rl/configs/sft-legacy-85eeb8d.yaml   | 197 -------------
 3 files changed, 4 insertions(+), 480 deletions(-)
 delete mode 100644 nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml
 delete mode 100644 nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml

diff --git a/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml
deleted file mode 100644
index 53428ca873..0000000000
--- a/nemo_skills/training/nemo_rl/configs/grpo-legacy-85eeb8d.yaml
+++ /dev/null
@@ -1,279 +0,0 @@
-# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/ab1b638a499308caea022648daaf6994d390cbde/examples/configs/grpo_math_1B.yaml
-
-# GRPO Algorithm Configuration
-grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
-  max_num_epochs: 1
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 0
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  use_dynamic_sampling: false
-  dynamic_sampling_max_gen_batches: 10
-  batch_multiplier: 1
-  reward_shaping:
-    enabled: false
-    overlong_buffer_length: 128
-    overlong_buffer_penalty: 1
-    max_response_length: ${policy.max_total_sequence_length}
-  reward_scaling:
-    enabled: false
-    source_min: 0.0
-    source_max: 1.0
-    target_min: 0.0
-    target_max: 1.0
-
-  async_grpo:
-    enabled: false # Set to true to enable async training mode
-    # Max age (in training steps) for trajectories used in training
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  # Async GRPO requires importance sampling correction enabled
-  # Set to true when async_grpo.enabled is true
-  use_importance_sampling_correction: false
-  truncated_importance_sampling_ratio: null
-  sequence_level_importance_ratios: false
-  token_level_loss: true
-
-checkpointing:
-  enabled: true
-  checkpoint_dir: "results/grpo"
-  metric_name: "val_reward"
-  higher_is_better: true
-  keep_top_k: 50
-  save_period: 10
-  checkpoint_must_save_by: null
-
-policy:
-  model_name: ???
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32 # Only used when generating using HF backend
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: "bfloat16"
-  fsdp_offload_enabled: false
-  activation_checkpointing_enabled: false
-  refit_buffer_size_gb: 4 # used for refitting inference engine, the unit is GB
-  tensor_model_parallel_size: 1
-  pipeline_model_parallel_size: 1
-  context_parallel_size: 1
-  lr: 1e-6
-  weight_decay: 0.01
-  min_lr: 1e-6
-
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: False
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: ${policy.tensor_model_parallel_size}
-    context_parallel_size: ${policy.context_parallel_size}
-    custom_parallel_plan: null
-
-  # dynamic_batching improves performance by ensuring logprob and training microbatches
-  # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length
-  # responses are sorted by sequence length and bucketed into microbatches with a total
-  # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the
-  # training and logprob stages respectively.
-  # We disable dynamic batching for Megatron as it is incompatible with Pipeline parallelism.
-  # Instead, we use sequence packing.
-  dynamic_batching:
-    enabled: False
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
-
-  sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-  #If this value is set to null, it will be automatically assigned in the code.
-  make_sequence_length_divisible_by: null
-  max_grad_norm: 1.0
-
-  optimizer:
-    name: "torch.optim.AdamW"
-    kwargs:
-      lr: ${policy.lr}
-      weight_decay: ${policy.weight_decay}
-      betas: [0.9, 0.999]
-      eps: 1e-8
-      # when using Dtensor, we need to set foreach
-      # and fused to False
-      foreach: False
-      fused: False
-
-  scheduler:
-  - name: "torch.optim.lr_scheduler.LinearLR"
-    kwargs:
-      start_factor: 1.0
-      end_factor: 1.0
-      total_iters: 1   # must be >=1, here it keeps LR constant
-  - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
-    kwargs:
-      T_max: ${grpo.max_num_steps}      # total training steps
-      eta_min: ${policy.lr}   # set min_lr = initial_lr -> constant schedule
-  - milestones: [0]  # required to avoid config errors
-
-
-
-  megatron_cfg:
-    enabled: true
-    empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
-    tensor_model_parallel_size: ${policy.tensor_model_parallel_size}
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: ${policy.context_parallel_size}
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    bias_activation_fusion: True
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
-    optimizer:
-      optimizer: "adam"
-      lr: ${policy.lr}
-      min_lr: ${policy.min_lr}
-      weight_decay: ${policy.weight_decay}
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-      clip_grad: ${policy.max_grad_norm}
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "cosine"
-      lr_decay_iters: ${grpo.max_num_steps}
-      lr_warmup_iters: 0
-      lr_warmup_init: 1.0e-6
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-
-
-  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
-    vllm_cfg:
-      async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447.
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1  # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      enable_expert_parallel: false
-      enforce_eager: True # Set as True to avoid vllm bug
-      # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit
-      # For Gemma models, we need to use "auto" due to a vllm bug
-      load_format: dummy
-    colocated:
-      # true: generation shares training GPUs
-      # false: uses dedicated generation resources
-      enabled: true
-      # only relevant when enabled is false
-      resources:
-        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
-        num_nodes: null # Decides number of nodes to be dedicated to generation
-
-data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  shuffle: true
-  prompt:
-    prompt_config: ???
-    examples_type: null
-    config_dir: null
-    template_dir: null
-  train_data_path: null
-  val_data_path: null
-  num_workers: 10
-
-env:
-  math:
-    env_cls: nemo_skills.training.nemo_rl.environments.math_environment.MathEnvironment
-    num_workers: 8
-
-logger:
-  log_dir: "logs"  # Base directory for all logs
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: false
-  tensorboard_enabled: false
-  mlflow_enabled: false
-  swanlab_enabled: false # Disable SwanLab logging
-  monitor_gpus: false  # If true, will monitor GPU usage and log to wandb and/or tensorboard
-  wandb:
-    project: "grpo-dev"
-    name: "grpo-dev-logger"
-  tensorboard: {}
-  mlflow:
-    experiment_name: "grpo-dev"
-    run_name: "grpo-dev-logger"
-  gpu_monitoring:
-    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
-
-cluster:
-  gpus_per_node: 1
-  num_nodes: 1
-
-checkpoint_must_save_by: null
diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml
index c6e30dd305..e48e704997 100644
--- a/nemo_skills/training/nemo_rl/configs/grpo.yaml
+++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml
@@ -155,7 +155,7 @@ policy:
       start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
+      lr_decay_style: "cosine" # This is equivalent to constant unless min_lr is set to smaller value
       lr_decay_iters: 1000
       lr_warmup_iters: 13
       lr_warmup_init: 5.0e-7
@@ -210,10 +210,10 @@ policy:
         start_factor: 0.1
         end_factor: 1.0
         total_iters: 10
-    - name: "torch.optim.lr_scheduler.ConstantLR"
+    - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
       kwargs:
-        factor: 1.0
-        total_iters: 10000000000
+        T_max: ${grpo.max_num_steps}
+        eta_min: ${policy.min_lr}
     - milestones: [10]
 
   generation:
diff --git a/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml b/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml
deleted file mode 100644
index 20cc35ff8d..0000000000
--- a/nemo_skills/training/nemo_rl/configs/sft-legacy-85eeb8d.yaml
+++ /dev/null
@@ -1,197 +0,0 @@
-# SFT Algorithm Configuration
-sft:
-  ## total number of steps to train will equal
-  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
-  # setting both to big values by default, so only one needs to be set
-  max_num_epochs: 100000000
-  max_num_steps: 100000000
-
-  val_period: 0
-  val_batches: 1
-  val_global_batch_size: 32
-  val_micro_batch_size: 1
-  val_at_start: false
-  seed: 42
-
-checkpointing:
-  enabled: true
-  checkpoint_dir: "results/sft"
-  metric_name: "val_loss"
-  higher_is_better: false
-  keep_top_k: 50
-  save_period: 100
-  checkpoint_must_save_by: null
-
-
-policy:
-  model_name: ???
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-    chat_template: "infer_from_data"  ## Can be: null (passthrough), "default" (tokenizer's default), "infer_from_data" (auto-detect from data), or custom jinja2 template
-    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
-  train_global_batch_size: 32
-  train_micro_batch_size: 1
-  max_total_sequence_length: 4096
-  precision: "bfloat16"
-  fsdp_offload_enabled: false
-  activation_checkpointing_enabled: false
-  tensor_model_parallel_size: 1
-  pipeline_model_parallel_size: 1
-  context_parallel_size: 1
-  sequence_parallel: false
-  lr: 1e-6
-  weight_decay: 0.01
-  min_lr: 1e-6
-
-
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: False
-    sequence_parallel: ${policy.sequence_parallel}
-    activation_checkpointing: false
-    tensor_parallel_size: ${policy.tensor_model_parallel_size}
-    context_parallel_size: ${policy.context_parallel_size}
-    custom_parallel_plan: null
-
-
-  megatron_cfg:
-    enabled: false
-    activation_checkpointing: false
-    tensor_model_parallel_size: ${policy.tensor_model_parallel_size}
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: ${policy.pipeline_model_parallel_size}
-    context_parallel_size: ${policy.context_parallel_size}
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    sequence_parallel: ${policy.sequence_parallel}
-    freeze_moe_router: false
-    moe_router_dtype: null
-    moe_router_load_balancing_type: "aux_loss"
-    moe_router_bias_update_rate: 1e-3
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    bias_activation_fusion: True
-    apply_rope_fusion: True  # Only used if position_embedding_type=rope
-    layernorm_epsilon: 1e-6
-    empty_unused_memory_level: 0 # setting it to 0 will maximize performance, but it might lead to OOMs, while setting it to 2 to minimize the peak memory usage to avoid OOMs
-
-    optimizer:
-      optimizer: "adam"
-      lr: ${policy.lr}
-      min_lr: ${policy.min_lr}
-      weight_decay: ${policy.weight_decay}
-      bf16: true  # must be true to avoid checkpoint load error
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.98
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      # clip_grad: ${policy.max_grad_norm}
-
-      # optimizer cpu offload
-      optimizer_cpu_offload: false
-      optimizer_offload_fraction: 0.0
-
-    scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "cosine"
-      lr_decay_iters: ${sft.max_num_steps}
-      lr_warmup_iters: 0
-      lr_warmup_init: 1.0e-6
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-  dynamic_batching:
-    enabled: false
-
-
-  sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-  #If this value is set to null, it will be automatically assigned in the code.
-  make_sequence_length_divisible_by: null
-  max_grad_norm: null
-
-  optimizer:
-    name: "torch.optim.AdamW"
-    kwargs:
-      lr: ${policy.lr}
-      weight_decay: ${policy.weight_decay}
-      betas: [0.9, 0.98]
-      eps: 1e-8
-      # when using Dtensor, we need to set foreach
-      # and fused to False
-      foreach: False
-      fused: False
-
-  scheduler:
-  - name: "torch.optim.lr_scheduler.LinearLR"
-    kwargs:
-      start_factor: 1.0
-      end_factor: 1.0
-      total_iters: 1   # must be >=1, here it keeps LR constant
-  - name: "torch.optim.lr_scheduler.CosineAnnealingLR"
-    kwargs:
-      T_max: ${sft.max_num_steps}      # total training steps
-      eta_min: ${policy.lr}   # set min_lr = initial_lr -> constant schedule
-  - milestones: [0]  # required to avoid config errors
-
-
-data:
-  max_input_seq_length: ${policy.max_total_sequence_length}
-  dataset_name: prompt_response_dataset
-  add_bos: false
-  add_eos: false
-  add_generation_prompt: false
-  input_key: input
-  output_key: output
-  force_reprocess: false
-  shuffle: true
-  num_workers: 10
-
-logger:
-  log_dir: "logs"  # Base directory for all logs
-  wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running
-  tensorboard_enabled: true
-  mlflow_enabled: false
-  swanlab_enabled: false # Disable SwanLab logging
-  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
-  num_val_samples_to_print: 0  # Number of validation samples to pretty print on terminal
-  wandb:
-    project: "sft-dev"
-    name: "sft-dev-${data.dataset_name}"
-  tensorboard:
-    log_dir: "tb_logs-sft-dev-${data.dataset_name}"
-  mlflow:
-    experiment_name: "sft-dev"
-    run_name: "sft-dev-${data.dataset_name}"
-  gpu_monitoring:
-    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
-
-
-cluster:
-  gpus_per_node: 1
-  num_nodes: 1

From 502df62f80cba18f930058af3ff0f683dd8a00a5 Mon Sep 17 00:00:00 2001
From: Sadegh Mahdavi <smahdavi@nvidia.com>
Date: Tue, 27 Jan 2026 14:21:29 -0800
Subject: [PATCH 08/17] Remove legacy and rollback grpo configs

Signed-off-by: Sadegh Mahdavi <smahdavi@nvidia.com>
---
 nemo_skills/training/nemo_rl/configs/grpo.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_skills/training/nemo_rl/configs/grpo.yaml b/nemo_skills/training/nemo_rl/configs/grpo.yaml
index e48e704997..9ec2af3358 100644
--- a/nemo_skills/training/nemo_rl/configs/grpo.yaml
+++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml
@@ -156,7 +156,7 @@ policy:
       end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       weight_decay_incr_style: "constant"
       lr_decay_style: "cosine" # This is equivalent to constant unless min_lr is set to smaller value
-      lr_decay_iters: 1000
+      lr_decay_iters: ${grpo.max_num_steps}
       lr_warmup_iters: 13
       lr_warmup_init: 5.0e-7
 

From 02f5e89a94263f1ea480443e638e9f833c99242b Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 17:35:28 -0800
Subject: [PATCH 09/17] Update conversion script

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 .../training/nemo_rl/convert_dcp_to_hf.py     | 110 ++++++++++++++++--
 1 file changed, 101 insertions(+), 9 deletions(-)

diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
index c54e5842b9..da28b51cf2 100644
--- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
+++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
@@ -16,11 +16,12 @@
 # and added logic to figure out max step automatically
 
 import argparse
+import glob
 import os
 import re
+import shutil
 
 import yaml
-from nemo_rl.utils.native_checkpoint import convert_dcp_to_hf
 
 
 def parse_args():
@@ -80,6 +81,84 @@ def find_max_step_folder(training_folder, step_override=None):
     return os.path.join(training_folder, f"step_{chosen_step}")
 
 
+def is_safetensors_checkpoint(weights_path):
+    """Check if checkpoint is in the new safetensors format (has model/.hf_metadata/)."""
+    hf_metadata_path = os.path.join(weights_path, "model", ".hf_metadata")
+    return os.path.isdir(hf_metadata_path)
+
+
+def convert_safetensors_to_hf(weights_path, hf_ckpt_path, tokenizer_path, hf_overrides=None):
+    """Convert safetensors checkpoint to HF format by reorganizing files."""
+    model_dir = os.path.join(weights_path, "model")
+    hf_metadata_dir = os.path.join(model_dir, ".hf_metadata")
+
+    os.makedirs(hf_ckpt_path, exist_ok=True)
+
+    # Copy config.json from .hf_metadata
+    config_src = os.path.join(hf_metadata_dir, "config.json")
+    if os.path.exists(config_src):
+        shutil.copy2(config_src, os.path.join(hf_ckpt_path, "config.json"))
+
+    # Copy generation_config.json if exists
+    gen_config_src = os.path.join(hf_metadata_dir, "generation_config.json")
+    if os.path.exists(gen_config_src):
+        shutil.copy2(gen_config_src, os.path.join(hf_ckpt_path, "generation_config.json"))
+
+    # Find and copy safetensors files
+    safetensors_files = glob.glob(os.path.join(model_dir, "*.safetensors"))
+    if len(safetensors_files) == 1:
+        # Single shard - rename to model.safetensors
+        shutil.copy2(safetensors_files[0], os.path.join(hf_ckpt_path, "model.safetensors"))
+    else:
+        # Multiple shards - copy with standard naming and create index
+        import json
+
+        weight_map = {}
+        for i, src_file in enumerate(sorted(safetensors_files), 1):
+            dst_name = f"model-{i:05d}-of-{len(safetensors_files):05d}.safetensors"
+            shutil.copy2(src_file, os.path.join(hf_ckpt_path, dst_name))
+
+            # Read keys from safetensors file to build weight_map
+            from safetensors import safe_open
+
+            with safe_open(src_file, framework="pt") as f:
+                for key in f.keys():
+                    weight_map[key] = dst_name
+
+        # Write index file
+        index = {"metadata": {}, "weight_map": weight_map}
+        with open(os.path.join(hf_ckpt_path, "model.safetensors.index.json"), "w") as f:
+            json.dump(index, f, indent=2)
+
+    # Copy tokenizer files from the original model
+    tokenizer_files = [
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "special_tokens_map.json",
+        "vocab.json",
+        "merges.txt",
+        "added_tokens.json",
+        "chat_template.jinja",
+    ]
+    for fname in tokenizer_files:
+        src = os.path.join(tokenizer_path, fname)
+        if os.path.exists(src):
+            shutil.copy2(src, os.path.join(hf_ckpt_path, fname))
+
+    # Apply hf_overrides to config.json if provided
+    if hf_overrides:
+        import json
+
+        config_path = os.path.join(hf_ckpt_path, "config.json")
+        with open(config_path, "r") as f:
+            config = json.load(f)
+        config.update(hf_overrides)
+        with open(config_path, "w") as f:
+            json.dump(config, f, indent=2)
+
+    return hf_ckpt_path
+
+
 def main():
     """Main entry point."""
     args = parse_args()
@@ -122,14 +201,27 @@ def main():
     if args.max_position_embeddings:
         hf_overrides["max_position_embeddings"] = args.max_position_embeddings
 
-    hf_ckpt = convert_dcp_to_hf(
-        dcp_ckpt_path=dcp_ckpt_path,
-        hf_ckpt_path=args.hf_ckpt_path,
-        model_name_or_path=model_name_or_path,
-        tokenizer_name_or_path=tokenizer_name_or_path,
-        overwrite=True,
-        hf_overrides=hf_overrides,
-    )
+    # Check if checkpoint is in the new safetensors format
+    if is_safetensors_checkpoint(dcp_ckpt_path):
+        print("Detected safetensors checkpoint format, using direct conversion...")
+        hf_ckpt = convert_safetensors_to_hf(
+            weights_path=dcp_ckpt_path,
+            hf_ckpt_path=args.hf_ckpt_path,
+            tokenizer_path=tokenizer_name_or_path,
+            hf_overrides=hf_overrides if hf_overrides else None,
+        )
+    else:
+        print("Detected DCP checkpoint format, using DCP conversion...")
+        from nemo_rl.utils.native_checkpoint import convert_dcp_to_hf
+
+        hf_ckpt = convert_dcp_to_hf(
+            dcp_ckpt_path=dcp_ckpt_path,
+            hf_ckpt_path=args.hf_ckpt_path,
+            model_name_or_path=model_name_or_path,
+            tokenizer_name_or_path=tokenizer_name_or_path,
+            overwrite=True,
+            hf_overrides=hf_overrides,
+        )
     print(f"Saved HF checkpoint to: {hf_ckpt}")
 
 

From 0ee9c7763ebd6cdd3a18c0a92cb0fc1748529a71 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 18:20:26 -0800
Subject: [PATCH 10/17] Adjust test for warmup

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 tests/gpu-tests/test_train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/gpu-tests/test_train.py b/tests/gpu-tests/test_train.py
index fa45c1c113..1a0b5efdad 100644
--- a/tests/gpu-tests/test_train.py
+++ b/tests/gpu-tests/test_train.py
@@ -147,10 +147,11 @@ def test_grpo_nemo_rl(backend):
     grpo_nemo_rl(
         ctx=wrap_arguments(
             "++data.prompt.prompt_config=qwen/math-cot "
-            "++grpo.max_num_steps=5 "
+            "++grpo.lr_warmup_steps=2 "
             "++grpo.num_prompts_per_step=2 "
             "++policy.max_total_sequence_length=256 "
             "++policy.dtensor_cfg.tensor_parallel_size=1 "
+            "++policy.megatron_cfg.scheduler.lr_warmup_iters=2 "
             "++checkpointing.save_period=2 "
             "++policy.train_global_batch_size=2 "
             "++policy.train_micro_batch_size=1 "

From 6dcbd91678fb799be89f8c5e0f6f1786360cb133 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 20:58:47 -0800
Subject: [PATCH 11/17] Switch to a proper conversion script

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 .../training/nemo_rl/convert_dcp_to_hf.py     |  85 +++-------
 .../nemo_rl/offline_hf_consolidation.py       | 146 ++++++++++++++++++
 2 files changed, 171 insertions(+), 60 deletions(-)
 create mode 100644 nemo_skills/training/nemo_rl/offline_hf_consolidation.py

diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
index da28b51cf2..b2522e22ac 100644
--- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
+++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
@@ -16,10 +16,11 @@
 # and added logic to figure out max step automatically
 
 import argparse
-import glob
+import json
 import os
 import re
-import shutil
+import subprocess
+import sys
 
 import yaml
 
@@ -87,68 +88,32 @@ def is_safetensors_checkpoint(weights_path):
     return os.path.isdir(hf_metadata_path)
 
 
-def convert_safetensors_to_hf(weights_path, hf_ckpt_path, tokenizer_path, hf_overrides=None):
-    """Convert safetensors checkpoint to HF format by reorganizing files."""
+def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrides=None):
+    """Convert safetensors checkpoint to HF format using offline_hf_consolidation.py."""
     model_dir = os.path.join(weights_path, "model")
-    hf_metadata_dir = os.path.join(model_dir, ".hf_metadata")
 
-    os.makedirs(hf_ckpt_path, exist_ok=True)
-
-    # Copy config.json from .hf_metadata
-    config_src = os.path.join(hf_metadata_dir, "config.json")
-    if os.path.exists(config_src):
-        shutil.copy2(config_src, os.path.join(hf_ckpt_path, "config.json"))
-
-    # Copy generation_config.json if exists
-    gen_config_src = os.path.join(hf_metadata_dir, "generation_config.json")
-    if os.path.exists(gen_config_src):
-        shutil.copy2(gen_config_src, os.path.join(hf_ckpt_path, "generation_config.json"))
-
-    # Find and copy safetensors files
-    safetensors_files = glob.glob(os.path.join(model_dir, "*.safetensors"))
-    if len(safetensors_files) == 1:
-        # Single shard - rename to model.safetensors
-        shutil.copy2(safetensors_files[0], os.path.join(hf_ckpt_path, "model.safetensors"))
-    else:
-        # Multiple shards - copy with standard naming and create index
-        import json
-
-        weight_map = {}
-        for i, src_file in enumerate(sorted(safetensors_files), 1):
-            dst_name = f"model-{i:05d}-of-{len(safetensors_files):05d}.safetensors"
-            shutil.copy2(src_file, os.path.join(hf_ckpt_path, dst_name))
-
-            # Read keys from safetensors file to build weight_map
-            from safetensors import safe_open
-
-            with safe_open(src_file, framework="pt") as f:
-                for key in f.keys():
-                    weight_map[key] = dst_name
-
-        # Write index file
-        index = {"metadata": {}, "weight_map": weight_map}
-        with open(os.path.join(hf_ckpt_path, "model.safetensors.index.json"), "w") as f:
-            json.dump(index, f, indent=2)
-
-    # Copy tokenizer files from the original model
-    tokenizer_files = [
-        "tokenizer.json",
-        "tokenizer_config.json",
-        "special_tokens_map.json",
-        "vocab.json",
-        "merges.txt",
-        "added_tokens.json",
-        "chat_template.jinja",
+    # Get the path to the consolidation script (same directory as this script)
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    consolidation_script = os.path.join(script_dir, "offline_hf_consolidation.py")
+
+    # Run the consolidation script
+    # Reference: https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/offline_hf_consolidation.py
+    cmd = [
+        sys.executable,
+        consolidation_script,
+        "--model-name",
+        model_name,
+        "--input-dir",
+        model_dir,
+        "--output-dir",
+        hf_ckpt_path,
     ]
-    for fname in tokenizer_files:
-        src = os.path.join(tokenizer_path, fname)
-        if os.path.exists(src):
-            shutil.copy2(src, os.path.join(hf_ckpt_path, fname))
+
+    print(f"Running consolidation: {' '.join(cmd)}")
+    subprocess.run(cmd, check=True)
 
     # Apply hf_overrides to config.json if provided
     if hf_overrides:
-        import json
-
         config_path = os.path.join(hf_ckpt_path, "config.json")
         with open(config_path, "r") as f:
             config = json.load(f)
@@ -203,11 +168,11 @@ def main():
 
     # Check if checkpoint is in the new safetensors format
     if is_safetensors_checkpoint(dcp_ckpt_path):
-        print("Detected safetensors checkpoint format, using direct conversion...")
+        print("Detected safetensors checkpoint format, using offline consolidation...")
         hf_ckpt = convert_safetensors_to_hf(
             weights_path=dcp_ckpt_path,
             hf_ckpt_path=args.hf_ckpt_path,
-            tokenizer_path=tokenizer_name_or_path,
+            model_name=model_name_or_path,
             hf_overrides=hf_overrides if hf_overrides else None,
         )
     else:
diff --git a/nemo_skills/training/nemo_rl/offline_hf_consolidation.py b/nemo_skills/training/nemo_rl/offline_hf_consolidation.py
new file mode 100644
index 0000000000..5fc1330564
--- /dev/null
+++ b/nemo_skills/training/nemo_rl/offline_hf_consolidation.py
@@ -0,0 +1,146 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script can be used to consolidate sharded HF safetensors checkpoints
+# to the consolidated format.
+
+# Example model directory structure:
+# model/
+# ├── shard-00001-model-00001-of-00001.safetensors
+# └── shard-00002-model-00001-of-00001.safetensors
+#  ...
+
+# This script works on both single and multiple workers:
+# Example usage on 2 GPUs:
+# torchrun --nproc-per-node=2 tools/offline_hf_consolidation.py --model-name meta-llama/Llama-3.2-1B --input-dir checkpoints/epoch_0_step_19/model/ --output-dir checkpoints/epoch_0_step_19/model/consolidated/
+#
+# Example usage on 1 GPU:
+# python tools/offline_hf_consolidation.py --model-name meta-llama/Llama-3.2-1B --input-dir checkpoints/epoch_0_step_19/model/ --output-dir checkpoints/epoch_0_step_19/model/consolidated/
+
+# copied from https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/offline_hf_consolidation.py
+
+import argparse
+import json
+import os
+import shutil
+
+import torch
+import torch.distributed as dist
+from nemo_automodel.components.checkpoint._backports.consolidate_hf_safetensors import (
+    consolidate_safetensors_files_on_every_rank,
+)
+from nemo_automodel.components.distributed.init_utils import (
+    get_rank_safe,
+    get_world_size_safe,
+    initialize_distributed,
+)
+
+
+def copy_metadata_files(input_dir, output_dir):
+    """
+    Move every entry of input_dir except the FQN mapping into output_dir, then delete input_dir.
+    """
+    for item_name in os.listdir(input_dir):
+        if item_name == "fqn_to_file_index_mapping.json":
+            continue  # this is saved by the consolidation step
+        src_path = os.path.join(input_dir, item_name)
+        dst_path = os.path.join(output_dir, item_name)
+        shutil.move(src_path, dst_path)  # NOTE(review): moves rather than copies, despite the function name
+    shutil.rmtree(input_dir, ignore_errors=True)  # removes whatever is left, including the skipped mapping file
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse CLI args: required --model-name/--input-dir/--output-dir, optional --num-threads and --backend."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Consolidate sharded HF safetensors checkpoints into consolidated files, "
+            "preserving original sharding layout where possible."
+        )
+    )
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        required=True,  # NOTE(review): parsed but not read anywhere in this script's main()
+        help=(
+            "Hugging Face repo id (e.g. meta-llama/Llama-3.2-1B) or absolute path to a HF snapshot directory. "
+            "Used as reference to copy metadata and derive FQN->file index mapping."
+        ),
+    )
+    parser.add_argument(
+        "--input-dir",
+        "-i",
+        required=True,
+        help="Directory containing sharded safetensors files to consolidate.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        "-o",
+        required=True,
+        help="Directory where consolidated safetensors and metadata will be written.",
+    )
+    parser.add_argument(
+        "--num-threads",
+        type=int,
+        default=5,
+        help="Number of threads for writing consolidated data (default: 5).",
+    )
+    parser.add_argument(
+        "--backend",
+        choices=["auto", "nccl", "gloo"],
+        default="auto",
+        help="Distributed backend to initialize (default: auto).",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Consolidate the sharded safetensors in --input-dir into --output-dir, then move the HF metadata over."""
+    args = parse_args()
+    backend = args.backend
+    if backend == "auto":
+        backend = "nccl" if torch.cuda.device_count() > 0 else "gloo"
+    initialize_distributed(backend)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    if not os.path.exists(args.input_dir):
+        raise FileNotFoundError("Could not locate the input directory. Pass an absolute path to the input directory.")
+
+    hf_metadata_dir = os.path.join(args.input_dir, ".hf_metadata")
+    # isdir() is False for a missing path and for a regular file, so this single check rejects both.
+    if not os.path.isdir(hf_metadata_dir):
+        raise FileNotFoundError("Expected to find the .hf_metadata directory in the input directory.")
+
+    with open(os.path.join(hf_metadata_dir, "fqn_to_file_index_mapping.json"), "r") as f:
+        fqn_to_index_mapping = json.load(f)
+
+    consolidate_safetensors_files_on_every_rank(
+        args.input_dir,
+        args.output_dir,
+        fqn_to_index_mapping,
+        num_threads=args.num_threads,
+    )
+    # Wait for all ranks to finish consolidating before rank 0 relocates the metadata directory.
+    if get_world_size_safe() > 1:
+        dist.barrier()
+
+    if get_rank_safe() == 0:
+        copy_metadata_files(hf_metadata_dir, args.output_dir)
+    # Keep the remaining ranks waiting until rank 0 has finished moving the metadata.
+    if get_world_size_safe() > 1:
+        dist.barrier()
+
+
+if __name__ == "__main__":
+    main()

From 4f4881d999d9b0aa1a32200f46a66c25e4017e62 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 21:15:02 -0800
Subject: [PATCH 12/17] Remove unused parameter

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 tests/gpu-tests/test_train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/gpu-tests/test_train.py b/tests/gpu-tests/test_train.py
index 1a0b5efdad..845bc11eff 100644
--- a/tests/gpu-tests/test_train.py
+++ b/tests/gpu-tests/test_train.py
@@ -147,7 +147,6 @@ def test_grpo_nemo_rl(backend):
     grpo_nemo_rl(
         ctx=wrap_arguments(
             "++data.prompt.prompt_config=qwen/math-cot "
-            "++grpo.lr_warmup_steps=2 "
             "++grpo.num_prompts_per_step=2 "
             "++policy.max_total_sequence_length=256 "
             "++policy.dtensor_cfg.tensor_parallel_size=1 "

From 265228eb84d068ae3cb9da2b32fef69cbdd28266 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 21:37:29 -0800
Subject: [PATCH 13/17] Fix for import

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 nemo_skills/training/nemo_rl/convert_dcp_to_hf.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
index b2522e22ac..44b2996947 100644
--- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
+++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
@@ -20,7 +20,6 @@
 import os
 import re
 import subprocess
-import sys
 
 import yaml
 
@@ -96,10 +95,15 @@ def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrid
     script_dir = os.path.dirname(os.path.abspath(__file__))
     consolidation_script = os.path.join(script_dir, "offline_hf_consolidation.py")
 
-    # Run the consolidation script
+    # Run the consolidation script using uv with the automodel extra to get nemo_automodel
     # Reference: https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/offline_hf_consolidation.py
     cmd = [
-        sys.executable,
+        "uv",
+        "run",
+        "--active",
+        "--extra",
+        "automodel",
+        "python",
         consolidation_script,
         "--model-name",
         model_name,

From 4192d30389f749e60481182ba9560f0b7c4c7785 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 22:06:53 -0800
Subject: [PATCH 14/17] Add extra automodel

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 nemo_skills/pipeline/nemo_rl/grpo.py | 2 +-
 nemo_skills/pipeline/nemo_rl/sft.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_skills/pipeline/nemo_rl/grpo.py b/nemo_skills/pipeline/nemo_rl/grpo.py
index 1cbd26c888..eeeff72474 100644
--- a/nemo_skills/pipeline/nemo_rl/grpo.py
+++ b/nemo_skills/pipeline/nemo_rl/grpo.py
@@ -192,7 +192,7 @@ def get_training_cmd(
 def get_checkpoint_convert_cmd(output_dir, final_hf_path, step, backend, max_position_embeddings=None):
     cmd = "export PYTHONPATH=$PYTHONPATH:/nemo_run/code && export UV_PROJECT=/opt/NeMo-RL && cd /nemo_run/code && "
     if backend == "fsdp":
-        cmd += "uv run --active python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf "
+        cmd += "uv run --extra automodel python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf "
     elif backend == "megatron":
         cmd += "uv run --extra mcore python -m nemo_skills.training.nemo_rl.convert_megatron_to_hf "
     else:
diff --git a/nemo_skills/pipeline/nemo_rl/sft.py b/nemo_skills/pipeline/nemo_rl/sft.py
index 87d12fbc8e..69a3e98408 100644
--- a/nemo_skills/pipeline/nemo_rl/sft.py
+++ b/nemo_skills/pipeline/nemo_rl/sft.py
@@ -174,7 +174,7 @@ def get_training_cmd(
 def get_checkpoint_convert_cmd(output_dir, final_hf_path, step, backend, max_position_embeddings=None):
     cmd = "export PYTHONPATH=$PYTHONPATH:/nemo_run/code && export UV_PROJECT=/opt/NeMo-RL && cd /nemo_run/code && "
     if backend == "fsdp":
-        cmd += "uv run --active python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf "
+        cmd += "uv run --extra automodel python -m nemo_skills.training.nemo_rl.convert_dcp_to_hf "
     elif backend == "megatron":
         cmd += "uv run --extra mcore python -m nemo_skills.training.nemo_rl.convert_megatron_to_hf "
     else:

From 9804714300d9e0c0242b24254b8c3be9664d1ff0 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Tue, 27 Jan 2026 22:32:02 -0800
Subject: [PATCH 15/17] Add copy for tokenizer files

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 .../training/nemo_rl/convert_dcp_to_hf.py     | 37 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
index 44b2996947..77cc21c6fe 100644
--- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
+++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
@@ -19,6 +19,7 @@
 import json
 import os
 import re
+import shutil
 import subprocess
 
 import yaml
@@ -87,7 +88,37 @@ def is_safetensors_checkpoint(weights_path):
     return os.path.isdir(hf_metadata_path)
 
 
-def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrides=None):
+def copy_tokenizer_files(model_name, hf_ckpt_path):
+    """Download and copy tokenizer files from HuggingFace to the HF checkpoint directory.
+
+    Args:
+        model_name: HuggingFace model name to download tokenizer from
+        hf_ckpt_path: Path to the HF checkpoint directory
+    """
+    from huggingface_hub import hf_hub_download, list_repo_files
+
+    # Common tokenizer files that need to be copied
+    tokenizer_files = [
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "special_tokens_map.json",
+        "vocab.json",
+        "merges.txt",
+        "tokenizer.model",  # For SentencePiece-based tokenizers
+        "added_tokens.json",
+    ]
+
+    print(f"Downloading tokenizer files from {model_name}...")
+    repo_files = list_repo_files(model_name)
+    for filename in tokenizer_files:
+        if filename in repo_files:
+            downloaded_path = hf_hub_download(model_name, filename)
+            dst_path = os.path.join(hf_ckpt_path, filename)
+            shutil.copy2(downloaded_path, dst_path)
+            print(f"Copied {filename}")
+
+
+def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_name, hf_overrides=None):
     """Convert safetensors checkpoint to HF format using offline_hf_consolidation.py."""
     model_dir = os.path.join(weights_path, "model")
 
@@ -116,6 +147,9 @@ def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, hf_overrid
     print(f"Running consolidation: {' '.join(cmd)}")
     subprocess.run(cmd, check=True)
 
+    # Copy tokenizer files (not handled by offline consolidation)
+    copy_tokenizer_files(tokenizer_name, hf_ckpt_path)
+
     # Apply hf_overrides to config.json if provided
     if hf_overrides:
         config_path = os.path.join(hf_ckpt_path, "config.json")
@@ -177,6 +211,7 @@ def main():
             weights_path=dcp_ckpt_path,
             hf_ckpt_path=args.hf_ckpt_path,
             model_name=model_name_or_path,
+            tokenizer_name=tokenizer_name_or_path,
             hf_overrides=hf_overrides if hf_overrides else None,
         )
     else:

From 3f66a6fba571ef630fd6578590fae6de6f5b58f1 Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Wed, 28 Jan 2026 21:57:27 -0800
Subject: [PATCH 16/17] Fix tokenizer files logic

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 .../training/nemo_rl/convert_dcp_to_hf.py     | 33 ++++++++-----------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
index 77cc21c6fe..3405bf2a39 100644
--- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
+++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
@@ -88,37 +88,30 @@ def is_safetensors_checkpoint(weights_path):
     return os.path.isdir(hf_metadata_path)
 
 
-def copy_tokenizer_files(model_name, hf_ckpt_path):
-    """Download and copy tokenizer files from HuggingFace to the HF checkpoint directory.
+def copy_tokenizer_files(tokenizer_path, hf_ckpt_path):
+    """Copy tokenizer files from the original model to the HF checkpoint directory.
 
     Args:
-        model_name: HuggingFace model name to download tokenizer from
+        tokenizer_path: Path to directory containing tokenizer files
         hf_ckpt_path: Path to the HF checkpoint directory
     """
-    from huggingface_hub import hf_hub_download, list_repo_files
-
-    # Common tokenizer files that need to be copied
     tokenizer_files = [
         "tokenizer.json",
         "tokenizer_config.json",
         "special_tokens_map.json",
         "vocab.json",
         "merges.txt",
-        "tokenizer.model",  # For SentencePiece-based tokenizers
         "added_tokens.json",
+        "chat_template.jinja",
     ]
-
-    print(f"Downloading tokenizer files from {model_name}...")
-    repo_files = list_repo_files(model_name)
-    for filename in tokenizer_files:
-        if filename in repo_files:
-            downloaded_path = hf_hub_download(model_name, filename)
-            dst_path = os.path.join(hf_ckpt_path, filename)
-            shutil.copy2(downloaded_path, dst_path)
-            print(f"Copied {filename}")
+    for fname in tokenizer_files:
+        src = os.path.join(tokenizer_path, fname)
+        if os.path.exists(src):
+            shutil.copy2(src, os.path.join(hf_ckpt_path, fname))
+            print(f"Copied {fname}")
 
 
-def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_name, hf_overrides=None):
+def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_path, hf_overrides=None):
     """Convert safetensors checkpoint to HF format using offline_hf_consolidation.py."""
     model_dir = os.path.join(weights_path, "model")
 
@@ -148,7 +141,9 @@ def convert_safetensors_to_hf(weights_path, hf_ckpt_path, model_name, tokenizer_
     subprocess.run(cmd, check=True)
 
     # Copy tokenizer files (not handled by offline consolidation)
-    copy_tokenizer_files(tokenizer_name, hf_ckpt_path)
+    # TODO: this silently copies nothing if config["policy"]["model_name"] isn't a local path, but that's not
+    # common and we should remove this logic anyway once it's properly handled in nemo-rl
+    copy_tokenizer_files(tokenizer_path, hf_ckpt_path)
 
     # Apply hf_overrides to config.json if provided
     if hf_overrides:
@@ -211,7 +206,7 @@ def main():
             weights_path=dcp_ckpt_path,
             hf_ckpt_path=args.hf_ckpt_path,
             model_name=model_name_or_path,
-            tokenizer_name=tokenizer_name_or_path,
+            tokenizer_path=tokenizer_name_or_path,
             hf_overrides=hf_overrides if hf_overrides else None,
         )
     else:

From a9a80f5876dd6b86ee629c4b1c2b18ba7bf60d5b Mon Sep 17 00:00:00 2001
From: Igor Gitman <igitman@nvidia.com>
Date: Thu, 29 Jan 2026 10:37:01 -0800
Subject: [PATCH 17/17] Use 10 samples for bfcl in gpu ci

Signed-off-by: Igor Gitman <igitman@nvidia.com>
---
 tests/gpu-tests/test_eval.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/gpu-tests/test_eval.py b/tests/gpu-tests/test_eval.py
index 1988dd961e..15e5789f2a 100644
--- a/tests/gpu-tests/test_eval.py
+++ b/tests/gpu-tests/test_eval.py
@@ -179,7 +179,8 @@ def test_aaa_prepare_and_eval_all_datasets():
     # It also needs a special eval arg
     # TODO: after summarize results works natively with eval groups, we can merge these
     # TODO: enable bfcl_v4 after figuring out why it's broken in this setup
-    bfcl_eval_args = "++eval_config.partial_eval=true ++model_name=Qwen/Qwen3-1.7B-FC"
+    # setting 10 samples as bfcl is brittle when using only 2
+    bfcl_eval_args = "++eval_config.partial_eval=true ++model_name=Qwen/Qwen3-1.7B-FC ++max_samples=10"
     eval(
         ctx=wrap_arguments(f"{common_ctx} {bfcl_eval_args}"),
         output_dir=output_dir,