diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2673715981..b08f0148b1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -59,3 +59,5 @@ # Codeowners /.github/CODEOWNERS @nvidia-nemo/rl_maintainers + +/research/template_project @terrykong \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index bd327db27d..1d6a0d6974 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -76,7 +76,9 @@ ENV TORCH_CUDA_ARCH_LIST="9.0 10.0" # First copy only the dependency files COPY --from=nemo-rl pyproject.toml uv.lock ./ +COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/ COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh +COPY --from=nemo-rl --link research/ ./research/ COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ RUN <<"EOF" bash -exu diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py index c28befb541..879c0f22c4 100644 --- a/nemo_rl/distributed/virtual_cluster.py +++ b/nemo_rl/distributed/virtual_cluster.py @@ -44,19 +44,19 @@ class PY_EXECUTABLES: SYSTEM = sys.executable # Use NeMo-RL direct dependencies. - BASE = "uv run --locked" + BASE = f"uv run --locked --directory {git_root}" # Use NeMo-RL direct dependencies and vllm. - VLLM = "uv run --locked --extra vllm" + VLLM = f"uv run --locked --extra vllm --directory {git_root}" # Use NeMo-RL direct dependencies and nemo-automodel. - AUTOMODEL = "uv run --locked --extra automodel" + AUTOMODEL = f"uv run --locked --extra automodel --directory {git_root}" # Use NeMo-RL direct dependencies and Megatron. 
- MCORE = "uv run --locked --extra mcore" + MCORE = f"uv run --locked --extra mcore --directory {git_root}" # Use Penguin dependencies - PENGUIN = "uv run --locked --extra penguin" + PENGUIN = f"uv run --locked --extra penguin --directory {git_root}" @ray.remote # pragma: no cover diff --git a/nemo_rl/utils/venvs.py b/nemo_rl/utils/venvs.py index 2d9c8018e2..c5511473ea 100644 --- a/nemo_rl/utils/venvs.py +++ b/nemo_rl/utils/venvs.py @@ -95,7 +95,7 @@ def create_local_venv( exec_cmd.extend(["echo", f"Finished creating venv {venv_path}"]) # Always run uv sync first to ensure the build requirements are set (for --no-build-isolation packages) - subprocess.run(["uv", "sync"], env=env, check=True) + subprocess.run(["uv", "sync", "--directory", git_root], env=env, check=True) subprocess.run(exec_cmd, env=env, check=True) # Return the path to the python executable in the virtual environment diff --git a/pyproject.toml b/pyproject.toml index e64a6441f6..73eb392ba5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -173,6 +173,12 @@ members = [ "3rdparty/Automodel-workspace/Automodel", "3rdparty/Megatron-Bridge-workspace", "3rdparty/Penguin-workspace", + # Research projects are also added here in order for them to share the global root level uv.lock. + # If we don't do this, the research projects do not see the global uv.lock, and may mistakenly + # install numpy>=2.0 because nemo-rl's core [dependencies] do not pin numpy, but when you inspect + # nemo-rl's uv.lock you'll see it's 1.X b/c megatron mandates 1.X in the optional dependencies, so + # globally we must choose 1.X otherwise we run into pickle issues from ray. + "research/template_project", ] [[tool.uv.index]] diff --git a/research/README.md b/research/README.md new file mode 100644 index 0000000000..956edf3937 --- /dev/null +++ b/research/README.md @@ -0,0 +1,50 @@ +# Research and Community Projects + +This directory contains research experiments and community-contributed projects built on NeMo RL. 
Each project is self-contained and demonstrates different techniques and applications. + +## Getting Started + +To create a new research project, start with the template: + +```bash +cp -r research/template_project research/my_new_project +``` + +The template includes: +- A minimal train-and-generate loop example +- Complete test suite structure (unit, functional, and test suites) +- Configuration examples +- Documentation template + +## Expectations for Research Project Authors + +> [!NOTE] +> This section is for research and community project authors contributing to the repository. + +### Acceptance Criteria + +The acceptance criteria for merging your research project into the main repository are reproduction steps for the results outlined in this README. We want to make sure others can reproduce your great work! Please include sufficient documentation in the README.md that enables users to follow and reproduce your results step-by-step. + +> [!NOTE] +> We strongly encourage you to consider contributing universally applicable features directly to the core `nemo_rl` package. Your work can help improve NeMo RL for everyone! However, if your innovation introduces complexity that doesn't align with the core library's design principles, the research folder is exactly the right place for it. This directory exists specifically to showcase novel ideas and experimental approaches that may not fit neatly into "core". + +### Code Reviews and Ownership + +Code reviews for research projects will always involve the original authors. Please add your name to the `.github/CODEOWNERS` file to be alerted when any changes touch your project. The NeMo RL core team reserves the right to merge PRs that touch your project if the original author does not respond in a timely manner. This allows the core team to move quickly to resolve issues. + +### Testing + +Authors are encouraged to write tests for their research projects. This template demonstrates three types of tests: +1. 
**Unit tests** - Fast, isolated component tests +2. **Functional tests** - End-to-end tests with minimal configurations +3. **Test suites** (nightlies) - Longer-running comprehensive validation tests + +All of these will be included in our automation. When changes occur in nemo-rl "core", the expectation is that it should not break tests that are written. + +In the event that we cannot resolve test breakage and the authors are unresponsive, we reserve the right to disable the tests to ensure a high fidelity test signal. An example of this would be if we are deprecating a backend and the research project has not migrated to its replacement. + +It should be noted that because we use `uv`, even if we must disable tests because the project will not work top-of-tree anymore, a user can always go back to the last working commit and run the research project with nemo-rl since the `uv.lock` represents the last known working state. Users can also build the Dockerfile at that commit to ensure a fully reproducible environment. + +## Projects + +- **[template_project](template_project/)** - A starting point for new research projects with example code and test structure diff --git a/research/template_project/.python-version b/research/template_project/.python-version new file mode 100644 index 0000000000..e4fba21835 --- /dev/null +++ b/research/template_project/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/research/template_project/README.md b/research/template_project/README.md new file mode 100644 index 0000000000..3d44a4b717 --- /dev/null +++ b/research/template_project/README.md @@ -0,0 +1,126 @@ +# Template Project: A Starting Point + +This is a template project for research experiments with NeMo RL. + +> [!IMPORTANT] +> This is a template! To start a new research project, copy this directory to a new location: +> ```bash +> cp -r research/template_project research/my_new_project +> ``` +> Then add your code and tests! 
Note that this project includes `nemo-rl` as a core dependency. + +## What This Shows + +The `single_update.py` script demonstrates a minimal train-and-generate loop: +1. Sets up a Ray compute cluster +2. Initializes vLLM generation and an LM policy +3. Trains the policy on a small batch using NLL loss +4. Refits the generation engine with the updated policy weights +5. Generates outputs with the new policy +6. Repeats the loop (10 iterations by default) + +This shows the basic cycle of training a language model and using it for generation. + +## Running the Example + +To run the `single_update.py` script: + +```bash +uv run single_update.py +``` + +## Testing + +This project includes a comprehensive test suite following NeMo RL's testing patterns. + +### Unit Tests + +Unit tests validate individual components and functions. + +```bash +# Run all unit tests +uv run --group test pytest tests/unit/ +``` + +### Functional Tests + +Functional tests run end-to-end scenarios with minimal configurations. These tests require GPU access. + +> [!IMPORTANT] +> Functional tests require at least 1 GPU to run. + +```bash +# Run the single_update functional test (runs for 1 step) +uv run bash tests/functional/single_update.sh +``` + +### Test Suites + +Test suites are longer-running comprehensive tests designed for validation on multiple steps. + +> [!IMPORTANT] +> Test suites require 8 GPUs and may take several minutes to complete. 
+ +```bash +# Run the single_update test suite locally (runs for 10 steps on 1 node with 8 GPUs) +bash tests/test_suites/llm/single_update_1n8g.sh + +# Launch on SLURM with code snapshots +# For full documentation on tools/launch, see: +# https://github.com/NVIDIA-NeMo/RL/blob/main/tests/test_suites/README.md#launching-with-code-snapshots +bash ../../tools/launch tests/test_suites/llm/single_update_1n8g.sh + +# Dry run to estimate GPU hours needed +DRYRUN=1 bash ../../tools/launch tests/test_suites/llm/single_update_1n8g.sh +``` + +> [!TIP] +> The `tools/launch` script creates code snapshots and launches SLURM jobs for reproducible experiments. It automatically extracts the configuration from your test suite script and submits the appropriate number of jobs. + +The test suite structure mirrors nemo-rl's test organization: +- `tests/unit/` - Fast, isolated unit tests +- `tests/functional/` - End-to-end tests with minimal configurations +- `tests/test_suites/llm/` - Comprehensive multi-step validation tests +- `configs/recipes/llm/` - Configuration files for test suites (using defaults to inherit from base configs) + +## Updating Dependencies + +If you update the dependencies of this research project, run the following command to update the global `uv.lock` file and freeze the working set of dependencies: + +```bash +uv lock +``` + +This command will: +- Resolve all dependencies +- Update `uv.lock` with the latest compatible versions +- Ensure dependency consistency across environments + +## Python Version + +> [!NOTE] +> This project uses Python 3.12 as specified in `.python-version`. +> This Python version should always be kept in sync with the `.python-version` file at the root of the `nemo-rl` repository to ensure compatibility. 
+ + +## Citation + +If you use this research project or have questions, please contact: + +``` +Author: AUTHOR NAMES HERE +Email: AUTHOR EMAILS HERE +Organization: ORGANIZATION HERE (optional) +``` + +If you use this research project, please cite it using the following BibTeX entry: + +```bibtex +@misc{template-project, +title = {Template Project: A Starting Point}, +author = {AUTHOR NAMES HERE}, +howpublished = {\url{https://github.com/NVIDIA-NeMo/RL/tree/main/research/template_project}}, +year = {2025}, +note = {Research project based on NeMo RL}, +} +``` diff --git a/research/template_project/configs/grpo_math_1B.yaml b/research/template_project/configs/grpo_math_1B.yaml new file mode 100644 index 0000000000..664b953a2f --- /dev/null +++ b/research/template_project/configs/grpo_math_1B.yaml @@ -0,0 +1,255 @@ +# GRPO Algorithm Configuration +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question) + max_num_epochs: 1 + max_num_steps: 1000000 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + overlong_filtering: false + max_val_samples: 256 + val_batch_size: 256 + seed: 42 + async_grpo: + enabled: false # Set to true to enable async training mode + # Max age (in training steps) for trajectories used in training + max_trajectory_age_steps: 1 + +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 + ratio_clip_c: null + # (default off) loss formulation improvements (docs/guides/grpo.md#loss) + use_on_policy_kl_approximation: false + # Async GRPO requires importance sampling correction enabled + # Set to true when async_grpo.enabled is true + use_importance_sampling_correction: false + sequence_level_importance_ratios: false + token_level_loss: true + +checkpointing: + enabled: true + checkpoint_dir: "results/grpo" + metric_name: "val_reward" + higher_is_better: 
true + keep_top_k: 3 + save_period: 10 + checkpoint_must_save_by: null + model_save_format: "safetensors" + save_consolidated: false + +policy: + model_name: "Qwen/Qwen2.5-1.5B" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 # Only used when generating using HF backend + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: "bfloat16" + logprob_chunk_size: null + offload_optimizer_for_logprob: false # Only useful for non-colocated generation since colocated generation will always offload optimizer to cuda before refit + + dtensor_cfg: + _v2: true + enabled: true + cpu_offload: False + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + context_parallel_size: 1 + custom_parallel_plan: null + + megatron_cfg: + enabled: false + empty_unused_memory_level: 0 + activation_checkpointing: false + converter_type: "Qwen2ForCausalLM" + tensor_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + sequence_parallel: false + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + defer_fp32_logits: null + + optimizer: + optimizer: "adam" + lr: 5.0e-6 + min_lr: 5.0e-7 + weight_decay: 0.01 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + 
use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: 1000 + lr_warmup_iters: 13 + lr_warmup_init: 5.0e-7 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + fp8_cfg: null + + env_vars: null + + # See docs/design-docs/sequence-packing-and-dynamic-batching.md + # for more details on dynamic batching and sequence packing. + dynamic_batching: + enabled: False + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + sequence_length_round: 64 + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} + max_grad_norm: 1.0 + + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: 5.0e-6 + weight_decay: 0.01 + betas: [0.9, 0.999] + eps: 1e-8 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False + + scheduler: + - name: "torch.optim.lr_scheduler.LinearLR" + kwargs: + start_factor: 0.1 + end_factor: 1.0 + total_iters: 50 + - name: "torch.optim.lr_scheduler.ConstantLR" + kwargs: + 
factor: 1.0 + total_iters: 10000000000 + - milestones: [50] + + generation: + backend: "vllm" + max_new_tokens: ${policy.max_total_sequence_length} + temperature: 1.0 + top_p: 1.0 + top_k: null + stop_token_ids: null + stop_strings: null + vllm_cfg: + async_engine: false + precision: ${policy.precision} + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP + gpu_memory_utilization: 0.6 + max_model_len: ${policy.max_total_sequence_length} + # when enforce_eager is False, it is optional to set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy, + # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile + # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998 + enforce_eager: False + use_deep_gemm: False + num_last_layers_in_bf16: 0 + num_first_layers_in_bf16: 0 + vllm_kwargs: {} + colocated: + # true: generation shares training GPUs + # false: uses dedicated generation resources + enabled: true + # only relevant when enabled is false + resources: + gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 + num_nodes: null # Decides number of nodes to be dedicated to generation + +data: + max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + shuffle: true + + dataset_name: "OpenMathInstruct-2" + # You can use custom response datasets for training and validation. 
For example: + # data: + # dataset_name: ResponseDataset + # train_data_path: # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace) + # val_data_path: + # input_key: , default is "input" + # output_key: , default is "output" + # train_split: , default is None # used for HuggingFace datasets + # val_split: , default is None # used for HuggingFace datasets + # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/grpo.md#datasets for more details. + +env: + math: + num_workers: 8 + +logger: + log_dir: "logs" # Base directory for all logs + num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + wandb_enabled: false + tensorboard_enabled: false + mlflow_enabled: false # Disable MLflow logging + swanlab_enabled: false # Disable SwanLab logging + monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "grpo-dev" + name: "grpo-dev-logger" + tensorboard: {} + mlflow: + experiment_name: "grpo-dev" + run_name: "grpo-dev-logger" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 1 + num_nodes: 1 diff --git a/research/template_project/configs/recipes/llm/single_update_1n8g.yaml b/research/template_project/configs/recipes/llm/single_update_1n8g.yaml new file mode 100644 index 0000000000..f0bcbb5bff --- /dev/null +++ b/research/template_project/configs/recipes/llm/single_update_1n8g.yaml @@ -0,0 +1,6 @@ +defaults: + - ../../grpo_math_1B.yaml + +cluster: + gpus_per_node: 8 + diff --git a/research/template_project/pyproject.toml b/research/template_project/pyproject.toml new file mode 100644 index 0000000000..cae8d33ea5 --- /dev/null +++ b/research/template_project/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = 
"template-project" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = ["nemo-rl"] + +[dependency-groups] +test = ["pytest>=7.0.0", "pytest-timeout", "pytest-cov"] + +[tool.uv.sources] +nemo-rl = { workspace = true } + +[tool.hatch.build.targets.wheel] +packages = ["template_project"] + +[tool.pytest.ini_options] +addopts = "--durations=15 -s -rA -x" +testpaths = ["tests"] +python_files = "test_*.py" diff --git a/research/template_project/single_update.py b/research/template_project/single_update.py new file mode 100644 index 0000000000..84bbeaee06 --- /dev/null +++ b/research/template_project/single_update.py @@ -0,0 +1,201 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Minimal single-update demonstration script. 
+ +What it does: + 1) Sets up a RayVirtualCluster + 2) Initializes VllmGeneration + 3) Initializes LM Policy + 4) Trains on a tiny synthetic batch (global batch size = 2) with NLLLoss + 5) Refits the generation engine with the latest policy weights + 6) Optionally repeats the train→refit cycle in a short loop + +Notes: +- The configuration is defined entirely in this file, inspired by examples/configs/grpo_math_1B.yaml +- Uses vLLM for generation and a small model for demonstration +- Uses simple NLL loss for brevity +""" + +import argparse +import os + +from omegaconf import OmegaConf +from template_project.data_utils import create_batch_from + +from nemo_rl.algorithms.grpo import MasterConfig, refit_policy_generation +from nemo_rl.algorithms.loss_functions import NLLLoss +from nemo_rl.algorithms.utils import get_tokenizer +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster, init_ray +from nemo_rl.models.generation import configure_generation_config +from nemo_rl.models.generation.vllm import VllmGeneration +from nemo_rl.models.policy.lm_policy import Policy +from nemo_rl.utils.config import load_config, parse_hydra_overrides + +OmegaConf.register_new_resolver("mul", lambda a, b: a * b) + + +def main(config: MasterConfig) -> None: + # 0) Config + policy_config = config["policy"] + tokenizer = get_tokenizer(policy_config["tokenizer"]) + policy_config["generation"] = configure_generation_config( + policy_config["generation"], tokenizer + ) + + # 1) Set up compute cluster (single GPU for demo) + print("\n▶ Setting up compute cluster...") + init_ray() + cluster = RayVirtualCluster( + name="single_update_cluster", + bundle_ct_per_node_list=[config["cluster"]["gpus_per_node"]] + * config["cluster"]["num_nodes"], + use_gpus=True, + num_gpus_per_node=config["cluster"]["gpus_per_node"], + max_colocated_worker_groups=1 + if policy_config["generation"]["backend"] == "megatron" + else 2, + ) + + 
# 2) Initialize vLLM generation first for a clean GPU environment + print("\n▶ Initializing vLLM generation...") + # Initialize vLLM directly from config + policy_config["generation"]["model_name"] = policy_config["model_name"] + policy_generation = VllmGeneration( + cluster=cluster, config=policy_config["generation"] + ) + # Pre-initialize workers to avoid contention later + policy_generation.finish_generation() + print(" ✓ vLLM generation ready") + + # 3) Initialize policy (LM) + print("\n▶ Initializing LM Policy...") + policy = Policy( + cluster=cluster, + config=policy_config, + tokenizer=tokenizer, + init_reference_model=False, + ) + print(" ✓ Policy created") + + # Prepare refit info once before first refit + state_dict_info = policy.prepare_refit_info() + policy_generation.prepare_refit_info(state_dict_info or {}) + + # 4) Create tiny numeric batch and train with NLLLoss + print("\n▶ Creating tiny numeric batch and training with NLLLoss...") + train_sentences = ["a b c d e hello", "a d f world"] * config["policy"][ + "train_global_batch_size" + ] + generation_prompts = [ + "Have you heard of NVIDIA?", + "What is calligraphy?", + "What is the capital of France?", + "What is the capital of the United States?", + "What is the capital of the United Kingdom?", + "What is the capital of the Philippines?", + "What is the capital of the China?", + "What is the capital of the Japan?", + "What is the capital of the Korea?", + "What is the capital of the India?", + "What is the capital of the Pakistan?", + "What is the capital of the Bangladesh?", + "What is the capital of the Nepal?", + ] + data = create_batch_from(tokenizer, sentences=train_sentences) + loss_fn = NLLLoss() + + # Optionally repeat the train→refit cycle + num_iters = int(os.environ.get("SINGLE_UPDATE_ITERS", "10")) + + for step in range(num_iters): + print(f"\n===== Iteration {step + 1}/{num_iters} =====") + # Generate before training using predefined prompts + gen_inputs = create_batch_from(tokenizer, 
sentences=generation_prompts) + gen_data = BatchedDataDict( + { + "input_ids": gen_inputs["input_ids"], + "input_lengths": gen_inputs["input_lengths"], + } + ) + + print(" • Refit generation with latest policy weights...") + refit_policy_generation( + policy=policy, + policy_generation=policy_generation, + colocated_inference=policy_config["generation"]["colocated"]["enabled"], + ) + print(" ✓ Refit complete") + + policy_generation.prepare_for_generation() + gen_outputs = policy_generation.generate( + gen_data, greedy=True + ) # greedy for demonstration + policy_generation.finish_generation() + decoded = tokenizer.batch_decode( + gen_outputs["output_ids"].tolist(), skip_special_tokens=True + ) + print( + " • Pre-train generations (first turn would be gibberish b/c vllm dummy weights; at around loss <0.3 you should see memorization):" + ) + for i, out_text in enumerate(decoded): + print(f" - prompt: '{generation_prompts[i]}' -> '{out_text}'") + policy.prepare_for_training() + results = policy.train(data, loss_fn) + loss_tensor = results["loss"] + print(f" • Training loss: {loss_tensor}") + + print("\nAll done.") + + policy.shutdown() + policy_generation.shutdown() + cluster.shutdown() + + +def parse_args() -> tuple[argparse.Namespace, list[str]]: + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Run GRPO training with configuration") + parser.add_argument( + "--config", type=str, default=None, help="Path to YAML config file" + ) + + # Parse known args for the script + args, overrides = parser.parse_known_args() + + return args, overrides + + +if __name__ == "__main__": + # Parse arguments + args, overrides = parse_args() + + if not args.config: + args.config = os.path.join( + os.path.dirname(__file__), "configs", "grpo_math_1B.yaml" + ) + + config = load_config(args.config) + print(f"Loaded configuration from: {args.config}") + + if overrides: + print(f"Overrides: {overrides}") + config = parse_hydra_overrides(config, overrides) + 
+ config: MasterConfig = OmegaConf.to_container(config, resolve=True) + print("Applied CLI overrides") + from rich.pretty import pprint + + pprint(config) + + main(config) diff --git a/research/template_project/template_project/__init__.py b/research/template_project/template_project/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/research/template_project/template_project/data_utils.py b/research/template_project/template_project/data_utils.py new file mode 100644 index 0000000000..8f76d58715 --- /dev/null +++ b/research/template_project/template_project/data_utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Data utilities for template project.""" + +import torch + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict + + +def create_batch_from(tokenizer, sentences: list[str]) -> BatchedDataDict: + """Create a tiny batch from raw sentences (no chat templates).""" + assert len(sentences) > 0, "sentences list must not be empty" + + enc = tokenizer( + sentences, + add_special_tokens=False, + return_tensors="pt", + padding=True, + ) + input_ids = enc["input_ids"] + attention_mask = enc["attention_mask"].to(torch.float32) + input_lengths = attention_mask.sum(dim=1).to(torch.int32) + sample_mask = torch.ones(input_ids.size(0), dtype=torch.float32) + + # For simple NLL training, use the attention mask as token_mask + # (loss will be applied to positions 1..len-1 via NLLLoss) + token_mask = torch.ones_like(input_ids) + + return BatchedDataDict( + { + "input_ids": input_ids, + "input_lengths": input_lengths, + "token_mask": token_mask, + "sample_mask": sample_mask, + } + ) diff --git a/research/template_project/tests/__init__.py b/research/template_project/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/research/template_project/tests/functional/single_update.sh b/research/template_project/tests/functional/single_update.sh new file mode 100755 index 0000000000..7dee942ad5 --- /dev/null +++ b/research/template_project/tests/functional/single_update.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+
+set -eou pipefail
+
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+RUN_LOG=$EXP_DIR/run.log
+export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
+
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
+
+cd $PROJECT_ROOT
+
+# Run single_update.py for just 1 iteration
+SINGLE_UPDATE_ITERS=1 uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \
+  single_update.py \
+  --config $PROJECT_ROOT/configs/grpo_math_1B.yaml \
+  cluster.gpus_per_node=1 \
+  cluster.num_nodes=1 \
+  policy.train_global_batch_size=1 \
+  policy.train_micro_batch_size=1 \
+  $@ \
+  2>&1 | tee $RUN_LOG
+
+echo "Functional test passed: single_update.py completed 1 step successfully"
+
diff --git a/research/template_project/tests/test_suites/llm/common.env b/research/template_project/tests/test_suites/llm/common.env
new file mode 100644
index 0000000000..a392ca41a2
--- /dev/null
+++ b/research/template_project/tests/test_suites/llm/common.env
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Source this file before running tests to set up
+#
+# This is similar to nemo-rl's common.env except it doesn't enforce
+# the config path must be named the same thing as the script.
+#
+# source ./common.env
+set -eou pipefail
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+# Mark all repos as safe in the test context, since wandb fetches metadata about the repo and it's a
+# catch-22 to get the project root and mark it safe if you don't know the project root
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/../../..)
+
+exit_if_max_steps_reached() {
+    # Early stopping to save compute if max step has been reached
+    STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
+    if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then
+        echo "[INFO] Target step $MAX_STEPS reached, skipping run"
+        exit 0
+    fi
+    echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps"
+}
+
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+CKPT_DIR=$EXP_DIR/ckpts
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
+
+export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
+
+if [[ -n "${TEST_DRYRUN:-}" ]]; then
+    echo "[INFO] TEST_DRYRUN mode: used for testing"
+    exit
+fi
+
+mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR
+
diff --git a/research/template_project/tests/test_suites/llm/single_update_1n8g.sh b/research/template_project/tests/test_suites/llm/single_update_1n8g.sh
new file mode 100755
index 0000000000..22577a2d12
--- /dev/null
+++ b/research/template_project/tests/test_suites/llm/single_update_1n8g.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+NEMO_RL_ROOT=$(realpath $SCRIPT_DIR/../../../../..)
+
+# Source common.env from local test suite
+source $SCRIPT_DIR/common.env
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=1
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=30
+# ===== END CONFIG =====
+
+# This test does not convert tensorboard logs to metrics.json. We typically use this
+# to check if the test is already completed to not run any further. This is relevant
+# when MAX_STEPS > STEPS_PER_RUN since when launching multiple times, we want to
+# exit early if we've already completed the MAX_STEPS. See tests in nemo-rl for
+# concrete examples.
+exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT + +# Set the number of iterations via environment variable +NRL_FORCE_REBUILD_VENVS=true \ +SINGLE_UPDATE_ITERS=$MAX_STEPS \ +uv run single_update.py \ + --config configs/grpo_math_1B.yaml \ + cluster.gpus_per_node=8 \ + cluster.num_nodes=$NUM_NODES \ + $@ \ + 2>&1 | tee $RUN_LOG + +# We create a simple metrics.json to check if the script ran normally (usually based on tensorboard logs) +if grep -q "All done." "$RUN_LOG"; then + echo '{"succeed": "yes"}' > $JSON_METRICS +else + echo '{"succeed": "no"}' > $JSON_METRICS +fi + +# This is standard for nemo-rl tests to always run this script. We have some automation +# that checks the output of this script for success or failure. +uv run $NEMO_RL_ROOT/tests/check_metrics.py $JSON_METRICS \ + 'data["succeed"] == "yes"' diff --git a/research/template_project/tests/unit/__init__.py b/research/template_project/tests/unit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/research/template_project/tests/unit/test_data_utils.py b/research/template_project/tests/unit/test_data_utils.py new file mode 100644 index 0000000000..b4eed49cac --- /dev/null +++ b/research/template_project/tests/unit/test_data_utils.py @@ -0,0 +1,113 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Unit tests for template_project.data_utils.""" + +import pytest +import torch +from template_project.data_utils import create_batch_from +from transformers import AutoTokenizer + + +@pytest.fixture +def tokenizer(): + """Fixture to create a tokenizer with proper padding token.""" + tok = AutoTokenizer.from_pretrained("gpt2") + tok.pad_token = tok.eos_token + return tok + + +def test_create_batch_from_single_sentence(tokenizer): + """Test create_batch_from with a single sentence.""" + sentences = ["Hello world"] + + batch = create_batch_from(tokenizer, sentences) + + assert "input_ids" in batch + assert "input_lengths" in batch + assert "token_mask" in batch + assert "sample_mask" in batch + assert batch["input_ids"].shape[0] == 1 + assert batch["input_lengths"].shape[0] == 1 + assert batch["sample_mask"].shape[0] == 1 + + +def test_create_batch_from_multiple_sentences(tokenizer): + """Test create_batch_from with multiple sentences.""" + sentences = ["Hello world", "This is a test", "Another sentence here"] + + batch = create_batch_from(tokenizer, sentences) + + assert batch["input_ids"].shape[0] == 3 + assert batch["input_lengths"].shape[0] == 3 + assert batch["sample_mask"].shape[0] == 3 + assert batch["token_mask"].shape == batch["input_ids"].shape + + +def test_create_batch_from_padding(tokenizer): + """Test that create_batch_from correctly pads sequences.""" + sentences = ["short", "this is a much longer sentence"] + + batch = create_batch_from(tokenizer, sentences) + + # All sequences should have the same length (padded to max) + assert batch["input_ids"].shape[1] == batch["input_ids"].shape[1] + # Input lengths should reflect the actual (unpadded) lengths + assert batch["input_lengths"][0] < batch["input_lengths"][1] + + +def test_create_batch_from_dtypes(tokenizer): + """Test that create_batch_from produces correct data types.""" + sentences = ["Hello world"] + + batch = create_batch_from(tokenizer, sentences) + + assert batch["input_ids"].dtype == 
torch.long + assert batch["input_lengths"].dtype == torch.int32 + assert batch["sample_mask"].dtype == torch.float32 + assert batch["token_mask"].dtype == torch.long + + +def test_create_batch_from_sample_mask_all_ones(tokenizer): + """Test that sample_mask is all ones.""" + sentences = ["Hello", "World", "Test"] + + batch = create_batch_from(tokenizer, sentences) + + assert torch.all(batch["sample_mask"] == 1.0) + assert batch["sample_mask"].shape[0] == len(sentences) + + +def test_create_batch_from_token_mask_all_ones(tokenizer): + """Test that token_mask is all ones.""" + sentences = ["Hello world", "Test sentence"] + + batch = create_batch_from(tokenizer, sentences) + + assert torch.all(batch["token_mask"] == 1) + assert batch["token_mask"].shape == batch["input_ids"].shape + + +def test_create_batch_from_input_lengths(tokenizer): + """Test that input_lengths correctly represent non-padded token counts.""" + sentences = ["a b c", "x y"] + + batch = create_batch_from(tokenizer, sentences) + + # Compute expected lengths from attention mask + tok = AutoTokenizer.from_pretrained("gpt2") + tok.pad_token = tok.eos_token + enc = tok(sentences, add_special_tokens=False, return_tensors="pt", padding=True) + expected_lengths = enc["attention_mask"].sum(dim=1).to(torch.int32) + + assert torch.all(batch["input_lengths"] == expected_lengths) diff --git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh index 9de07d28bd..99fc24c52b 100644 --- a/tests/functional/L1_Functional_Tests_GPU.sh +++ b/tests/functional/L1_Functional_Tests_GPU.sh @@ -35,5 +35,13 @@ time uv run --no-sync bash ./tests/functional/vlm_grpo.sh time uv run --no-sync bash ./tests/functional/distillation.sh time uv run --no-sync bash ./tests/functional/distillation_megatron.sh +# Research functional tests (self-discovery) +for test_script in research/*/tests/functional/*.sh; do + project_dir=$(echo $test_script | cut -d/ -f1-2) + pushd $project_dir + time uv run 
--no-sync bash $(echo $test_script | cut -d/ -f3-) + popd +done + cd ${PROJECT_ROOT}/tests coverage combine .coverage* diff --git a/tests/unit/L0_Unit_Tests_Other.sh b/tests/unit/L0_Unit_Tests_Other.sh index 3908183305..f60c09df03 100644 --- a/tests/unit/L0_Unit_Tests_Other.sh +++ b/tests/unit/L0_Unit_Tests_Other.sh @@ -45,3 +45,11 @@ if [[ $exit_code -eq 5 ]]; then else uv run --extra vllm bash -x ./tests/run_unit.sh unit/ --ignore=unit/models/generation/ --ignore=unit/models/policy/ --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only fi + +# Research unit tests +for i in research/*/tests/unit; do + project_dir=$(dirname $(dirname $i)) + pushd $project_dir + uv run --no-sync pytest tests/unit + popd +done diff --git a/tools/copyright.sh b/tools/copyright.sh index 044243adbc..0ffe32ddd5 100755 --- a/tools/copyright.sh +++ b/tools/copyright.sh @@ -19,9 +19,13 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # Move to the project root cd $SCRIPT_DIR/.. find_files_with_missing_copyright() { -find ./nemo_rl/ ./docs/*.py ./examples/ ./tests/ ./tools/ ./3rdparty/*/*.py -type f -name '*.py' | while read path; do - echo -en $path"\t" - head -2 $path | grep -iv 'coding=' | head -1 +find ./nemo_rl/ ./docs/*.py ./examples/ ./tests/ ./tools/ ./3rdparty/*/*.py ./research/ -type f -name '*.py' | while read path; do + # Skip empty files - they don't need copyright headers + if [[ ! -s "$path" ]]; then + continue + fi + first_line=$(head -2 "$path" | grep -iv 'coding=' | head -1) + echo -e "$path\t$first_line" done \ | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' 
\ | grep -iv 'BSD 3-Clause License' \ diff --git a/uv.lock b/uv.lock index 7b06abd41f..03c163b5ec 100644 --- a/uv.lock +++ b/uv.lock @@ -23,6 +23,7 @@ members = [ "nemo-automodel", "nemo-rl", "penguin", + "template-project", ] overrides = [ { name = "opencv-python-headless", specifier = ">=4.11.0" }, @@ -6108,6 +6109,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b4/94/fd3853b98f39d10206b08f2737d2ec2dc6f46a42dc7b7e05f4f0162d13ee/tdigest-0.5.2.2-py3-none-any.whl", hash = "sha256:dd25f8d6e6be002192bba9e4b8c16491d36c10b389f50637818603d1f67c6fb2", size = 9440, upload-time = "2019-05-07T18:57:38.942Z" }, ] +[[package]] +name = "template-project" +version = "0.1.0" +source = { editable = "research/template_project" } +dependencies = [ + { name = "nemo-rl" }, +] + +[package.dev-dependencies] +test = [ + { name = "pytest" }, + { name = "pytest-cov" }, + { name = "pytest-timeout" }, +] + +[package.metadata] +requires-dist = [{ name = "nemo-rl", editable = "." }] + +[package.metadata.requires-dev] +test = [ + { name = "pytest", specifier = ">=7.0.0" }, + { name = "pytest-cov" }, + { name = "pytest-timeout" }, +] + [[package]] name = "tensorboard" version = "2.20.0"