diff --git a/3rdparty/Gym-workspace/Gym b/3rdparty/Gym-workspace/Gym
index 39ee39e35b..c192ee407f 160000
--- a/3rdparty/Gym-workspace/Gym
+++ b/3rdparty/Gym-workspace/Gym
@@ -1 +1 @@
-Subproject commit 39ee39e35bfedef86ccf8d1ada9835ab8f62d267
+Subproject commit c192ee407ff71046015d11da7c8960082bd62418
diff --git a/3rdparty/Gym-workspace/setup.py b/3rdparty/Gym-workspace/setup.py
index ddb5c62284..b6df0d66c0 100644
--- a/3rdparty/Gym-workspace/setup.py
+++ b/3rdparty/Gym-workspace/setup.py
@@ -42,6 +42,7 @@
     "yappi",
     "ray[default]",
     "psutil",
+    "datasets",
 ]
 
 if src_dir.exists():
diff --git a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
new file mode 100644
index 0000000000..dea76e41cf
--- /dev/null
+++ b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
@@ -0,0 +1,277 @@
+grpo:
+  max_num_epochs: 1
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 16
+  max_rollout_turns: 1 # for multi-turn rollouts. Workplace assistant has 1 turn but can have up to 6 tool-calling steps
+  max_num_steps: 1000000
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 10
+  val_at_start: true
+  overlong_filtering: false
+  max_val_samples: null # inferred from size of val dataset. for multi evals, repeat val ds via `num_repeats` in `ng_prepare_data`.
+  val_batch_size: null
+  seed: 42
+  use_dynamic_sampling: false
+  dynamic_sampling_max_gen_batches: 10
+  batch_multiplier: 1
+  reward_shaping:
+    enabled: false
+    overlong_buffer_length: 128
+    overlong_buffer_penalty: 1
+    max_response_length: ${policy.max_total_sequence_length}
+  reward_scaling:
+    enabled: false
+    source_min: 0.0
+    source_max: 1.0
+    target_min: 0.0
+    target_max: 1.0
+  skip_reference_policy_logprobs_calculation: true
+
+loss_fn:
+  reference_policy_kl_penalty: 0
+  reference_policy_kl_type: "k3"
+  kl_input_clamp_value: 20.0
+  kl_output_clamp_value: 10.0
+  ratio_clip_min: 0.2
+  ratio_clip_max: 0.2
+  ratio_clip_c: null
+  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
+  use_on_policy_kl_approximation: false
+  truncated_importance_sampling_ratio: null
+  use_importance_sampling_correction: false
+  token_level_loss: true
+
+checkpointing:
+  enabled: true
+  metric_name: "val:accuracy"
+  higher_is_better: true
+  keep_top_k: 3
+  save_period: 6 # Save a checkpoint every 6 steps (aligned with val_period)
+  checkpoint_must_save_by: null
+
+policy:
+  model_name: "nvidia/NVIDIA-Nemotron-Nano-9B-v2"
+  tokenizer:
+    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
+  hf_config_overrides: {}
+  train_global_batch_size: ${mul:${grpo.num_prompts_per_step}, ${grpo.num_generations_per_prompt}} # Match the total rollouts per step
+  train_micro_batch_size: 1
+  logprob_batch_size: 1
+  generation_batch_size: 32 # Only used when generating with the HF backend
+  max_total_sequence_length: 8192
+  precision: "bfloat16"
+  logprob_chunk_size: null # Disabled to allow defer_fp32_logits: false
+
+  dtensor_cfg:
+    _v2: false
+    enabled: false
+    cpu_offload: False
+    sequence_parallel: false
+    activation_checkpointing: true
+    tensor_parallel_size: 2
+    context_parallel_size: 1
+    custom_parallel_plan: null
+    clear_cache_every_n_steps: null
+
+  megatron_cfg:
+    enabled: true
+    bias_activation_fusion: false
+    tensor_model_parallel_size: 2
+    # We might want to consider setting this value higher (e.g., to 1) and raising the vLLM generation max memory utilization
+    empty_unused_memory_level: 0
+    activation_checkpointing: true
+    # train_iters needs to be large enough to cover all training steps
+    train_iters: 100000
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    context_parallel_size: 1
+    pipeline_dtype: ${policy.precision}
+    sequence_parallel: false
+    freeze_moe_router: true
+    moe_router_dtype: "fp64"
+    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
+    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
+    # gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: True
+    defer_fp32_logits: false
+    moe_permute_fusion: false
+
+    optimizer:
+      optimizer: "adam"
+      lr: 5.0e-6
+      min_lr: 5.0e-7
+      weight_decay: 0.01
+      bf16: true
+      fp16: false
+      params_dtype: "float32"
+
+      # adam
+      adam_beta1: 0.9
+      adam_beta2: 0.999
+      adam_eps: 1e-8
+
+      # sgd
+      sgd_momentum: 0.9
+
+      # distributed optimizer
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
+      clip_grad: ${policy.max_grad_norm}
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "constant"
+      lr_decay_iters: 100000 # Must be greater than lr_warmup_iters
+      lr_warmup_iters: 13
+      lr_warmup_init: 5.0e-7
+      override_opt_param_scheduler: true
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      use_custom_fsdp: false
+      data_parallel_sharding_strategy: "optim_grads_params"
+
+    env_vars: null
+
+  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
+  # for more details on dynamic batching and sequence packing.
+  dynamic_batching:
+    enabled: False
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    sequence_length_round: 64
+
+  sequence_packing:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+
+  # makes the training sequence length divisible by the tensor parallel size
+  # this is useful for sequence parallel training
+  make_sequence_length_divisible_by: 1
+  max_grad_norm: 1.0
+  offload_optimizer_for_logprob: false # Only useful for non-colocated generation since colocated generation will always offload optimizer to cuda before refit
+
+  optimizer: null
+
+  scheduler:
+    - name: "torch.optim.lr_scheduler.ConstantLR"
+      kwargs:
+        factor: 1.0
+        total_iters: 10000000000
+    - milestones: []
+
+  generation:
+    backend: "vllm"
+    max_new_tokens: ${policy.max_total_sequence_length}
+    temperature: 1.0
+    top_p: 1.0
+    top_k: null
+    stop_token_ids: null
+    stop_strings: null
+    vllm_cfg:
+      async_engine: true
+      precision: ${policy.precision}
+      tensor_parallel_size: 1
+      pipeline_parallel_size: 1
+      enable_expert_parallel: false
+      expert_parallel_size: 1
+      gpu_memory_utilization: 0.8
+      max_model_len: ${policy.max_total_sequence_length}
+      enforce_eager: false
+      use_deep_gemm: False
+      num_last_layers_in_bf16: 0
+      num_first_layers_in_bf16: 0
+      kv_cache_dtype: "auto"
+      expose_http_server: true
+      skip_tokenizer_init: false
+      tool_parser_plugin: ???
+      http_server_serving_chat_kwargs:
+        # Workplace assistant uses 26 tools, so we enable auto_tools.
+        # For Nemotron Nano v2, we use the dedicated `nemotron_json` tool parser.
+        enable_auto_tools: true
+        tool_parser: nemotron_json
+    vllm_kwargs:
+      compilation_config:
+        # when enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy,
+        # with the flag, vLLM will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
+        # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
+        use_inductor: False
+      # We need the Mamba cache to be set to fp32 for Nemotron Nano v2
+      mamba_ssm_cache_dtype: "float32"
+    colocated:
+      # true: generation shares training GPUs
+      # false: uses dedicated generation resources
+      enabled: true
+      # only relevant when enabled is false
+      resources:
+        gpus_per_node: null # Decides the number of GPUs dedicated to generation when there is one node in the cluster, i.e., cluster.num_nodes == 1
+        num_nodes: null # Decides the number of nodes dedicated to generation
+
+data:
+  # Using the prepared train and validation datasets (downloaded from HuggingFace and split 90/10)
+  # Train: 1129 samples, Validation: 126 samples
+  train_jsonl_fpath: 3rdparty/Gym-workspace/Gym/resources_servers/workplace_assistant/data/train.jsonl
+  validation_jsonl_fpath: 3rdparty/Gym-workspace/Gym/resources_servers/workplace_assistant/data/validation.jsonl
+  agent_name: workplace_assistant_simple_agent
+  shuffle: true
+  num_workers: 0
+
+env:
+  should_use_nemo_gym: true
+  should_log_nemo_gym_responses: true # If you have low logging storage, set this to false
+  nemo_gym: # This is passed into NeMo-Gym as the initial_global_config_dict
+    config_paths:
+      - responses_api_models/vllm_model/configs/vllm_model_for_training.yaml # Required! And it must be *for_training
+      - resources_servers/workplace_assistant/configs/workplace_assistant.yaml
+    workplace_assistant_simple_agent:
+      responses_api_agents:
+        simple_agent:
+          max_steps: 6 # Workplace assistant allows up to 6 tool-calling steps per task
+    policy_model:
+      responses_api_models:
+        vllm_model:
+          # Disable reasoning!
+          uses_reasoning_parser: false
+          extra_body:
+            chat_template_kwargs:
+              enable_thinking: false
+
+logger:
+  log_dir: "logs/grpo-workplace-assistant-nemotron-nano-v2-9b" # Base directory for all logs
+  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
+  wandb_enabled: true
+  tensorboard_enabled: false
+  mlflow_enabled: false # Disable MLflow logging
+  swanlab_enabled: false
+  monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb:
+    project: "grpo-workplace-assistant"
+    name: "nemotron-nano-v2-9b-workplace-assistant"
+  tensorboard: {}
+  mlflow:
+    experiment_name: "grpo-workplace-assistant"
+    run_name: "nemotron-nano-v2-9b-workplace-assistant"
+  gpu_monitoring:
+    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 1 # Single node by default; set to 2+ for multi-node training
diff --git a/examples/nemo_gym/launch_nemo_gym_multinode_training.sh b/examples/nemo_gym/launch_nemo_gym_multinode_training.sh
new file mode 100755
index 0000000000..37ede71772
--- /dev/null
+++ b/examples/nemo_gym/launch_nemo_gym_multinode_training.sh
@@ -0,0 +1,49 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ----- PARAMETERS -----
+# WANDB_API_KEY, EXP_NAME, NUM_ACTOR_NODES, REPO_LOCATION, CONTAINER_IMAGE_PATH, SLURM_ACCOUNT, SLURM_PARTITION
+
+# ray.sub needs to be launched from the NeMo-RL root directory
+cd $REPO_LOCATION
+
+# Construct the command
+read -r -d '' COMMAND < len(
+        template_prefix_token_ids
+    ), f"""Found possibly non-monotonically increasing trajectory!
+Template prefix token IDs (everything before the final assistant message): {template_prefix_token_ids}
+
+Template token IDs (everything that was sent to the model endpoint): {template_token_ids}
+
+Template prefix repr (detokenized): {repr(tokenizer.decode(template_prefix_token_ids))}
+
+Template repr (detokenized): {repr(tokenizer.decode(template_token_ids))}
+"""
+
     # We take everything starting with the EOS token ID.
     template_cut_start = -1
     for pos in reversed(range(len(template_prefix_token_ids))):
@@ -114,9 +127,16 @@ def _replace_prefix_tokens(
             break
 
     # This should never be the case, but
-    assert template_cut_start >= 0, (
-        "No EOS token ID found in the chat-templated messages!"
-    )
+    assert (
+        template_cut_start >= 0
+    ), f"""No EOS token ID found in the chat-templated messages!
+Template prefix token IDs (everything before the final assistant message): {template_prefix_token_ids}
+
+Template token IDs (everything that was sent to the model endpoint): {template_token_ids}
+
+Template prefix repr (detokenized): {repr(tokenizer.decode(template_prefix_token_ids))}
+
+Template repr (detokenized): {repr(tokenizer.decode(template_token_ids))}"""
 
     return (
         model_prefix_token_ids[:model_cut_end]
         + template_token_ids[template_cut_start:]
@@ -269,6 +289,7 @@ def _setup_vllm_openai_api_server(self, app: FastAPI) -> FastAPI:
         from fastapi import Request
         from fastapi.responses import JSONResponse, StreamingResponse
+        from vllm import __version__ as vllm_version
         from vllm.entrypoints.openai.api_server import (
             BaseModelPath,
             OpenAIServingChat,
@@ -283,8 +304,13 @@ def _setup_vllm_openai_api_server(self, app: FastAPI) -> FastAPI:
             TokenizeCompletionRequest,
             TokenizeResponse,
         )
+        from vllm.entrypoints.openai.tool_parsers import ToolParserManager
         from vllm.v1.engine.async_llm import logger as vllm_async_llm_logger
 
+        maybe_tool_parser_plugin = self.cfg["vllm_cfg"].get("tool_parser_plugin")
+        if maybe_tool_parser_plugin:
+            ToolParserManager.import_tool_parser(maybe_tool_parser_plugin)
+
         engine_client = self.llm
         model_config = self.llm_async_engine_args.create_model_config()
         base_model_paths = [
@@ -294,12 +320,15 @@ def _setup_vllm_openai_api_server(self, app: FastAPI) -> FastAPI:
             BaseModelPath(name=model_config.model, model_path=model_config.model),
         ]
 
-        openai_serving_models = OpenAIServingModels(
+        openai_serving_models_kwargs = dict(
             engine_client=engine_client,
-            model_config=model_config,
             base_model_paths=base_model_paths,
             lora_modules=None,
         )
+        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main, bumping to vLLM 0.11.2
+        if vllm_version < "0.11.1":
+            openai_serving_models_kwargs["model_config"] = model_config
+        openai_serving_models = OpenAIServingModels(**openai_serving_models_kwargs)
 
         class NeMoRLOpenAIChatRequestMixin:
             def model_post_init(self, context):
@@ -435,13 +464,17 @@ class NeMoRLOpenAIServingChat(NeMoRLOpenAIServingMixin, OpenAIServingChat):
         serving_chat_kwargs = serving_chat_default_kwargs | self.cfg["vllm_cfg"].get(
             "http_server_serving_chat_kwargs", dict()
         )
-        openai_serving_chat = NeMoRLOpenAIServingChat(
-            engine_client,
-            model_config,
-            openai_serving_models,
-            return_tokens_as_token_ids=True,
-            **serving_chat_kwargs,
+        serving_chat_kwargs.update(
+            dict(
+                engine_client=engine_client,
+                models=openai_serving_models,
+                return_tokens_as_token_ids=True,
+            )
         )
+        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main, bumping to vLLM 0.11.2
+        if vllm_version < "0.11.1":
+            serving_chat_kwargs["model_config"] = model_config
+        openai_serving_chat = NeMoRLOpenAIServingChat(**serving_chat_kwargs)
 
         generation_config = self.cfg
@@ -496,15 +529,20 @@ class NeMoRLOpenAIServingTokenization(
         ):
             pass
 
-        openai_serving_tokenization = NeMoRLOpenAIServingTokenization(
-            engine_client,
-            model_config,
-            openai_serving_models,
+        serving_tokenization_kwargs = dict(
             request_logger=serving_chat_kwargs["request_logger"],
             chat_template=serving_chat_kwargs["chat_template"],
             chat_template_content_format=serving_chat_kwargs[
                 "chat_template_content_format"
             ],
+            engine_client=serving_chat_kwargs["engine_client"],
+            models=serving_chat_kwargs["models"],
+        )
+        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main, bumping to vLLM 0.11.2
+        if vllm_version < "0.11.1":
+            serving_tokenization_kwargs["model_config"] = model_config
+        openai_serving_tokenization = NeMoRLOpenAIServingTokenization(
+            **serving_tokenization_kwargs
         )
 
         @app.post("/tokenize")
@@ -524,15 +562,21 @@ async def tokenize(request: NeMoRLTokenizeRequest, raw_request: Request):
         # Logging
         ########################################
         print(
-            "Adding a vLLM logging filter so that the logs aren't spammed with `Added request ...` messages. This is to help errors pop up better and filter out noise."
+            "Adding a vLLM logging filter so that the logs aren't spammed with unhelpful messages like `Added request ...`. This is to help errors pop up better and filter out noise."
        )
 
-        class NoAddedRequestFilter(LoggingFilter):
+        class CleanLoggingFilter(LoggingFilter):
             def filter(self, record: LogRecord) -> bool:
                 msg = record.getMessage()
-                return "Added request" not in msg
-        vllm_async_llm_logger.addFilter(NoAddedRequestFilter())
+                # vLLM does not accept `strict` tool definitions, and reporting that to the user is not useful either.
+                return (
+                    "Added request" not in msg
+                    and "The following fields were present in the request but ignored: {'strict'}"
+                    not in msg
+                )
+
+        vllm_async_llm_logger.addFilter(CleanLoggingFilter())
 
         return app
diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py
index ac03e0506f..e39fef12d4 100644
--- a/tests/unit/models/generation/test_vllm_generation.py
+++ b/tests/unit/models/generation/test_vllm_generation.py
@@ -1164,6 +1164,7 @@ def test_vllm_http_server(cluster, tokenizer):
                 "function_call": None,
                 "tool_calls": [],
                 "reasoning_content": None,
+                "reasoning": None,
             },
             "logprobs": {
                 "content": [
@@ -1200,6 +1201,11 @@ def _standardize(d: dict) -> dict:
         # We don't want to implicate log prob accuracy in this test.
d["choices"][0]["logprobs"]["content"][0].pop("logprob") + # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main bumping to vLLM 0.11.2 + message = d["choices"][0]["message"] + if "reasoning" in message: + message.pop("reasoning") + return d assert _standardize(expected_result) == _standardize(actual_result) @@ -1369,6 +1375,9 @@ def test_replace_prefix_tokens_missing_eos_in_template_prefix_raises(): class _T: eos_token_id = 2 + def decode(self, *args, **kwargs): + pass + tokenizer = _T() model_prefix_token_ids = [7, 2] template_prefix_token_ids = [9, 9, 9] # no EOS inside prefix diff --git a/uv.lock b/uv.lock index 7d12c3b39d..d333c863c4 100644 --- a/uv.lock +++ b/uv.lock @@ -3465,6 +3465,7 @@ name = "nemo-gym" source = { editable = "3rdparty/Gym-workspace" } dependencies = [ { name = "aiohttp" }, + { name = "datasets" }, { name = "devtools" }, { name = "fastapi" }, { name = "gradio" }, @@ -3486,6 +3487,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "aiohttp" }, + { name = "datasets" }, { name = "devtools" }, { name = "fastapi" }, { name = "gradio" },