diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 4190062ec6..7b91ed1b3b 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -27,9 +27,7 @@ ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": VLLM_EXECUTABLE, "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": VLLM_EXECUTABLE, - # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM. - # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved. - "nemo_rl.models.policy.workers.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE, + "nemo_rl.models.policy.workers.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.FSDP, "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": PY_EXECUTABLES.AUTOMODEL, "nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE, "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py index 3021b760e4..fa631ff8ee 100644 --- a/nemo_rl/distributed/virtual_cluster.py +++ b/nemo_rl/distributed/virtual_cluster.py @@ -49,6 +49,9 @@ class PY_EXECUTABLES: # Use NeMo-RL direct dependencies and vllm. VLLM = f"uv run --locked --extra vllm --directory {git_root}" + # Use NeMo-RL direct dependencies and fsdp. + FSDP = f"uv run --locked --extra fsdp --directory {git_root}" + # Use NeMo-RL direct dependencies and nemo-automodel. AUTOMODEL = f"uv run --locked --extra automodel --directory {git_root}" diff --git a/nemo_rl/models/generation/fp8.py b/nemo_rl/models/generation/vllm/quantization/fp8.py similarity index 90% rename from nemo_rl/models/generation/fp8.py rename to nemo_rl/models/generation/vllm/quantization/fp8.py index 9acf6a88ef..9505f42524 100644 --- a/nemo_rl/models/generation/fp8.py +++ b/nemo_rl/models/generation/vllm/quantization/fp8.py @@ -107,89 +107,6 @@ def patched_run_workers(self, *args, **kwargs): fp8_patches_applied = True -def get_vllm_qkv_scale_names(layer_idx: int) -> dict[str, str]: - """Get vLLM-compatible parameter names for Q/K/V FP8 scales. - - This function centralizes the naming convention for Q/K/V scale parameters - that vLLM expects. These names must match vLLM's internal parameter structure. - - Args: - layer_idx: The transformer layer index (0-based) - - Returns: - Dictionary mapping scale types to vLLM parameter names: - - 'q_scale': Q activation scale name - - 'k_scale': K activation scale name - - 'v_scale': V activation scale name - - Note: - The q_scale has an extra '.attn.' component compared to k_scale/v_scale. - This matches vLLM's parameter remapping logic in: - vllm.model_executor.model_loader.weight_utils.maybe_remap_kv_scale_name - - Example: - >>> get_vllm_qkv_scale_names(0) - { - 'q_scale': 'model.layers.0.self_attn.attn.q_scale', - 'k_scale': 'model.layers.0.self_attn.k_scale', - 'v_scale': 'model.layers.0.self_attn.v_scale' - } - """ - return { - "q_scale": f"model.layers.{layer_idx}.self_attn.attn.q_scale", - "k_scale": f"model.layers.{layer_idx}.self_attn.k_scale", - "v_scale": f"model.layers.{layer_idx}.self_attn.v_scale", - } - - -def convert_calibration_to_vllm_format( - calibration_results: dict[str, dict[str, float]], -) -> dict[str, float]: - """Convert NeMo-RL calibration results to vLLM parameter format. - - Currently only used by megatron policy worker. - After FP8 KV cache is supported by DTensor path, this function can be reused. - - This function transforms the calibration output format (with layer_N keys) - into the flat dictionary format that vLLM expects for parameter loading. - - Args: - calibration_results: Dict with keys like "layer_0", "layer_1", etc. - Each value is a dict with keys: "q_scale", "k_scale", "v_scale" - and corresponding float scale values. - - Returns: - Flat dictionary mapping vLLM parameter names to scale values. - Keys follow vLLM's naming convention as defined in get_vllm_qkv_scale_names. - - Example: - >>> calib = { - ... "layer_0": {"q_scale": 1.0, "k_scale": 2.0, "v_scale": 3.0}, - ... "layer_1": {"q_scale": 1.5, "k_scale": 2.5, "v_scale": 3.5} - ... } - >>> convert_calibration_to_vllm_format(calib) - { - 'model.layers.0.self_attn.attn.q_scale': 1.0, - 'model.layers.0.self_attn.k_scale': 2.0, - 'model.layers.0.self_attn.v_scale': 3.0, - 'model.layers.1.self_attn.attn.q_scale': 1.5, - 'model.layers.1.self_attn.k_scale': 2.5, - 'model.layers.1.self_attn.v_scale': 3.5 - } - """ - vllm_scales = {} - for layer_key, scales in calibration_results.items(): - # Extract layer index from "layer_N" format - layer_idx = int(layer_key.split("_")[1]) - param_names = get_vllm_qkv_scale_names(layer_idx) - - vllm_scales[param_names["q_scale"]] = scales["q_scale"] - vllm_scales[param_names["k_scale"]] = scales["k_scale"] - vllm_scales[param_names["v_scale"]] = scales["v_scale"] - - return vllm_scales - - def apply_fp8_patches(self, fp8_config): global global_fp8_config, fp8_patches_applied assert not fp8_patches_applied diff --git a/nemo_rl/models/generation/vllm/quantization/fp8_train_utils.py b/nemo_rl/models/generation/vllm/quantization/fp8_train_utils.py new file mode 100644 index 0000000000..ac4db666cf --- /dev/null +++ b/nemo_rl/models/generation/vllm/quantization/fp8_train_utils.py @@ -0,0 +1,96 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def get_vllm_qkv_scale_names(layer_idx: int) -> dict[str, str]: + """Get vLLM-compatible parameter names for Q/K/V FP8 scales. + + This function centralizes the naming convention for Q/K/V scale parameters + that vLLM expects. These names must match vLLM's internal parameter structure. + + Args: + layer_idx: The transformer layer index (0-based) + + Returns: + Dictionary mapping scale types to vLLM parameter names: + - 'q_scale': Q activation scale name + - 'k_scale': K activation scale name + - 'v_scale': V activation scale name + + Note: + The q_scale has an extra '.attn.' component compared to k_scale/v_scale. + This matches vLLM's parameter remapping logic in: + vllm.model_executor.model_loader.weight_utils.maybe_remap_kv_scale_name + + Example: + >>> get_vllm_qkv_scale_names(0) + { + 'q_scale': 'model.layers.0.self_attn.attn.q_scale', + 'k_scale': 'model.layers.0.self_attn.k_scale', + 'v_scale': 'model.layers.0.self_attn.v_scale' + } + """ + return { + "q_scale": f"model.layers.{layer_idx}.self_attn.attn.q_scale", + "k_scale": f"model.layers.{layer_idx}.self_attn.k_scale", + "v_scale": f"model.layers.{layer_idx}.self_attn.v_scale", + } + + +def convert_calibration_to_vllm_format( + calibration_results: dict[str, dict[str, float]], +) -> dict[str, float]: + """Convert NeMo-RL calibration results to vLLM parameter format. + + Currently only used by megatron policy worker. + After FP8 KV cache is supported by DTensor path, this function can be reused. + + This function transforms the calibration output format (with layer_N keys) + into the flat dictionary format that vLLM expects for parameter loading. + + Args: + calibration_results: Dict with keys like "layer_0", "layer_1", etc. + Each value is a dict with keys: "q_scale", "k_scale", "v_scale" + and corresponding float scale values. + + Returns: + Flat dictionary mapping vLLM parameter names to scale values. + Keys follow vLLM's naming convention as defined in get_vllm_qkv_scale_names. + + Example: + >>> calib = { + ... "layer_0": {"q_scale": 1.0, "k_scale": 2.0, "v_scale": 3.0}, + ... "layer_1": {"q_scale": 1.5, "k_scale": 2.5, "v_scale": 3.5} + ... } + >>> convert_calibration_to_vllm_format(calib) + { + 'model.layers.0.self_attn.attn.q_scale': 1.0, + 'model.layers.0.self_attn.k_scale': 2.0, + 'model.layers.0.self_attn.v_scale': 3.0, + 'model.layers.1.self_attn.attn.q_scale': 1.5, + 'model.layers.1.self_attn.k_scale': 2.5, + 'model.layers.1.self_attn.v_scale': 3.5 + } + """ + vllm_scales = {} + for layer_key, scales in calibration_results.items(): + # Extract layer index from "layer_N" format + layer_idx = int(layer_key.split("_")[1]) + param_names = get_vllm_qkv_scale_names(layer_idx) + + vllm_scales[param_names["q_scale"]] = scales["q_scale"] + vllm_scales[param_names["k_scale"]] = scales["k_scale"] + vllm_scales[param_names["v_scale"]] = scales["v_scale"] + + return vllm_scales diff --git a/nemo_rl/models/generation/vllm/vllm_backend.py b/nemo_rl/models/generation/vllm/vllm_backend.py index 1e947ed444..e342139d59 100644 --- a/nemo_rl/models/generation/vllm/vllm_backend.py +++ b/nemo_rl/models/generation/vllm/vllm_backend.py @@ -177,7 +177,7 @@ def update_weights_via_ipc_zmq(self) -> bool: "Offset is not equal to used bytes, usually indicate inaccurate info like keys or cached dtype in state_dict_info" ) # Load weights into the model - from nemo_rl.models.generation import fp8 + from nemo_rl.models.generation.vllm.quantization import fp8 if fp8.is_fp8_model(self.model_runner.vllm_config): # the fp8 load_weights additionally casts bf16 weights into fp8 @@ -230,7 +230,7 @@ def _load_model_weights(weights, model_runner): Returns: None """ - from nemo_rl.models.generation import fp8 + from nemo_rl.models.generation.vllm.quantization import fp8 if fp8.is_fp8_model(model_runner.vllm_config): # the fp8 load_weights additionally casts bf16 weights into fp8 diff --git a/nemo_rl/models/generation/vllm/vllm_worker.py b/nemo_rl/models/generation/vllm/vllm_worker.py index d2de455c22..9238533cd2 100644 --- a/nemo_rl/models/generation/vllm/vllm_worker.py +++ b/nemo_rl/models/generation/vllm/vllm_worker.py @@ -362,7 +362,7 @@ def _patch_vllm_vit_flash_attn_backend(): # Call init_fp8 when precision is fp8 # (kv_cache_dtype can be fp8/fp8_e4m3 or auto, validated in init_fp8) if self.cfg["vllm_cfg"]["precision"] == "fp8": - from nemo_rl.models.generation.fp8 import init_fp8 + from nemo_rl.models.generation.vllm.quantization.fp8 import init_fp8 fp8_kwargs = init_fp8( self.cfg["vllm_cfg"], self.model_name, model_parallel_size diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py index 343bd5d34e..1d175f35b2 100644 --- a/nemo_rl/models/policy/workers/megatron_policy_worker.py +++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py @@ -103,10 +103,6 @@ from_parallel_logits_to_logprobs_packed_sequences, ) from nemo_rl.distributed.named_sharding import NamedSharding -from nemo_rl.models.generation.fp8 import ( - convert_calibration_to_vllm_format, - get_vllm_qkv_scale_names, -) from nemo_rl.models.generation.interfaces import ( GenerationDatumSpec, GenerationOutputSpec, @@ -2139,6 +2135,10 @@ def _iter_params_with_optional_kv_scales( This helper is used by both IPC-based streaming and collective broadcast so that the logic for adding KV scales stays consistent in one place. """ + from nemo_rl.models.generation.vllm.quantization.fp8_train_utils import ( + get_vllm_qkv_scale_names, + ) + base_iter = self.megatron_bridge.export_hf_weights( [self.model], show_progress=False, @@ -2544,6 +2544,9 @@ def calibrate_qkv_fp8_scales( { "format": "fp8", "percentile": float, "margin": float, "layers": { layer_name: {"k_scale": float, "v_scale": float[, "q_scale": float] } } } """ + from nemo_rl.models.generation.vllm.quantization.fp8_train_utils import ( + convert_calibration_to_vllm_format, + ) # Allow overriding FP8 max for Q, K, V via environment variables for ease of testing. # Defaults align with FP8 e4m3 max magnitude. diff --git a/pyproject.toml b/pyproject.toml index 19916dbf6c..673cf2e04f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,19 +52,26 @@ dependencies = [ ] [project.optional-dependencies] -# Currently unused, but after https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved, we should use this for the "BASE" PYEXECUTABLE +fsdp = [ + "flash-attn==2.8.1", + "mamba-ssm", + "causal-conv1d", + # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved + "vllm==0.11.2", +] automodel = [ "nemo-automodel", # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 - "vllm==0.11.2", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved "flash-attn==2.8.1", "mamba-ssm", "causal-conv1d", "nv-grouped-gemm", "transformer-engine[pytorch]==2.8.0", "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", + # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved + "vllm==0.11.2", ] vllm = [ "cuda-python", @@ -75,12 +82,6 @@ vllm = [ "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", "vllm==0.11.2", "num2words>=0.5.14", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "flash-attn==2.8.1", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "mamba-ssm", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "causal-conv1d", ] mcore = [ # also need cudnn (https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=deb_network) @@ -95,12 +96,12 @@ mcore = [ "transformer-engine[pytorch]==2.8.0", "megatron-core", "megatron-bridge", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "vllm==0.11.2", # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 "flash-attn==2.8.1", + # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved + "vllm==0.11.2", ] nemo_gym = ["nemo_gym"] diff --git a/pyrefly.toml b/pyrefly.toml index 74f0f29ed9..e1127eadd2 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -101,6 +101,7 @@ project-includes = [ "nemo_rl/models/generation/interfaces.py", "nemo_rl/models/generation/vllm/__init__.py", "nemo_rl/models/generation/vllm/config.py", + "nemo_rl/models/generation/vllm/quantization/fp8_train_utils.py", "nemo_rl/models/generation/vllm/utils.py", "nemo_rl/models/generation/vllm/vllm_backend.py", "nemo_rl/models/huggingface/__init__.py", diff --git a/uv.lock b/uv.lock index f98bc2e21f..a66eed9d2d 100644 --- a/uv.lock +++ b/uv.lock @@ -3795,6 +3795,12 @@ automodel = [ { name = "transformer-engine", extra = ["pytorch"] }, { name = "vllm" }, ] +fsdp = [ + { name = "causal-conv1d" }, + { name = "flash-attn" }, + { name = "mamba-ssm" }, + { name = "vllm" }, +] mcore = [ { name = "flash-attn" }, { name = "megatron-bridge" }, @@ -3806,12 +3812,9 @@ nemo-gym = [ { name = "nemo-gym" }, ] vllm = [ - { name = "causal-conv1d" }, { name = "cuda-python" }, { name = "deep-ep" }, { name = "deep-gemm" }, - { name = "flash-attn" }, - { name = "mamba-ssm" }, { name = "num2words" }, { name = "vllm" }, ] @@ -3860,7 +3863,7 @@ requires-dist = [ { name = "accelerate", specifier = ">=0.26" }, { name = "blobfile" }, { name = "causal-conv1d", marker = "extra == 'automodel'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, - { name = "causal-conv1d", marker = "extra == 'vllm'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, + { name = "causal-conv1d", marker = "extra == 'fsdp'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, { name = "colored", specifier = "==2.2.3" }, { name = "cuda-python", marker = "extra == 'vllm'" }, { name = "datasets", specifier = ">=4.0.0" }, @@ -3869,11 +3872,11 @@ requires-dist = [ { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, { name = "deep-gemm", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" }, { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.8.1" }, + { name = "flash-attn", marker = "extra == 'fsdp'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.8.1" }, - { name = "flash-attn", marker = "extra == 'vllm'", specifier = "==2.8.1" }, { name = "hydra-core" }, { name = "mamba-ssm", marker = "extra == 'automodel'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, - { name = "mamba-ssm", marker = "extra == 'vllm'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, + { name = "mamba-ssm", marker = "extra == 'fsdp'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "math-verify" }, { name = "matplotlib" }, { name = "megatron-bridge", marker = "extra == 'mcore'", editable = "3rdparty/Megatron-Bridge-workspace" }, @@ -3911,11 +3914,12 @@ requires-dist = [ { name = "transformers", specifier = "==4.57.1" }, { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')", index = "https://download.pytorch.org/whl/cu129" }, { name = "vllm", marker = "extra == 'automodel'", specifier = "==0.11.2" }, + { name = "vllm", marker = "extra == 'fsdp'", specifier = "==0.11.2" }, { name = "vllm", marker = "extra == 'mcore'", specifier = "==0.11.2" }, { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.11.2" }, { name = "wandb" }, ] -provides-extras = ["automodel", "vllm", "mcore", "nemo-gym"] +provides-extras = ["fsdp", "automodel", "vllm", "mcore", "nemo-gym"] [package.metadata.requires-dev] build = [