NVIDIA-NeMo · parthchadha · Aug 14, 2025 · Aug 12, 2025 · Aug 12, 2025 · Aug 12, 2025
@@ -62,7 +62,7 @@ A key design principle for generation backends is that they process tokens direc
 
 ## VLLM Backend
 
-The VLLM backend (`models/generation/vllm.py`) implements the {py:class}`GenerationInterface <nemo_rl.models.generation.interfaces.GenerationInterface>` to provide efficient text generation using the VLLM library, which is optimized for large language models.
+The VLLM backend (`models/generation/vllm/vllm_generation.py`) implements the {py:class}`GenerationInterface <nemo_rl.models.generation.interfaces.GenerationInterface>` to provide efficient text generation using the VLLM library, which is optimized for large language models.
 
 ### VllmGeneration Class
 

@@ -107,7 +107,7 @@ This Policy object holds a [RayWorkerGroup](../../nemo_rl/distributed/worker_gro
 
 ## Fast Generation
 
-We support vLLM through the [VllmGeneration](../../nemo_rl/models/generation/vllm.py) class right now.
+We currently support vLLM through the [VllmGeneration](../../nemo_rl/models/generation/vllm/vllm_generation.py) class.
 
 The function [grpo_train](../../nemo_rl/algorithms/grpo.py) contains the core GRPO training loop.
 

@@ -15,7 +15,8 @@
 from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES
 
 ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = {
-    "nemo_rl.models.generation.vllm.VllmGenerationWorker": PY_EXECUTABLES.VLLM,
+    "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": PY_EXECUTABLES.VLLM,
+    "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": PY_EXECUTABLES.VLLM,
     # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM.
     # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved.
     "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM,

@@ -0,0 +1,24 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from nemo_rl.models.generation.vllm.config import VllmConfig
+from nemo_rl.models.generation.vllm.vllm_generation import VllmGeneration
+from nemo_rl.models.generation.vllm.vllm_worker import VllmGenerationWorker
+from nemo_rl.models.generation.vllm.vllm_worker_async import VllmAsyncGenerationWorker
+
+__all__ = [  # Names re-exported from the config / generation / worker submodules imported above.
+    "VllmConfig",
+    "VllmGeneration",
+    "VllmGenerationWorker",
+    "VllmAsyncGenerationWorker",
+]
@@ -0,0 +1,35 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, NotRequired, TypedDict
+
+from nemo_rl.models.generation.interfaces import GenerationConfig
+
+
+class VllmSpecificArgs(TypedDict):  # Typed schema of the `vllm_cfg` entry declared on VllmConfig.
+    tensor_parallel_size: int  # required key
+    pipeline_parallel_size: int  # required key
+    gpu_memory_utilization: float  # required key; presumably a fraction of GPU memory - TODO confirm range
+    max_model_len: int  # required key; presumably counted in tokens - TODO confirm
+    # Additional vLLM arguments that NeMo RL inserts depending on the context in which vLLM is used.
+    skip_tokenizer_init: bool  # required key
+    async_engine: bool  # required key; presumably selects the async worker - verify against caller
+    load_format: NotRequired[str]  # optional key (may be absent from the dict)
+    precision: NotRequired[str]  # optional key (may be absent from the dict)
+    enforce_eager: NotRequired[bool]  # optional key (may be absent from the dict)
+
+class VllmConfig(GenerationConfig):  # GenerationConfig extended with vLLM-specific entries.
+    vllm_cfg: VllmSpecificArgs  # required key; see VllmSpecificArgs for its fields
+    vllm_kwargs: NotRequired[dict[str, Any]]  # optional key; free-form mapping, presumably forwarded to vLLM - TODO confirm