NVIDIA-NeMo · parthchadha · Jul 3, 2025 · Jun 30, 2025 · Jun 30, 2025 · Jul 1, 2025
@@ -22,6 +22,7 @@ generation:
     pipeline_parallel_size: 1
     gpu_memory_utilization: 0.9
     max_model_len: 2048
+    enforce_eager: false  # true = eager mode (no CUDA graphs)
   colocated:
     # true: generation shares training GPUs
     # false: uses dedicated generation resources

@@ -99,6 +99,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: ${policy.max_total_sequence_length}
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
       # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit
       # For Gemma models, we need to use "auto" due to a vllm bug
       load_format: dummy

@@ -107,6 +107,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: ${policy.max_total_sequence_length}
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       # true: generation shares training GPUs
       # false: uses dedicated generation resources

@@ -58,6 +58,7 @@ policy:
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: ${policy.max_total_sequence_length}
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
 
 cluster:
   gpus_per_node: 8

@@ -89,6 +89,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 512
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 16384
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 4096
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 512
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 16384
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 16384
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -87,6 +87,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 4096
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 4096
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -90,6 +90,7 @@ policy:
       pipeline_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 512
+      enforce_eager: false  # true = eager mode (no CUDA graphs)
     colocated:
       enabled: true
       resources:

@@ -131,6 +131,9 @@ def configure_worker(
                 seed = node_idx * 1024 + bundle_id
 
             init_kwargs["seed"] = seed
+            # Give each DP group its own vLLM cache directory (keyed by its seed) to work around
+            # https://github.com/vllm-project/vllm/issues/18851
+            env_vars["VLLM_CACHE_ROOT"] = os.path.expanduser(f"~/.cache/vllm_{seed}")
 
         # Check if this worker is part of a parallel group (TP or TP+PP).
         # A worker is part of a parallel group if it's a secondary member (local_bundle_indices is None)
@@ -334,8 +337,7 @@ def _patch_vllm_init_workers_ray():
             enable_prefix_caching=torch.cuda.get_device_capability()[0] >= 8,
             dtype=self.cfg["vllm_cfg"]["precision"],
             seed=seed,
-            # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA-NeMo/RL/issues/186)
-            enforce_eager=True,
+            enforce_eager=self.cfg["vllm_cfg"]["enforce_eager"],  # True disables CUDA graphs; see https://github.com/NVIDIA-NeMo/RL/issues/186 for past convergence issues with them
             max_model_len=self.cfg["vllm_cfg"]["max_model_len"],
             trust_remote_code=True,
             worker_extension_cls="nemo_rl.models.generation.vllm_backend.VllmInternalWorkerExtension",

@@ -241,6 +241,7 @@ def initial_multi_step_calculator_batch(rollout_tokenizer):
         "disable_log_stats": True,
         "disable_log_requests": True,
         "gpu_memory_utilization": 0.6,
+        "enforce_eager": "False",
     },
     "colocated": {
         "enabled": True,

@@ -56,6 +56,7 @@
         "async_engine": False,  # Default to False for synchronous tests
         "skip_tokenizer_init": False,
         "load_format": "auto",
+        "enforce_eager": "False",
     },
     "colocated": {
         "enabled": True,

@@ -50,6 +50,7 @@
         "async_engine": True,
         "skip_tokenizer_init": False,
         "load_format": "auto",
+        "enforce_eager": "False",
     },
     "colocated": {
         "enabled": True,