diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index eab0f1db21..85e193dcae 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -22,6 +22,7 @@ generation: pipeline_parallel_size: 1 gpu_memory_utilization: 0.9 max_model_len: 2048 + enforce_eager: False colocated: # true: generation shares training GPUs # false: uses dedicated generation resources diff --git a/examples/configs/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/grpo-deepscaler-1.5b-8K.yaml index 1013f3d4c2..ce5ed73c17 100644 --- a/examples/configs/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/grpo-deepscaler-1.5b-8K.yaml @@ -99,6 +99,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit # For Gemma models, we need to use "auto" due to a vllm bug load_format: dummy diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 1842b01497..fd944fa9e7 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -107,6 +107,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False colocated: # true: generation shares training GPUs # false: uses dedicated generation resources diff --git a/examples/configs/grpo_math_8B.yaml b/examples/configs/grpo_math_8B.yaml index 429a1d7663..a857b08858 100644 --- a/examples/configs/grpo_math_8B.yaml +++ b/examples/configs/grpo_math_8B.yaml @@ -58,6 +58,7 @@ policy: tensor_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False cluster: gpus_per_node: 8 diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 
1248c28622..6bbcd95edd 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -89,6 +89,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 2458739e2e..af4bb6945d 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml index 8f6327e1e9..b854eb7d38 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml index cd05c86dbb..9f92be089b 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git 
a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml index c5ebb4f8eb..2a1a151ea5 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml index 6d7a858749..06ae6b4637 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml index bd22cd760e..fe2de660ce 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml @@ -87,6 +87,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml index d6176ddd22..00a40de4d0 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml @@ -90,6 +90,7 @@ 
policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml index d1303bb444..d3bbc266f2 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 9506a063d3..64e97c3314 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -131,6 +131,9 @@ def configure_worker( seed = node_idx * 1024 + bundle_id init_kwargs["seed"] = seed + # Need to give each DP group its own vllm cache to address: + # https://github.com/vllm-project/vllm/issues/18851 + env_vars["VLLM_CACHE_ROOT"] = os.path.expanduser(f"~/.cache/vllm_{seed}") # Check if this worker is part of a parallel group (TP or TP+PP). 
# A worker is part of a parallel group if it's a secondary member (local_bundle_indices is None) @@ -334,8 +337,7 @@ def _patch_vllm_init_workers_ray(): enable_prefix_caching=torch.cuda.get_device_capability()[0] >= 8, dtype=self.cfg["vllm_cfg"]["precision"], seed=seed, - # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA-NeMo/RL/issues/186) - enforce_eager=True, + enforce_eager=self.cfg["vllm_cfg"]["enforce_eager"], max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, worker_extension_cls="nemo_rl.models.generation.vllm_backend.VllmInternalWorkerExtension", diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index 08d1c0ffd6..db41fe2d39 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -241,6 +241,7 @@ def initial_multi_step_calculator_batch(rollout_tokenizer): "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": False, }, "colocated": { "enabled": True, diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 1404b02337..8371fababb 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -56,6 +56,7 @@ "async_engine": False, # Default to False for synchronous tests "skip_tokenizer_init": False, "load_format": "auto", + "enforce_eager": False, }, "colocated": { "enabled": True, diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py index 9735b5f03d..d24a0c0f31 100644 --- a/tests/unit/models/generation/test_vllm_large_model.py +++ b/tests/unit/models/generation/test_vllm_large_model.py @@ -50,6 +50,7 @@ "async_engine": True, "skip_tokenizer_init": False, "load_format": "auto", + "enforce_eager": False, }, "colocated": { "enabled": True,