From 488ee79e83d1a0f736406b1fd6231162b6c2d244 Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:40:39 -0700 Subject: [PATCH 1/5] =?UTF-8?q?Revert=20"[train]=20Add=20`worker=5Fprocess?= =?UTF-8?q?=5Fsetup=5Fhook`=20to=20set=20mp=20start=20method=20to=20`sp?= =?UTF-8?q?=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit dffae957cee22e1aa2d442a64055b5b738e4557c. --- skyrl/train/entrypoints/main_base.py | 9 ++-- skyrl/train/utils/utils.py | 8 +--- skyrl/utils/worker_setup.py | 19 -------- .../skyrl_train/utils/test_worker_setup.py | 46 ------------------- 4 files changed, 7 insertions(+), 75 deletions(-) delete mode 100644 skyrl/utils/worker_setup.py delete mode 100644 tests/backends/skyrl_train/utils/test_worker_setup.py diff --git a/skyrl/train/entrypoints/main_base.py b/skyrl/train/entrypoints/main_base.py index c5fd9dc92d..9ca5e58ede 100644 --- a/skyrl/train/entrypoints/main_base.py +++ b/skyrl/train/entrypoints/main_base.py @@ -3,6 +3,7 @@ """ import asyncio +import multiprocessing as mp import os import sys from pathlib import Path @@ -34,10 +35,12 @@ initialize_ray, ) from skyrl.utils.tok import get_tokenizer -from skyrl.utils.worker_setup import worker_setup_fn -# Run setup function to ensure driver process has consistent setup as Ray workers -worker_setup_fn() +# NOTE (sumanthrh): We use ray heavily and thus disable `fork` start method. +# forking within ray leads to undefined behaviour and often causes hard to debug +# memory leaks. See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html +# A common culprit is Pytorch dataloaders which use `fork` by default. +mp.set_start_method("spawn", force=True) config_dir = str(Path(__file__).parent.parent / "config") __all__ = ["BasePPOExp", "config_dir"] diff --git a/skyrl/train/utils/utils.py b/skyrl/train/utils/utils.py index 63e08e9b2a..b2dd2a2b13 100644 --- a/skyrl/train/utils/utils.py +++ b/skyrl/train/utils/utils.py @@ -747,13 +747,7 @@ def initialize_ray(cfg: SkyRLTrainConfig): # log_to_driver=True allows training progress from skyrl_entrypoint to reach stdout. # Infrastructure logs (vLLM, workers) are redirected to log file via os.dup2 in their init. - ray.init( - runtime_env={ - "env_vars": env_vars, - "worker_process_setup_hook": "skyrl.utils.worker_setup.worker_setup_fn", - }, - log_to_driver=True, - ) + ray.init(runtime_env={"env_vars": env_vars}, log_to_driver=True) if not verbose_logging: logger.info(f"Infrastructure logs will be written to: {log_file}") diff --git a/skyrl/utils/worker_setup.py b/skyrl/utils/worker_setup.py deleted file mode 100644 index 5a4cae5814..0000000000 --- a/skyrl/utils/worker_setup.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Worker process setup hook for Ray workers.""" - -import multiprocessing - - -def worker_setup_fn(): - """Set the multiprocessing start method to 'spawn' in Ray workers. - - This is passed to ray.init via runtime_env["worker_process_setup_hook"]. - We use ray and thus disable the `fork` start method. Forking within - ray leads to undefined behaviour and often causes hard to debug memory leaks. - See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html - A common culprit is PyTorch dataloaders which use `fork` by default. - """ - try: - multiprocessing.set_start_method("spawn", force=True) - except RuntimeError: - # Already set — nothing to do - pass diff --git a/tests/backends/skyrl_train/utils/test_worker_setup.py b/tests/backends/skyrl_train/utils/test_worker_setup.py deleted file mode 100644 index fbdd1b0b0e..0000000000 --- a/tests/backends/skyrl_train/utils/test_worker_setup.py +++ /dev/null @@ -1,46 +0,0 @@ -import multiprocessing - -import pytest -import ray - -from skyrl.utils.worker_setup import worker_setup_fn - - -@pytest.fixture(scope="module") -def ray_with_setup_hook(): - """Initialize Ray with the worker_process_setup_hook.""" - if ray.is_initialized(): - ray.shutdown() - ray.init( - runtime_env={ - "worker_process_setup_hook": "skyrl.utils.worker_setup.worker_setup_fn", - }, - ) - yield - ray.shutdown() - - -def test_worker_setup_fn_sets_spawn(): - """Test that worker_setup_fn sets the mp start method to spawn.""" - # Reset to default first - multiprocessing.set_start_method("fork", force=True) - worker_setup_fn() - assert multiprocessing.get_start_method() == "spawn" - - -def test_worker_setup_fn_idempotent(): - """Test that calling worker_setup_fn twice doesn't raise.""" - multiprocessing.set_start_method("spawn", force=True) - worker_setup_fn() # should not raise - assert multiprocessing.get_start_method() == "spawn" - - -@ray.remote -def _get_mp_start_method(): - return multiprocessing.get_start_method() - - -def test_worker_setup_hook_applied_in_ray_worker(ray_with_setup_hook): - """Test that Ray workers have mp start method set to spawn via the hook.""" - result = ray.get(_get_mp_start_method.remote()) - assert result == "spawn" From dcf3f7e8eacd5749f0e918465c457855532fd0bb Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Wed, 18 Mar 2026 23:45:41 +0000 Subject: [PATCH 2/5] x Signed-off-by: SumanthRH --- skyrl/train/utils/trainer_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skyrl/train/utils/trainer_utils.py b/skyrl/train/utils/trainer_utils.py index 2d4e6e24b1..cd9e88f923 100644 --- a/skyrl/train/utils/trainer_utils.py +++ b/skyrl/train/utils/trainer_utils.py @@ -690,6 +690,10 @@ def build_dataloader( num_workers=0 if cfg.generator.inference_engine.enable_http_endpoint else 8, drop_last=True if is_train else False, generator=seeded_generator, + # NOTE (sumanthrh): We use ray and thus use `spawn` start method. + # forking within ray leads to undefined behaviour and often causes hard to debug + # memory leaks. See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html + multiprocessing_context="spawn", ) if is_train: if not is_fully_async: From 9a7044c67fd180ba45513ba8765510ccd7ad3652 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Wed, 18 Mar 2026 23:55:25 +0000 Subject: [PATCH 3/5] x Signed-off-by: SumanthRH --- skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py index 43112dcc10..65ce0386de 100644 --- a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py +++ b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py @@ -174,6 +174,7 @@ def build_dataloader( num_workers=0 if cfg.generator.inference_engine.enable_http_endpoint else 8, drop_last=True if is_train else False, generator=seeded_generator, + multiprocessing_context="spawn", ) if is_train: if not is_fully_async: From cbff560b96e7b88defaf311a877cb1f549b2309b Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Wed, 18 Mar 2026 17:02:02 -0700 Subject: [PATCH 4/5] Update skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py index 65ce0386de..2135651f79 100644 --- a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py +++ b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py @@ -174,7 +174,7 @@ def build_dataloader( num_workers=0 if cfg.generator.inference_engine.enable_http_endpoint else 8, drop_last=True if is_train else False, generator=seeded_generator, - multiprocessing_context="spawn", + multiprocessing_context="spawn" if not cfg.generator.inference_engine.enable_http_endpoint else None, ) if is_train: if not is_fully_async: From ca36132dca8484265c82180f1a509cc74d0926a9 Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Wed, 18 Mar 2026 17:02:10 -0700 Subject: [PATCH 5/5] Update skyrl/train/utils/trainer_utils.py Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- skyrl/train/utils/trainer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyrl/train/utils/trainer_utils.py b/skyrl/train/utils/trainer_utils.py index cd9e88f923..b2c27e5a1c 100644 --- a/skyrl/train/utils/trainer_utils.py +++ b/skyrl/train/utils/trainer_utils.py @@ -693,7 +693,7 @@ def build_dataloader( # NOTE (sumanthrh): We use ray and thus use `spawn` start method. # forking within ray leads to undefined behaviour and often causes hard to debug # memory leaks. See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html - multiprocessing_context="spawn", + multiprocessing_context="spawn" if not cfg.generator.inference_engine.enable_http_endpoint else None, ) if is_train: if not is_fully_async: