From 488ee79e83d1a0f736406b1fd6231162b6c2d244 Mon Sep 17 00:00:00 2001
From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com>
Date: Wed, 18 Mar 2026 16:40:39 -0700
Subject: [PATCH 1/5] =?UTF-8?q?Revert=20"[train]=20Add=20`worker=5Fprocess?=
 =?UTF-8?q?=5Fsetup=5Fhook`=20to=20set=20mp=20start=20method=20to=20`sp?=
 =?UTF-8?q?=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit dffae957cee22e1aa2d442a64055b5b738e4557c.
---
 skyrl/train/entrypoints/main_base.py          |  9 ++--
 skyrl/train/utils/utils.py                    |  8 +---
 skyrl/utils/worker_setup.py                   | 19 --------
 .../skyrl_train/utils/test_worker_setup.py    | 46 -------------------
 4 files changed, 7 insertions(+), 75 deletions(-)
 delete mode 100644 skyrl/utils/worker_setup.py
 delete mode 100644 tests/backends/skyrl_train/utils/test_worker_setup.py

diff --git a/skyrl/train/entrypoints/main_base.py b/skyrl/train/entrypoints/main_base.py
index c5fd9dc92d..9ca5e58ede 100644
--- a/skyrl/train/entrypoints/main_base.py
+++ b/skyrl/train/entrypoints/main_base.py
@@ -3,6 +3,7 @@
 """
 
 import asyncio
+import multiprocessing as mp
 import os
 import sys
 from pathlib import Path
@@ -34,10 +35,12 @@
     initialize_ray,
 )
 from skyrl.utils.tok import get_tokenizer
-from skyrl.utils.worker_setup import worker_setup_fn
 
-# Run setup function to ensure driver process has consistent setup as Ray workers
-worker_setup_fn()
+# NOTE (sumanthrh): We use ray heavily and thus disable `fork` start method.
+# forking within ray leads to undefined behaviour and often causes hard to debug
+# memory leaks.  See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html
+# A common culprit is Pytorch dataloaders which use `fork` by default.
+mp.set_start_method("spawn", force=True)
 
 config_dir = str(Path(__file__).parent.parent / "config")
 __all__ = ["BasePPOExp", "config_dir"]
diff --git a/skyrl/train/utils/utils.py b/skyrl/train/utils/utils.py
index 63e08e9b2a..b2dd2a2b13 100644
--- a/skyrl/train/utils/utils.py
+++ b/skyrl/train/utils/utils.py
@@ -747,13 +747,7 @@ def initialize_ray(cfg: SkyRLTrainConfig):
 
     # log_to_driver=True allows training progress from skyrl_entrypoint to reach stdout.
     # Infrastructure logs (vLLM, workers) are redirected to log file via os.dup2 in their init.
-    ray.init(
-        runtime_env={
-            "env_vars": env_vars,
-            "worker_process_setup_hook": "skyrl.utils.worker_setup.worker_setup_fn",
-        },
-        log_to_driver=True,
-    )
+    ray.init(runtime_env={"env_vars": env_vars}, log_to_driver=True)
 
     if not verbose_logging:
         logger.info(f"Infrastructure logs will be written to: {log_file}")
diff --git a/skyrl/utils/worker_setup.py b/skyrl/utils/worker_setup.py
deleted file mode 100644
index 5a4cae5814..0000000000
--- a/skyrl/utils/worker_setup.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""Worker process setup hook for Ray workers."""
-
-import multiprocessing
-
-
-def worker_setup_fn():
-    """Set the multiprocessing start method to 'spawn' in Ray workers.
-
-    This is passed to ray.init via runtime_env["worker_process_setup_hook"].
-    We use ray and thus disable the `fork` start method. Forking within
-    ray leads to undefined behaviour and often causes hard to debug memory leaks.
-    See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html
-    A common culprit is PyTorch dataloaders which use `fork` by default.
-    """
-    try:
-        multiprocessing.set_start_method("spawn", force=True)
-    except RuntimeError:
-        # Already set — nothing to do
-        pass
diff --git a/tests/backends/skyrl_train/utils/test_worker_setup.py b/tests/backends/skyrl_train/utils/test_worker_setup.py
deleted file mode 100644
index fbdd1b0b0e..0000000000
--- a/tests/backends/skyrl_train/utils/test_worker_setup.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import multiprocessing
-
-import pytest
-import ray
-
-from skyrl.utils.worker_setup import worker_setup_fn
-
-
-@pytest.fixture(scope="module")
-def ray_with_setup_hook():
-    """Initialize Ray with the worker_process_setup_hook."""
-    if ray.is_initialized():
-        ray.shutdown()
-    ray.init(
-        runtime_env={
-            "worker_process_setup_hook": "skyrl.utils.worker_setup.worker_setup_fn",
-        },
-    )
-    yield
-    ray.shutdown()
-
-
-def test_worker_setup_fn_sets_spawn():
-    """Test that worker_setup_fn sets the mp start method to spawn."""
-    # Reset to default first
-    multiprocessing.set_start_method("fork", force=True)
-    worker_setup_fn()
-    assert multiprocessing.get_start_method() == "spawn"
-
-
-def test_worker_setup_fn_idempotent():
-    """Test that calling worker_setup_fn twice doesn't raise."""
-    multiprocessing.set_start_method("spawn", force=True)
-    worker_setup_fn()  # should not raise
-    assert multiprocessing.get_start_method() == "spawn"
-
-
-@ray.remote
-def _get_mp_start_method():
-    return multiprocessing.get_start_method()
-
-
-def test_worker_setup_hook_applied_in_ray_worker(ray_with_setup_hook):
-    """Test that Ray workers have mp start method set to spawn via the hook."""
-    result = ray.get(_get_mp_start_method.remote())
-    assert result == "spawn"

From dcf3f7e8eacd5749f0e918465c457855532fd0bb Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Wed, 18 Mar 2026 23:45:41 +0000
Subject: [PATCH 2/5] x

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 skyrl/train/utils/trainer_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/skyrl/train/utils/trainer_utils.py b/skyrl/train/utils/trainer_utils.py
index 2d4e6e24b1..cd9e88f923 100644
--- a/skyrl/train/utils/trainer_utils.py
+++ b/skyrl/train/utils/trainer_utils.py
@@ -690,6 +690,10 @@ def build_dataloader(
         num_workers=0 if cfg.generator.inference_engine.enable_http_endpoint else 8,
         drop_last=True if is_train else False,
         generator=seeded_generator,
+        # NOTE (sumanthrh): We use ray and thus use `spawn` start method.
+        # forking within ray leads to undefined behaviour and often causes hard to debug
+        # memory leaks.  See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html
+        multiprocessing_context="spawn",
     )
     if is_train:
         if not is_fully_async:

From 9a7044c67fd180ba45513ba8765510ccd7ad3652 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Wed, 18 Mar 2026 23:55:25 +0000
Subject: [PATCH 3/5] x

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py
index 43112dcc10..65ce0386de 100644
--- a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py
+++ b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py
@@ -174,6 +174,7 @@ def build_dataloader(
         num_workers=0 if cfg.generator.inference_engine.enable_http_endpoint else 8,
         drop_last=True if is_train else False,
         generator=seeded_generator,
+        multiprocessing_context="spawn",
     )
     if is_train:
         if not is_fully_async:

From cbff560b96e7b88defaf311a877cb1f549b2309b Mon Sep 17 00:00:00 2001
From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com>
Date: Wed, 18 Mar 2026 17:02:02 -0700
Subject: [PATCH 4/5] Update
 skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py

Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py
index 65ce0386de..2135651f79 100644
--- a/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py
+++ b/skyrl-agent/skyrl_agent/integrations/skyrl_train/trainer.py
@@ -174,7 +174,7 @@ def build_dataloader(
         num_workers=0 if cfg.generator.inference_engine.enable_http_endpoint else 8,
         drop_last=True if is_train else False,
         generator=seeded_generator,
-        multiprocessing_context="spawn",
+        multiprocessing_context="spawn" if not cfg.generator.inference_engine.enable_http_endpoint else None,
     )
     if is_train:
         if not is_fully_async:

From ca36132dca8484265c82180f1a509cc74d0926a9 Mon Sep 17 00:00:00 2001
From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com>
Date: Wed, 18 Mar 2026 17:02:10 -0700
Subject: [PATCH 5/5] Update skyrl/train/utils/trainer_utils.py

Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 skyrl/train/utils/trainer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skyrl/train/utils/trainer_utils.py b/skyrl/train/utils/trainer_utils.py
index cd9e88f923..b2c27e5a1c 100644
--- a/skyrl/train/utils/trainer_utils.py
+++ b/skyrl/train/utils/trainer_utils.py
@@ -693,7 +693,7 @@ def build_dataloader(
         # NOTE (sumanthrh): We use ray and thus use `spawn` start method.
         # forking within ray leads to undefined behaviour and often causes hard to debug
         # memory leaks.  See: https://docs.ray.io/en/latest/ray-core/patterns/fork-new-processes.html
-        multiprocessing_context="spawn",
+        multiprocessing_context="spawn" if not cfg.generator.inference_engine.enable_http_endpoint else None,
     )
     if is_train:
         if not is_fully_async: