[train] after_worker_group_poll_status errors result in ControllerError (#57869)

TimothySeah · web-flow · commit f5abbb8f4aa9 · 2025-10-22T00:11:53.000Z
# Summary

We observed that whenever `after_worker_group_poll_status` raised an
exception, the Train Run would fail ungracefully and show up as
`ABORTED` in the dashboard. This happened in the following situations:
1) Different workers report remote checkpoints with different paths ->
`(TrainController pid=46993) RuntimeError: The storage path of the
checkpoints in the training results is not the same. This means the
checkpoints are not consistent. Got a mix of the following checkpoint
paths: {'/tmp/tmpl95kv7ax', '/tmp/tmp__8e6etk'}` -> `ABORTED` Train Run
2) `ray.train.report({"loss": ...}, checkpoint=checkpoint)` in
`train_func` -> `TypeError: Object of type 'ellipsis' is not JSON
serializable` in `CheckpointManager._save_state` -> `ABORTED` Train Run

This PR catches these exceptions, wraps them in a `ControllerError`, and
goes through the `FailurePolicy`, ultimately resulting in an `ERRORED`
Train Run, which is more intuitive because it happened due to an error
in the training workers (`The Train run failed due to an error in the
training workers.` is the comment associated with `RunStatus.ERRORED`).

I considered implementing a more general solution that caught all
`WorkerGroupCallback` errors and resurfaced them as `ControllerError`s,
but decided against it because:
* Callbacks occur in many different places and we might want to add
custom try/catch logic in each case.
* `after_worker_group_poll_status` is the only offender so far, and most
of its errors come from user mistakes; other callback errors could be
legitimate bugs that should still result in `ABORTED`.

# Testing

Unit tests

---------

Signed-off-by: Timothy Seah <tseah@anyscale.com>
diff --git a/python/ray/train/v2/_internal/execution/controller/controller.py b/python/ray/train/v2/_internal/execution/controller/controller.py
@@ -408,7 +408,16 @@ async def _step(self) -> TrainControllerLoopIterationResult:
             assert isinstance(controller_state.scaling_decision, ResizeDecision)
             return self._execute_resize_decision(controller_state.scaling_decision)
         elif isinstance(controller_state, RunningState):
-            worker_group_status: WorkerGroupPollStatus = await self._poll_workers()
+            try:
+                worker_group_status: WorkerGroupPollStatus = await self._poll_workers()
+            except Exception as e:
+                training_failed_error = ControllerError(e)
+                failure_decision = self._failure_policy.make_decision(
+                    training_failed_error=training_failed_error,
+                )
+                return self._execute_failure_decision(
+                    failure_decision, training_failed_error=training_failed_error
+                )
 
             if worker_group_status.finished and not worker_group_status.errors:
                 return TrainControllerLoopIterationResult(
diff --git a/python/ray/train/v2/tests/test_controller.py b/python/ray/train/v2/tests/test_controller.py
@@ -1,4 +1,4 @@
-from unittest.mock import MagicMock, create_autospec
+from unittest.mock import create_autospec
 
 import pytest
 
@@ -27,6 +27,7 @@
     NoopDecision,
     ResizeDecision,
 )
+from ray.train.v2._internal.execution.worker_group import WorkerGroupPollStatus
 from ray.train.v2.api.config import ScalingConfig
 from ray.train.v2.tests.util import (
     DummyObjectRefWrapper,
@@ -45,6 +46,8 @@ def patch_worker_group(monkeypatch):
     # Make polling interval 0 to speed up tests
     monkeypatch.setenv(HEALTH_CHECK_INTERVAL_S_ENV_VAR, "0")
     yield
+    DummyWorkerGroup.set_poll_failure(None)
+    DummyWorkerGroup.set_start_failure(None)
 
 
 @pytest.fixture(autouse=True)
@@ -167,7 +170,7 @@ async def test_failure_handling():
     await controller._run_control_loop_iteration()
     assert isinstance(controller.get_state(), RunningState)
 
-    controller.get_worker_group().error_worker(3)
+    DummyWorkerGroup.set_poll_failure(RuntimeError("Simulated poll failure"))
     failure_policy.queue_decision(FailureDecision.RAISE)
     await controller._run_control_loop_iteration()
     assert isinstance(controller.get_state(), ErroredState)
@@ -177,7 +180,7 @@ async def test_failure_handling():
     "error_type", [WorkerGroupStartupFailedError, WorkerGroupStartupTimeoutError(2)]
 )
 @pytest.mark.asyncio
-async def test_worker_group_start_failure(monkeypatch, error_type):
+async def test_worker_group_start_failure(error_type):
     """Check that controller can gracefully handle worker group start failures."""
     scaling_policy = MockScalingPolicy(scaling_config=ScalingConfig())
     failure_policy = MockFailurePolicy(failure_config=None)
@@ -189,7 +192,6 @@ async def test_worker_group_start_failure(monkeypatch, error_type):
         failure_policy=failure_policy,
     )
     DummyWorkerGroup.set_start_failure(error_type)
-    monkeypatch.setattr(TrainController, "worker_group_cls", DummyWorkerGroup)
 
     assert isinstance(controller.get_state(), InitializingState)
 
@@ -208,7 +210,6 @@ async def test_worker_group_start_failure(monkeypatch, error_type):
 
     # Let the worker group start successfully the 2nd time.
     DummyWorkerGroup.set_start_failure(None)
-    monkeypatch.setattr(TrainController, "worker_group_cls", DummyWorkerGroup)
     scaling_policy.queue_recovery_decision(
         ResizeDecision(num_workers=2, resources_per_worker={})
     )
@@ -239,7 +240,10 @@ async def sleep_mock(t):
         failure_policy=None,
     )
     # Mock worker group to avoid actual polling
-    controller._worker_group = MagicMock()
+    controller._worker_group = create_autospec(DummyWorkerGroup, instance=True)
+    controller._worker_group.poll_status.return_value = WorkerGroupPollStatus(
+        worker_statuses={}
+    )
 
     num_polls = 5
     for _ in range(num_polls):
diff --git a/python/ray/train/v2/tests/util.py b/python/ray/train/v2/tests/util.py
@@ -45,6 +45,7 @@
 class DummyWorkerGroup(WorkerGroup):
 
     _start_failure = None
+    _poll_failure = None
 
     # TODO: Clean this up and use Mocks instead.
     def __init__(
@@ -58,6 +59,8 @@ def __init__(
         self._worker_statuses = {}
 
     def poll_status(self, *args, **kwargs) -> WorkerGroupPollStatus:
+        if self._poll_failure:
+            raise self._poll_failure
         return WorkerGroupPollStatus(
             worker_statuses=self._worker_statuses,
         )
@@ -97,6 +100,10 @@ def finish_worker(self, worker_index):
     def set_start_failure(cls, start_failure):
         cls._start_failure = start_failure
 
+    @classmethod
+    def set_poll_failure(cls, poll_failure):
+        cls._poll_failure = poll_failure
+
 
 class MockScalingPolicy(ScalingPolicy):
     def __init__(self, scaling_config):