Merged
33 commits
76983cf
JaxTrainer support in V2 with SPMD
ryanaoleary Jul 30, 2025
703e185
Fix errors and comments
ryanaoleary Aug 5, 2025
69d349c
Add minimal unit tests and fix scheduling logic to use one placement …
ryanaoleary Aug 6, 2025
b9f0764
Merge branch 'master' into implement-jax-trainer
ryanaoleary Aug 6, 2025
c48734a
Merge branch 'master' into implement-jax-trainer
ryanaoleary Aug 7, 2025
360b952
Fix tpu default labels population
ryanaoleary Aug 7, 2025
a341722
Add callback and fix other comments
ryanaoleary Aug 7, 2025
d2ee6a1
Fix wording and remove unneeded change
ryanaoleary Aug 7, 2025
7f165a6
Fix test code string
ryanaoleary Aug 7, 2025
566a788
remove unused fields from config
ryanaoleary Aug 8, 2025
4885a45
Make fields required for multi-host 'use_tpu'
ryanaoleary Aug 8, 2025
62730cd
Reserve tpu slice take required args
ryanaoleary Aug 8, 2025
7ccc7c5
Update example now that tested on v6e
ryanaoleary Aug 8, 2025
2fb6e88
add jax pip install
ryanaoleary Aug 9, 2025
536f83c
Merge branch 'master' into implement-jax-trainer
ryanaoleary Aug 9, 2025
dbbd4cf
Merge branch 'master' into implement-jax-trainer
ryanaoleary Aug 11, 2025
4e8bbc2
ensure TPUReservationCallback is executed and fix minor review comments
andrewsykim Aug 12, 2025
444b9de
refactor _JaxBackend.onStart to fetch unique worker address and port …
andrewsykim Aug 12, 2025
27ab61b
Merge branch 'master' into implement-jax-trainer
andrewsykim Aug 12, 2025
e530846
temporarily remove TPU callback import
andrewsykim Aug 12, 2025
67f077c
Revert "temporarily remove TPU callback import"
andrewsykim Aug 12, 2025
05e61cf
formatting fixes
andrewsykim Aug 12, 2025
5588927
fix lint errors
andrewsykim Aug 12, 2025
116c37f
move some TPU util functions to Ray Core to resolve import errors
andrewsykim Aug 12, 2025
4a8ea0d
remove test_tpu_utils reference in ray train v2 BUILD
andrewsykim Aug 12, 2025
e069ab7
fix code format and lint failures
andrewsykim Aug 12, 2025
a8b1829
fix code formatting
andrewsykim Aug 13, 2025
4a21677
move field assertion to after use_tpu condition
andrewsykim Aug 13, 2025
64c5a95
Merge branch 'master' into implement-jax-trainer
andrewsykim Aug 13, 2025
6598487
address nits from matthewdeng
andrewsykim Aug 13, 2025
b8638c4
remove unused import caught by lint check
andrewsykim Aug 13, 2025
1028bfe
fix docstring for JaxTrainer
andrewsykim Aug 13, 2025
f10e268
fix docstring order for JaxTrainer
andrewsykim Aug 13, 2025
17 changes: 17 additions & 0 deletions python/ray/_private/accelerators/tpu.py
@@ -110,6 +110,23 @@ def get_tpu_cores_per_chip(accelerator_type: str) -> int:
    return DEFAULT_TPU_NUM_CORES_PER_CHIP


def infer_tpu_pod_type_from_topology(
    topology: str, accelerator_type: str
) -> Optional[str]:
    """Infer the TPU pod type (e.g. v4-32) from topology and accelerator type."""
    try:
        num_chips = 1
        for value in topology.strip().lower().split("x"):
            num_chips *= int(value)
        generation = accelerator_type.lower().replace("tpu-", "")
        return f"{generation}-{num_chips}"
    except Exception as e:
        logger.warning(
            f"Failed to infer pod type from topology {topology} and type {accelerator_type}: {e}"
        )
        return None


class TPUAcceleratorManager(AcceleratorManager):
"""Google TPU accelerators."""

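A quick worked example of the new helper (a sketch; the results follow directly from the function above):

    from ray._private.accelerators import tpu

    # "2x2x4" describes 2 * 2 * 4 = 16 chips; "TPU-V4" normalizes to generation "v4".
    assert tpu.infer_tpu_pod_type_from_topology("2x2x4", "TPU-V4") == "v4-16"
    # Malformed input logs a warning and returns None instead of raising.
    assert tpu.infer_tpu_pod_type_from_topology("4x", "TPU-V3") is None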
8 changes: 4 additions & 4 deletions python/ray/_private/resource_and_label_spec.py
@@ -10,7 +10,6 @@
from ray._common.utils import RESOURCE_CONSTRAINT_PREFIX
from ray._private import accelerators
from ray._private.accelerators import AcceleratorManager
from ray._private.accelerators.tpu import TPUAcceleratorManager

logger = logging.getLogger(__name__)

@@ -292,10 +291,11 @@ def _get_default_labels(
ray._raylet.RAY_NODE_ACCELERATOR_TYPE_KEY
] = accelerator_type

# Set TPU specific default labels to enable SPMD scheduling.
if isinstance(accelerator_manager, TPUAcceleratorManager):
# Set TPU specific default labels to enable multi-host scheduling.
if accelerator_manager.get_resource_name() == "TPU":
tpu_labels = accelerator_manager.get_current_node_accelerator_labels()
default_labels.update(tpu_labels)
if tpu_labels:
default_labels.update(tpu_labels)

return default_labels

21 changes: 21 additions & 0 deletions python/ray/tests/accelerators/test_tpu.py
@@ -353,5 +353,26 @@ def test_get_current_node_tpu_topology_from_metadata():
    assert topology == "2x2x4"


@pytest.mark.parametrize(
    "topology, accelerator_type, expected_pod_type",
    [
        ("2x4", "TPU-V6E", "v6e-8"),
        ("2x2x2", "TPU-V4", "v4-8"),
        ("2x4x4", "TPU-V3", "v3-32"),
        ("4x4", "TPU-V5P", "v5p-16"),
        ("8x16", "TPU-V6E", "v6e-128"),
        ("", "TPU-V3", None),
        ("4x", "TPU-V3", None),
    ],
)
def test_infer_tpu_pod_type_from_topology(
    topology, accelerator_type, expected_pod_type
):
    assert (
        tpu.infer_tpu_pod_type_from_topology(topology, accelerator_type)
        == expected_pod_type
    )


if __name__ == "__main__":
    sys.exit(pytest.main(["-sv", __file__]))
32 changes: 32 additions & 0 deletions python/ray/train/v2/BUILD
@@ -484,3 +484,35 @@ py_test(
"//:ray_lib",
],
)

py_test(
name = "test_jax_trainer",
size = "small",
srcs = ["tests/test_jax_trainer.py"],
env = {"RAY_TRAIN_V2_ENABLED": "1"},
tags = [
"exclusive",
"team:ml",
"train_v2",
],
deps = [
":conftest",
"//:ray_lib",
],
)

py_test(
name = "test_tpu_utils",
size = "small",
srcs = ["tests/test_tpu_utils.py"],
env = {"RAY_TRAIN_V2_ENABLED": "1"},
tags = [
"exclusive",
"team:ml",
"train_v2",
],
deps = [
":conftest",
"//:ray_lib",
],
)
@@ -0,0 +1,42 @@
from typing import Dict, Optional

import ray
from ray.train.v2._internal.execution.callback import ControllerCallback
from ray.train.v2.api.config import ScalingConfig
from ray.train.v2.jax.tpu_utils import reserve_tpu_slice


class TPUReservationCallback(ControllerCallback):
    """A callback to handle TPU slice reservation for multi-host training."""

    def on_controller_start_worker_group(
        self, *, scaling_config: ScalingConfig, num_workers: int
    ) -> Optional[Dict[str, str]]:
        """Reserves a multi-host TPU slice before the worker group starts.

        This hook is called by the TrainController. It checks if multi-host
        TPUs are being used and, if so, reserves a slice.

        Args:
            scaling_config: The scaling configuration for the run.
            num_workers: The number of workers to be started.

Review comment (Contributor): num_workers can come from scaling_config as well.
Reply (Member): added a check for both

        Returns:
            A dictionary defining a `bundle_label_selector` to gang schedule
            the worker group on the reserved TPU slice.
        """
        bundle_label_selector = None

        if getattr(scaling_config, "use_tpu", False) and num_workers > 1:
            slice_name = reserve_tpu_slice(
                topology=getattr(scaling_config, "topology", None),
                accelerator_type=getattr(scaling_config, "accelerator_type", None),

Review comment (Contributor): Assert that topology and accelerator_type are both set here.
Note to self: not sure if it's best to have the validation here, when this Callback is initialized, or when validating the ScalingConfig.
Reply (Member): done

            )

Review comment (Contributor): Access attributes directly from ScalingConfig rather than getattr.

            if not slice_name:
                raise RuntimeError("Failed to reserve TPU slice.")

            bundle_label_selector = {
                ray._raylet.RAY_NODE_TPU_SLICE_NAME_KEY: slice_name
            }

        return bundle_label_selector
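To make the hook's contract concrete, a minimal sketch of calling it directly; the ScalingConfig values are hypothetical, and in practice the TrainController invokes this hook (running it for real requires a cluster with a reservable multi-host TPU slice):

    from ray.train.v2.api.config import ScalingConfig

    # Hypothetical multi-host v4 slice: 4 workers, 4 TPU chips each.
    callback = TPUReservationCallback()
    selector = callback.on_controller_start_worker_group(
        scaling_config=ScalingConfig(
            num_workers=4,
            use_tpu=True,
            topology="2x2x4",
            accelerator_type="TPU-V4",
            resources_per_worker={"TPU": 4},
        ),
        num_workers=4,
    )
    # On success, `selector` maps the TPU slice-name label key to the reserved
    # slice, e.g. {ray._raylet.RAY_NODE_TPU_SLICE_NAME_KEY: "<slice name>"}.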
19 changes: 19 additions & 0 deletions python/ray/train/v2/_internal/execution/callback.py
@@ -2,6 +2,7 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from ray.train.v2.api.callback import RayTrainCallback
from ray.train.v2.api.config import ScalingConfig
from ray.train.v2.api.result import Result
from ray.util.annotations import DeveloperAPI

@@ -78,6 +79,24 @@ def after_controller_start(self, train_run_context: "TrainRunContext"):
before the control loop starts executing."""
pass

    def on_controller_start_worker_group(

Review comment (Contributor): Add a TODO for me to come back to this interface. It works well for this use case but I think it'll be hard to extend/support different use-cases in the future.
  1. The bundle_label_selector logic is a bit specific for this callback.
  2. Behavior for creating multiple callbacks with bundle_label_selector logic is undefined.
Reply (Member): Added this:

    # TODO(matthewdeng): Revisit this callback interface for better extensibility.
    # This hook was added for the specific use case of setting a `bundle_label_selector`
    # for new worker groups (e.g., for TPU reservations). The current interface is
    # tightly coupled to this purpose and limits its reuse for other use-cases.

        self, *, scaling_config: ScalingConfig, num_workers: int
    ) -> Optional[Dict[str, str]]:
        """Called by the TrainController before the worker group is started.

        This hook can be used to perform setup that modifies the worker group's
        placement, such as reserving an accelerator slice.

        Args:
            scaling_config: The scaling configuration for the run.
            num_workers: The number of workers to be started.

        Returns:
            An optional dictionary defining a `bundle_label_selector`
            to gang schedule the worker group on the reserved TPU slice.
        """
        return None
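As an illustration of the new hook's contract, a hypothetical ControllerCallback that pins every worker bundle to pre-labeled nodes; the class name and the label key/value are made up for this sketch:

    from typing import Dict, Optional

    from ray.train.v2._internal.execution.callback import ControllerCallback
    from ray.train.v2.api.config import ScalingConfig


    class PinToLabeledNodesCallback(ControllerCallback):
        """Hypothetical example: gang-schedule workers onto pre-labeled nodes."""

        def on_controller_start_worker_group(
            self, *, scaling_config: ScalingConfig, num_workers: int
        ) -> Optional[Dict[str, str]]:
            # Returning a dict makes the controller apply it as the
            # bundle_label_selector for every bundle in the placement group.
            return {"example.com/reserved-pool": "train-job-1"}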

    def before_controller_shutdown(self):
        """Called before `TrainController.run` exits,
        after the control loop has exited."""
@@ -280,12 +280,29 @@ def _start_worker_group(
            ControllerError if the worker group failed to start.
        """
        placement_strategy = self._scaling_policy.scaling_config.placement_strategy
        scaling_config = self._train_run_context.scaling_config

        # Check for `bundle_label_selector` to influence WorkerGroup scheduling.
        bundle_label_selector = None
        try:
            for callback in self._callbacks:
                if hasattr(callback, "on_controller_start_worker_group"):
                    selector = callback.on_controller_start_worker_group(
                        scaling_config=scaling_config, num_workers=num_workers

Review comment (Contributor): Get the ControllerCallback instead, which will always have this method.
Reply (Member): done

                    )
                    if selector:
                        bundle_label_selector = selector
                        break
        except Exception as e:
            return ControllerError(e)

        worker_group_context = WorkerGroupContext(
            run_attempt_id=self._get_run_attempt_id(),
            train_fn_ref=self._train_fn_ref,
            num_workers=num_workers,
            resources_per_worker=resources_per_worker,
            placement_strategy=placement_strategy,
            bundle_label_selector=bundle_label_selector,
        )
        try:
            self._worker_group = self.worker_group_cls.create(
@@ -89,13 +89,15 @@ class WorkerGroupContext:
        num_workers: The number of workers in the worker group.
        resources_per_worker: The resources per worker.
        placement_strategy: Strategy for placing workers.
        bundle_label_selector: Optional label selectors to apply per-bundle for workers.
    """

    run_attempt_id: str
    train_fn_ref: ObjectRefWrapper[Callable[[], None]]
    num_workers: int
    resources_per_worker: Dict[str, float]
    placement_strategy: str = "PACK"
    bundle_label_selector: Optional[Dict[str, str]] = None


class WorkerGroup:
@@ -268,10 +270,18 @@ def _start_impl(
        for callback in self._callbacks:
            callback.before_worker_group_start(worker_group_context)

        bundle_label_selector = (
            [worker_group_context.bundle_label_selector.copy()]
            * worker_group_context.num_workers
            if worker_group_context.bundle_label_selector
            else None
        )

        pg = placement_group(
            bundles=[worker_group_context.resources_per_worker]
            * worker_group_context.num_workers,
            strategy=worker_group_context.placement_strategy,
            bundle_label_selector=bundle_label_selector,
        )
        logger.info(
            f"Attempting to start training worker group of size {worker_group_context.num_workers} with "
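The single selector from the worker group context is replicated once per bundle, so every worker bundle must land on a node carrying the reserved slice's label. A minimal sketch of the resulting placement group request, with a hypothetical slice name and bundle shape:

    import ray
    from ray.util.placement_group import placement_group

    slice_name = "example-reserved-slice"  # hypothetical value returned by reserve_tpu_slice
    selector = {ray._raylet.RAY_NODE_TPU_SLICE_NAME_KEY: slice_name}

    # Four workers, each requesting 4 TPU chips, gang-scheduled onto the slice.
    pg = placement_group(
        bundles=[{"TPU": 4}] * 4,
        strategy="SPREAD",
        bundle_label_selector=[selector.copy()] * 4,
    )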
48 changes: 47 additions & 1 deletion python/ray/train/v2/api/config.py
@@ -1,7 +1,7 @@
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Union

import pyarrow.fs

@@ -21,7 +21,9 @@

if TYPE_CHECKING:
    from ray.train import UserCallback
    from ray.tune.search.sample import Domain

SampleRange = Union["Domain", Dict[str, List]]

Review comment (Contributor): We can remove this.
Suggested change:
    from ray.tune.search.sample import Domain
    SampleRange = Union["Domain", Dict[str, List]]
Reply (Member): done


logger = logging.getLogger(__name__)

@@ -52,6 +54,14 @@ class ScalingConfig(ScalingConfigV1):
            See :ref:`the available accelerator types <accelerator_types>`.
            Ensure that your cluster has instances with the specified accelerator type
            or is able to autoscale to fulfill the request.
        use_tpu: [Experimental] If True, training will be done on TPUs (1 TPU VM
            per worker). Defaults to False. The number of TPUs reserved by each
            worker can be overridden with the ``resources_per_worker``
            argument. This arg enables SPMD execution of the training workload.
        topology: [Experimental] If specified, Ray Train will launch the training

Review comment (Contributor): Is this specific to TPUs? Should it be tpu_topology instead?
Reply (Contributor Author): I'm not super familiar with GPUs, but I think the field can probably be extended to set fields automatically in the Config (when left out) for GPUs too - so leaving it as topology might be fine. I don't have much of a preference either way though.

            coordinator and workers on nodes with the specified topology. Topology is
            auto-detected for TPUs and added as Ray node labels. This arg enables
            SPMD execution of the training workload.

Review comment (Contributor): Is topology a first-class Ray Core concept? We'd want to make sure it's easy to understand from the API what inputs this takes in and how it'll be used.
Review comment (Contributor): Also, for TPU users, how familiar are topology/accelerator? Would it be easier for the user to just specify the pod type directly?
Reply (Contributor Author, ryanaoleary, Aug 7, 2025): I don't see topology used in Ray core at all, except to configure TPU env vars and node labels - but any users of multi-host TPUs should be familiar with the concept. The concept is also already introduced in KubeRay through the numOfHosts field: https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/tpu.html.
I think topology and accelerator type are the best top-level variables for users to specify, since currently in GKE these are the two values users configure when creating their GKE nodepool and when scheduling pods to it using the cloud.google.com/gke-tpu-accelerator and cloud.google.com/gke-tpu-topology nodeSelectors: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus.
Reply: Topology is quite a standard TPU concept. TPU type / pod type is in some cases not uniquely mapped to a topology.
Reply (Contributor): Awesome, is it safe to say that this API would then be super intuitive for a TPU user? Is there any other grouping/organization that might be more natural to how a user thinks about setting up their workload?

        scaling_config=ScalingConfig(
            use_tpu=True,
            num_workers=4,
            topology="2x2x4",
            accelerator_type="TPU-V4",
            resources_per_worker={"TPU": 4},
            placement_strategy="SPREAD",
        ),

Reply (Contributor Author, ryanaoleary, Aug 7, 2025): Yeah, I think this top-level API should be clear for TPU users - the only thing I can think of is that we could have num_workers, resources_per_worker, and placement_strategy be auto-set based on the topology if not provided. For example, if we have a multi-host topology of 4x4 v6e, we could automatically detect that num_workers should be 4, resources_per_worker should be TPU: 4 since that's the number of chips on each host, and placement_strategy should be SPREAD.

Example:

@@ -73,17 +83,53 @@ class ScalingConfig(ScalingConfigV1):
"""

    trainer_resources: Optional[dict] = None
    use_tpu: Union[bool, SampleRange] = False

Review comment (Contributor): Remove SampleRange part, no longer needed for Train V2.

    topology: Optional[str] = None

    def __post_init__(self):
        if self.trainer_resources is not None:
            raise DeprecationWarning(TRAINER_RESOURCES_DEPRECATION_MESSAGE)

        if self.resources_per_worker:
            if self.use_gpu and self.use_tpu:
                raise ValueError(
                    "Cannot specify both `use_gpu=True` and `use_tpu=True`."
                )

Review comment (Contributor): Don't think the outer resources_per_worker check is needed.


            if not self.use_tpu and self.num_tpus_per_worker > 0:
                raise ValueError(
                    "`use_tpu` is False but `TPU` was found in "
                    "`resources_per_worker`. Either set `use_tpu` to True or "
                    "remove `TPU` from `resources_per_worker`."
                )

            if self.use_tpu and self.num_tpus_per_worker == 0:
                raise ValueError(
                    "`use_tpu` is True but `TPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_tpu` to False or "
                    "request a positive number of `TPU` in "
                    "`resources_per_worker`."
                )

        super().__post_init__()

    @property
    def _resources_per_worker_not_none(self):
        if self.resources_per_worker is None:
            if self.use_tpu:
                return {"TPU": 1}

        return super()._resources_per_worker_not_none

    @property
    def _trainer_resources_not_none(self):
        return {}

    @property
    def num_tpus_per_worker(self):
        """The number of TPUs to set per worker."""
        return self._resources_per_worker_not_none.get("TPU", 0)


@dataclass
class FailureConfig(FailureConfigV1):
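For orientation, a rough end-to-end sketch of how these ScalingConfig fields are meant to be used with the new trainer; the JaxTrainer constructor arguments are not shown in this diff, so the kwargs below are assumptions modeled on other Ray Train trainers:

    from ray.train.v2.api.config import ScalingConfig
    from ray.train.v2.jax import JaxTrainer


    def train_loop_per_worker():
        import jax

        # Each worker should see its local TPU devices once the Jax backend
        # has initialized the distributed runtime.
        print(jax.devices())


    trainer = JaxTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=ScalingConfig(
            use_tpu=True,
            num_workers=4,
            topology="2x2x4",
            accelerator_type="TPU-V4",
            resources_per_worker={"TPU": 4},
            placement_strategy="SPREAD",
        ),
    )
    result = trainer.fit()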
15 changes: 15 additions & 0 deletions python/ray/train/v2/jax/__init__.py
@@ -0,0 +1,15 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    try:
        import jax  # noqa: F401
    except ModuleNotFoundError as exception:
        raise ModuleNotFoundError(
            "Jax isn't installed. To install Jax, please check"
            " `https://github.com/google/jax#installation` for the instructions."
        ) from exception

from ray.train.v2.jax.config import JaxConfig
from ray.train.v2.jax.jax_trainer import JaxTrainer

__all__ = ["JaxConfig", "JaxTrainer"]