diff --git a/.github/workflows/modal-accelerate.yml b/.github/workflows/modal-accelerate.yml
new file mode 100644
index 000000000000..9d59e22658f8
--- /dev/null
+++ b/.github/workflows/modal-accelerate.yml
@@ -0,0 +1,99 @@
+name: modal-accelerate
+
+# This CI is running on modal.com's GPUs.
+#
+# It's set up here on GitHub Actions and then the cloned repo is sent to modal and everything
+# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
+# run.
+#
+# Both files are annotated with what's important and how one might change or update things if needed.
+#
+# Note that since this is a Required job we can't use the `on.<event>.paths` file filter - we use
+# the collect-tests job to do the filtering for us so that the deploy job can be skipped and still
+# satisfy the Required status for PRs to pass.
+#
+
+
+on:
+  workflow_dispatch:  # allows starting the workflow manually
+  push:
+    branches:
+      - master
+
+  pull_request:
+    paths-ignore:  # PRs that only touch these paths do not trigger this workflow
+      - 'docs/**'
+      - 'blogs/**'
+      - 'deepspeed/inference/v2/**'
+      - 'tests/unit/inference/v2/**'
+    types: [opened, ready_for_review, synchronize]  # only valid pull_request activity types
+    branches:
+      - master
+
+concurrency:
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in progress
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}  # per workflow + ref (run id as fallback)
+
+jobs:
+  collect-tests:
+    name: Collect tests to run
+    runs-on: ubuntu-latest
+    outputs:
+      # produced by the `filter` step below; the deploy job compares this value against 'true'
+      deepspeed: ${{ steps.filter.outputs.deepspeed }}
+    permissions:
+      contents: read
+      pull-requests: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Filter changed files
+        id: filter
+        uses: dorny/paths-filter@v2
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          filters: |
+            deepspeed:
+              - 'deepspeed/**'
+              - '.github/workflows/modal*.yml'
+              - 'ci/**'
+              - 'tests/unit/**'
+              - 'csrc/**'
+
+  deploy:
+    name: DeepSpeedAI CI
+    runs-on: ubuntu-latest
+    needs: collect-tests
+    env:
+      # these are created at https://modal.com/settings/deepspeedai/tokens
+      # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+      # this one comes from https://huggingface.co/settings/profile of the bot user
+      # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    # manual (workflow_dispatch) runs always execute; other events run only when collect-tests matched changes
+    if: github.event_name == 'workflow_dispatch' || needs.collect-tests.outputs.deepspeed == 'true'
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+          cache: 'pip' # caching pip dependencies
+
+      - name: Install build dependencies
+        run: |
+          pip install uv # much faster than pip
+          uv pip install --system modal
+
+      - name: Run tests
+        run: |
+          modal run -m ci.accelerate
diff --git a/.github/workflows/modal-torch-latest.yml b/.github/workflows/modal-torch-latest.yml
new file mode 100644
index 000000000000..5305f10aea54
--- /dev/null
+++ b/.github/workflows/modal-torch-latest.yml
@@ -0,0 +1,99 @@
+name: modal-torch-latest
+
+# This CI is running on modal.com's GPUs.
+#
+# It's set up here on GitHub Actions and then the cloned repo is sent to modal and everything
+# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
+# run.
+#
+# Both files are annotated with what's important and how one might change or update things if needed.
+#
+# Note that since this is a Required job we can't use the `on.<event>.paths` file filter - we use
+# the collect-tests job to do the filtering for us so that the deploy job can be skipped and still
+# satisfy the Required status for PRs to pass.
+#
+
+
+on:
+  workflow_dispatch:  # allows starting the workflow manually
+  push:
+    branches:
+      - master
+
+  pull_request:
+    paths-ignore:  # PRs that only touch these paths do not trigger this workflow
+      - 'docs/**'
+      - 'blogs/**'
+      - 'deepspeed/inference/v2/**'
+      - 'tests/unit/inference/v2/**'
+    types: [opened, ready_for_review, synchronize]  # only valid pull_request activity types
+    branches:
+      - master
+
+concurrency:
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in progress
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}  # per workflow + ref (run id as fallback)
+
+jobs:
+  collect-tests:
+    name: Collect tests to run
+    runs-on: ubuntu-latest
+    outputs:
+      # produced by the `filter` step below; the deploy job compares this value against 'true'
+      deepspeed: ${{ steps.filter.outputs.deepspeed }}
+    permissions:
+      contents: read
+      pull-requests: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Filter changed files
+        id: filter
+        uses: dorny/paths-filter@v2
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          filters: |
+            deepspeed:
+              - 'deepspeed/**'
+              - '.github/workflows/modal*.yml'
+              - 'ci/**'
+              - 'tests/unit/**'
+              - 'csrc/**'
+
+  deploy:
+    name: DeepSpeedAI CI
+    runs-on: ubuntu-latest
+    needs: collect-tests
+    env:
+      # these are created at https://modal.com/settings/deepspeedai/tokens
+      # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+      # this one comes from https://huggingface.co/settings/profile of the bot user
+      # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    # manual (workflow_dispatch) runs always execute; other events run only when collect-tests matched changes
+    if: github.event_name == 'workflow_dispatch' || needs.collect-tests.outputs.deepspeed == 'true'
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+          cache: 'pip' # caching pip dependencies
+
+      - name: Install build dependencies
+        run: |
+          pip install uv # much faster than pip
+          uv pip install --system modal
+
+      - name: Run tests
+        run: |
+          modal run -m ci.torch_latest
diff --git a/ci/__init__.py b/ci/__init__.py
new file mode 100644
index 000000000000..6f5f5619004b
--- /dev/null
+++ b/ci/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) DeepSpeed Team.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
diff --git a/ci/accelerate.py b/ci/accelerate.py
new file mode 100644
index 000000000000..f9fc09d75f19
--- /dev/null
+++ b/ci/accelerate.py
@@ -0,0 +1,43 @@
+# Copyright (c) Snowflake.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from pathlib import Path
+
+import modal
+
+ROOT_PATH = Path(__file__).parents[1]  # directory one level above ci/, i.e. the repository checkout
+
+# yapf: disable
+image = (modal.Image
+         .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")  # base image
+         .run_commands("apt update && apt install -y libaio-dev")
+         .apt_install("git")  # required by the `git clone` step below
+         .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
+         .run_commands(
+                "git clone https://github.com/huggingface/accelerate && \
+                uv pip install --system --compile-bytecode ./accelerate[testing]"
+            )
+         .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
+         .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
+         .add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)  # copied so the next step can install it
+         .run_commands("pip install /root")  # installs the checkout that was just copied to /root
+         # NOTE(review): the three dirs below are presumably placed where the package expects them - confirm
+         .add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
+         .add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
+         .add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder"))
+
+app = modal.App("deepspeedai-accelerate-ci", image=image)
+
+
+@app.function(gpu="l40s:1", timeout=1800)
+def pytest():
+    """Run pytest on /accelerate/tests/deepspeed with ROOT_PATH as the working directory.
+
+    Raises subprocess.CalledProcessError if pytest exits with a non-zero status.
+    """
+    import subprocess
+
+    command = ["pytest", "/accelerate/tests/deepspeed"]
+    subprocess.run(command, check=True, cwd=ROOT_PATH)
diff --git a/ci/torch_latest.py b/ci/torch_latest.py
new file mode 100644
index 000000000000..c67b4d3982e4
--- /dev/null
+++ b/ci/torch_latest.py
@@ -0,0 +1,39 @@
+# Copyright (c) Snowflake.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from pathlib import Path
+
+import modal
+
+ROOT_PATH = Path(__file__).parents[1]  # directory one level above ci/, i.e. the repository checkout
+
+# yapf: disable
+image = (modal.Image
+         .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")  # base image
+         .run_commands("apt update && apt install -y libaio-dev")
+         .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
+         .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
+         .add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)  # copied so the next step can install it
+         .run_commands("pip install /root")  # installs the checkout that was just copied to /root
+         # NOTE(review): the three dirs below are presumably placed where the package expects them - confirm
+         .add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
+         .add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
+         .add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder"))
+
+
+app = modal.App("deepspeedai-torch-latest-ci", image=image)
+
+
+@app.function(gpu="l40s:2", timeout=1800)
+def pytest():
+    """Run the ZeRO and bf16 unit-test files with pytest, using ROOT_PATH as the working directory.
+
+    Raises subprocess.CalledProcessError if pytest exits with a non-zero status.
+    """
+    import subprocess
+
+    test_files = ["tests/unit/runtime/zero/test_zero.py", "tests/unit/runtime/half_precision/test_bf16.py"]
+    command = ["pytest", "-n", "4", "--verbose", *test_files, "--torch_ver=2.6", "--cuda_ver=12.4"]
+    subprocess.run(command, check=True, cwd=ROOT_PATH)
diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py
index ad937d81ecbc..fb0e393dd5da 100644
--- a/tests/unit/runtime/zero/test_zero.py
+++ b/tests/unit/runtime/zero/test_zero.py
@@ -394,7 +394,7 @@ def test(self, allgather_bucket_size, zero_stage=2):
 
 
 class TestPartitionNcclAlignment(DistributedTest):
-    world_size = 4
+    world_size = 2
 
     def test(self, zero_stage=2):
         config_dict = {
@@ -835,7 +835,7 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
 @pytest.mark.parametrize("init_context_manager", [True, False])
 @pytest.mark.parametrize("reduce_scatter", [True, False])
 class TestZero3ParamPartitioningLargeParam(DistributedTest):
-    world_size = 4
+    world_size = 2
 
     def test(self, init_context_manager: bool, reduce_scatter: bool, param_sz: int = 8100) -> None:
 
@@ -997,7 +997,7 @@ def forward(self, x: Tensor) -> Tensor:
 
 
 class TestZero3InitForParentWeightInitialization(DistributedTest):
-    world_size = 4
+    world_size = 2
 
     def test(self):