Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
108 commits
Select commit Hold shift + click to select a range
0417dbe
Modal CI
sfc-gh-truwase May 16, 2025
8b49d0d
Modal CI
sfc-gh-truwase May 16, 2025
68a44dd
Modal CI
sfc-gh-truwase May 16, 2025
35547e9
uv for ds install
sfc-gh-truwase May 16, 2025
0c267e4
uv for ds install
sfc-gh-truwase May 16, 2025
ca48e9d
Update torch_latest_ci.py
stas00 May 16, 2025
8f63a4c
uv for ds install
sfc-gh-truwase May 16, 2025
b7e51d8
Merge branch 'tjruwase/modal_ci' of https://github.com/deepspeedai/De…
sfc-gh-truwase May 16, 2025
dba33e9
Modal testing
sfc-gh-truwase May 16, 2025
f688973
Modal testing
sfc-gh-truwase May 16, 2025
5a6d2c8
Handle folders and symlinks
sfc-gh-truwase May 17, 2025
985c5f3
Testing
sfc-gh-truwase May 17, 2025
37a7ed1
Testing
sfc-gh-truwase May 17, 2025
c0c406b
Use torch<2.7.0
sfc-gh-truwase May 17, 2025
8c76099
Full torch latest UT
sfc-gh-truwase May 17, 2025
665770d
Fewer UTs
sfc-gh-truwase May 17, 2025
60c0e82
nvme UTs
sfc-gh-truwase May 18, 2025
79a7a27
zero UTs
sfc-gh-truwase May 19, 2025
8839780
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase May 19, 2025
986008a
Avoid 30min timeout
sfc-gh-truwase May 19, 2025
4e85caf
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase May 19, 2025
6dce798
Update to latest checkout action
loadams May 19, 2025
b3f661e
Merge branch 'master' into tjruwase/modal_ci
loadams May 19, 2025
d3c7c80
Avoid 30min timeout
sfc-gh-truwase May 19, 2025
6925ed2
Merge branch 'master' into tjruwase/modal_ci
loadams May 20, 2025
9756a38
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase Jul 11, 2025
fedfc31
Fix torch version
tjruwase Jul 11, 2025
77d069a
Restructure
tjruwase Jul 12, 2025
07094ca
Make module
tjruwase Jul 12, 2025
7f27b79
Make module
tjruwase Jul 12, 2025
c48255f
Make module
tjruwase Jul 12, 2025
5975b11
Make module
tjruwase Jul 12, 2025
26bf584
Path fix
tjruwase Jul 12, 2025
061ae64
Add half precision UT
tjruwase Jul 12, 2025
ab44341
Add bf16 UT
tjruwase Jul 12, 2025
455ace7
Add accelerate UT
tjruwase Jul 12, 2025
2b94ca3
Add accelerate UT
tjruwase Jul 12, 2025
bfb10b8
Format fixes
tjruwase Jul 12, 2025
feb5a67
Install git
tjruwase Jul 12, 2025
45f6024
protobuf version
tjruwase Jul 12, 2025
fd93348
Debug accelerate UT
tjruwase Jul 12, 2025
1b42646
Debug accelerate UT
tjruwase Jul 12, 2025
18e8624
Debug accelerate UT
tjruwase Jul 13, 2025
adf268d
Debug accelerate UT
tjruwase Jul 13, 2025
e38e74f
Debug accelerate UT
tjruwase Jul 13, 2025
77295af
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase Jul 13, 2025
3dfc13b
Debug bf16 UT
tjruwase Jul 13, 2025
997d195
Reduce world_size
tjruwase Jul 14, 2025
afe54a7
Update .github/workflows/modal-accelerate.yml
sfc-gh-truwase Jul 14, 2025
cd4ea94
Update .github/workflows/modal-torch-latest.yml
sfc-gh-truwase Jul 14, 2025
fe7df64
Update deepspeed/modal_ci/accelerate.py
sfc-gh-truwase Jul 14, 2025
1cfbae1
Update deepspeed/modal_ci/torch_latest.py
sfc-gh-truwase Jul 14, 2025
76d2c5b
Update deepspeed/modal_ci/accelerate.py
sfc-gh-truwase Jul 14, 2025
a66a193
Update deepspeed/modal_ci/accelerate.py
sfc-gh-truwase Jul 14, 2025
d50b171
Update deepspeed/modal_ci/torch_latest.py
sfc-gh-truwase Jul 14, 2025
356d581
Speed with -n 4
tjruwase Jul 14, 2025
c42404e
Merge branch 'tjruwase/modal_ci' of github.com:deepspeedai/deepspeed …
tjruwase Jul 14, 2025
1afe1b7
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase Jul 14, 2025
0537279
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase Jul 18, 2025
4eeff3c
Move ci to top-level
tjruwase Jul 19, 2025
f9e74bb
Merge branch 'tjruwase/modal_ci' of github.com:deepspeedai/deepspeed …
tjruwase Jul 19, 2025
cfe6bef
Path fix
tjruwase Jul 19, 2025
9063108
UT path fixes
tjruwase Jul 19, 2025
2a563f6
Debug UT skips
tjruwase Jul 19, 2025
2313757
Fix path
tjruwase Jul 19, 2025
e5b7e6b
Confirm DS install
tjruwase Jul 21, 2025
801400f
Confirm DS install
tjruwase Jul 21, 2025
e359fe0
Move DS install from GH
sfc-gh-truwase Aug 8, 2025
684f44f
Move DS build
sfc-gh-truwase Aug 8, 2025
f30c188
Debugging
sfc-gh-truwase Aug 8, 2025
7a55264
Fix path
sfc-gh-truwase Aug 8, 2025
2042598
Debugging
sfc-gh-truwase Aug 8, 2025
3751d52
Remove copies
sfc-gh-truwase Aug 8, 2025
0d1d4e3
debug path
sfc-gh-truwase Aug 8, 2025
4c01641
debug path
sfc-gh-truwase Aug 8, 2025
c923db3
Install DS
sfc-gh-truwase Aug 8, 2025
b3a9d21
Install DS
sfc-gh-truwase Aug 8, 2025
95a31d7
Install DS
sfc-gh-truwase Aug 8, 2025
2bb78c5
Install DS
sfc-gh-truwase Aug 8, 2025
09b91ec
Install DS
sfc-gh-truwase Aug 8, 2025
6a15c62
Install DS
sfc-gh-truwase Aug 8, 2025
d8e4077
Install DS
sfc-gh-truwase Aug 8, 2025
9c51c2b
Add repo path
sfc-gh-truwase Aug 9, 2025
8ee00bf
Cleanup
sfc-gh-truwase Aug 9, 2025
2c5ba11
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase Aug 9, 2025
91a4da0
Debug skip
sfc-gh-truwase Aug 9, 2025
63fe8b8
Merge branch 'tjruwase/modal_ci' of https://github.com/deepspeedai/De…
sfc-gh-truwase Aug 9, 2025
7de52da
Fix filter
sfc-gh-truwase Aug 9, 2025
360664e
Check DS package
sfc-gh-truwase Aug 9, 2025
a36e3fa
Check DS package
sfc-gh-truwase Aug 9, 2025
7372b71
Check DS package
sfc-gh-truwase Aug 9, 2025
ba462d4
Check DS package
sfc-gh-truwase Aug 9, 2025
c12fe92
Check DS package
sfc-gh-truwase Aug 9, 2025
4b96926
Fix PYTHONPATH
sfc-gh-truwase Aug 9, 2025
6bf4d18
Fix PYTHONPATH
sfc-gh-truwase Aug 9, 2025
a4927c7
Debug PYTHONPATH
sfc-gh-truwase Aug 9, 2025
56c0aaf
Debug PYTHONPATH
sfc-gh-truwase Aug 9, 2025
134fb7d
Debug PYTHONPATH
sfc-gh-truwase Aug 9, 2025
b180173
Debug PYTHONPATH
sfc-gh-truwase Aug 9, 2025
63da02c
Debug DS build
sfc-gh-truwase Aug 9, 2025
1bee006
Fix DS install
sfc-gh-truwase Aug 9, 2025
a0b84a1
Fix DS path
sfc-gh-truwase Aug 9, 2025
c2c6e58
Cleanup
sfc-gh-truwase Aug 10, 2025
7c9462a
Merge branch 'master' into tjruwase/modal_ci
sfc-gh-truwase Aug 11, 2025
034e61a
Format
sfc-gh-truwase Aug 11, 2025
8a72a98
Merge branch 'tjruwase/modal_ci' of https://github.com/deepspeedai/De…
sfc-gh-truwase Aug 11, 2025
4b2959d
Merge branch 'master' into tjruwase/modal_ci
loadams Aug 11, 2025
936bbc8
Merge branch 'master' into tjruwase/modal_ci
loadams Aug 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions .github/workflows/modal-accelerate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
name: modal-accelerate

# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated with what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#


on:
  # Allow manual runs.
  workflow_dispatch:

  # Run on every push to master.
  push:
    branches:
      - master

  # Run on pull requests that target master.
  # NOTE(review): the header comment says path filters cannot be used because this
  # is a Required job, yet `paths-ignore` is set here - confirm this is intended.
  pull_request:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    # NOTE(review): `draft` is not among the pull_request activity types listed in
    # the GitHub Actions documentation - confirm it has the intended effect.
    types:
      - draft
      - opened
      - ready_for_review
      - synchronize
    branches:
      - master

# One run per workflow/ref at a time; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Decides whether the change set touches anything this workflow cares about.
  collect-tests:
    name: Collect tests to run
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      # Output of the `deepspeed` filter below; the deploy job compares it to 'true'.
      deepspeed: ${{ steps.filter.outputs.deepspeed }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Filter changed files
        uses: dorny/paths-filter@v2
        id: filter
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          filters: |
            deepspeed:
              - 'deepspeed/**'
              - '.github/workflows/modal*.yml'
              - 'ci/**'
              - 'tests/unit/**'
              - 'csrc/**'

  # Ships the checked-out repo to Modal and runs the tests there.
  deploy:
    name: DeepSpeedAI CI
    runs-on: ubuntu-latest
    needs: collect-tests
    # Skipped unless the filter job reported relevant changes.
    if: needs.collect-tests.outputs.deepspeed == 'true'
    env:
      # Modal tokens: created at https://modal.com/settings/deepspeedai/tokens and stored
      # in the repo secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      # Hugging Face token of the bot user (https://huggingface.co/settings/profile),
      # stored in the same repo secrets page as the Modal tokens.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies

      - name: Install build dependencies
        run: |
          pip install uv # much faster than pip
          uv pip install --system modal

      - name: Run tests
        run: |
          modal run -m ci.accelerate
99 changes: 99 additions & 0 deletions .github/workflows/modal-torch-latest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
name: modal-torch-latest

# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated with what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#


on:
  # Allow manual runs.
  workflow_dispatch:

  # Run on every push to master.
  push:
    branches:
      - master

  # Run on pull requests that target master.
  # NOTE(review): the header comment says path filters cannot be used because this
  # is a Required job, yet `paths-ignore` is set here - confirm this is intended.
  pull_request:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    # NOTE(review): `draft` is not among the pull_request activity types listed in
    # the GitHub Actions documentation - confirm it has the intended effect.
    types:
      - draft
      - opened
      - ready_for_review
      - synchronize
    branches:
      - master

# One run per workflow/ref at a time; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Decides whether the change set touches anything this workflow cares about.
  collect-tests:
    name: Collect tests to run
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      # Output of the `deepspeed` filter below; the deploy job compares it to 'true'.
      deepspeed: ${{ steps.filter.outputs.deepspeed }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Filter changed files
        uses: dorny/paths-filter@v2
        id: filter
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          filters: |
            deepspeed:
              - 'deepspeed/**'
              - '.github/workflows/modal*.yml'
              - 'ci/**'
              - 'tests/unit/**'
              - 'csrc/**'

  # Ships the checked-out repo to Modal and runs the tests there.
  deploy:
    name: DeepSpeedAI CI
    runs-on: ubuntu-latest
    needs: collect-tests
    # Skipped unless the filter job reported relevant changes.
    if: needs.collect-tests.outputs.deepspeed == 'true'
    env:
      # Modal tokens: created at https://modal.com/settings/deepspeedai/tokens and stored
      # in the repo secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      # Hugging Face token of the bot user (https://huggingface.co/settings/profile),
      # stored in the same repo secrets page as the Modal tokens.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies

      - name: Install build dependencies
        run: |
          pip install uv # much faster than pip
          uv pip install --system modal

      - name: Run tests
        run: |
          modal run -m ci.torch_latest
4 changes: 4 additions & 0 deletions ci/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) DeepSpeed Team.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
43 changes: 43 additions & 0 deletions ci/accelerate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from pathlib import Path

import modal

ROOT_PATH = Path(__file__).parents[1]

# yapf: disable
# Stage 1: start from the PyTorch 2.6.0 / CUDA 12.4 devel image, with Python 3.10 added.
_base_image = modal.Image.from_registry(
    "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
    add_python="3.10",
)

# Stage 2: system packages, then the Python dependencies. Accelerate is cloned
# from GitHub and installed with its `testing` extra; DeepSpeed's own
# requirement files are installed afterwards.
_deps_image = (
    _base_image
    .run_commands("apt update && apt install -y libaio-dev")
    .apt_install("git")
    .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
    .run_commands(
        "git clone https://github.com/huggingface/accelerate && "
        "uv pip install --system --compile-bytecode ./accelerate[testing]"
    )
    .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
    .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
)

# Stage 3: copy the repository to /root/ and pip-install it from there, then add
# the accelerator, csrc and op_builder directories under /root/deepspeed.
# The order of these calls is kept exactly as-is.
image = (
    _deps_image
    .add_local_dir(ROOT_PATH, remote_path="/root/", copy=True)
    .run_commands("pip install /root")
    .add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
    .add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
    .add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)

# Modal app whose functions run on the image built above.
app = modal.App("deepspeedai-accelerate-ci", image=image)

@app.function(gpu="l40s:1", timeout=1800)
def pytest():
    """Run Accelerate's DeepSpeed tests as a Modal function.

    Invokes ``pytest /accelerate/tests/deepspeed`` with ``ROOT_PATH`` as the
    working directory.

    Raises:
        subprocess.CalledProcessError: if pytest exits with a non-zero status
            (because of ``check=True``).
    """
    import subprocess

    command = "pytest /accelerate/tests/deepspeed".split()
    workdir = ROOT_PATH / "."
    subprocess.run(command, check=True, cwd=workdir)
39 changes: 39 additions & 0 deletions ci/torch_latest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from pathlib import Path

import modal

ROOT_PATH = Path(__file__).parents[1]

# yapf: disable
# Stage 1: start from the PyTorch 2.6.0 / CUDA 12.4 devel image, with Python 3.10 added.
_base_image = modal.Image.from_registry(
    "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
    add_python="3.10",
)

# Stage 2: libaio system package, then DeepSpeed's runtime and dev requirements.
_deps_image = (
    _base_image
    .run_commands("apt update && apt install -y libaio-dev")
    .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
    .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
)

# Stage 3: copy the repository to /root/ and pip-install it from there, then add
# the accelerator, csrc and op_builder directories under /root/deepspeed.
# The order of these calls is kept exactly as-is.
image = (
    _deps_image
    .add_local_dir(ROOT_PATH, remote_path="/root/", copy=True)
    .run_commands("pip install /root")
    .add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
    .add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
    .add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)


# Modal app whose functions run on the image built above.
app = modal.App("deepspeedai-torch-latest-ci", image=image)


@app.function(gpu="l40s:2", timeout=1800)
def pytest():
    """Run the selected DeepSpeed unit tests as a Modal function.

    Invokes pytest with ``-n 4 --verbose`` on the ZeRO and bf16 test files,
    passing ``--torch_ver=2.6 --cuda_ver=12.4``, with ``ROOT_PATH`` as the
    working directory.

    Raises:
        subprocess.CalledProcessError: if pytest exits with a non-zero status
            (because of ``check=True``).
    """
    import subprocess

    command = "pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4".split()
    workdir = ROOT_PATH / "."
    subprocess.run(command, check=True, cwd=workdir)
6 changes: 3 additions & 3 deletions tests/unit/runtime/zero/test_zero.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def test(self, allgather_bucket_size, zero_stage=2):


class TestPartitionNcclAlignment(DistributedTest):
world_size = 4
world_size = 2

def test(self, zero_stage=2):
config_dict = {
Expand Down Expand Up @@ -835,7 +835,7 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
@pytest.mark.parametrize("init_context_manager", [True, False])
@pytest.mark.parametrize("reduce_scatter", [True, False])
class TestZero3ParamPartitioningLargeParam(DistributedTest):
world_size = 4
world_size = 2

def test(self, init_context_manager: bool, reduce_scatter: bool, param_sz: int = 8100) -> None:

Expand Down Expand Up @@ -997,7 +997,7 @@ def forward(self, x: Tensor) -> Tensor:


class TestZero3InitForParentWeightInitialization(DistributedTest):
world_size = 4
world_size = 2

def test(self):

Expand Down