diff --git a/.github/workflows/modal-accelerate.yml b/.github/workflows/modal-accelerate.yml new file mode 100644 index 000000000000..9d59e22658f8 --- /dev/null +++ b/.github/workflows/modal-accelerate.yml @@ -0,0 +1,99 @@ +name: modal-accelerate + +# This CI is running on modal.com's GPUs. +# +# It's set up here on github actions and then the cloned repo is sent to modal and everything +# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are +# run. +# +# Both files are annotated to explain what's important and how one might change or update things if needed. +# +# Note that since this is a Required job we can't use `on.push.paths` file filter - we are using +# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the +# Required status for PRs to pass. +# + + +on: + workflow_dispatch: + push: + branches: + - master + + pull_request: + paths-ignore: + - 'docs/**' + - 'blogs/**' + - 'deepspeed/inference/v2/**' + - 'tests/unit/inference/v2/**' + types: [draft, opened, ready_for_review, synchronize] + branches: + - master + +concurrency: + group: ${{ github.workflow }}-${{ github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + collect-tests: + name: Collect tests to run + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + deepspeed: ${{ steps.filter.outputs.deepspeed }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + lfs: true + + - name: Filter changed files + uses: dorny/paths-filter@v2 + id: filter + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: | + deepspeed: + - 'deepspeed/**' + - '.github/workflows/modal*.yml' + - 'ci/**' + - 'tests/unit/**' + - 'csrc/**' + + deploy: + name: DeepSpeedAI CI + runs-on: ubuntu-latest + needs: collect-tests + env: + # these are created at https://modal.com/settings/deepspeedai/tokens + # they are then added to the repo's secrets at 
https://github.com/deepspeedai/deepspeed/settings/secrets/actions + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + # this one comes from https://huggingface.co/settings/profile of the bot user + # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions + HF_TOKEN: ${{ secrets.HF_TOKEN }} + + if: needs.collect-tests.outputs.deepspeed == 'true' + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + with: + lfs: true + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: 'pip' # caching pip dependencies + + - name: Install build dependencies + run: | + pip install uv # much faster than pip + uv pip install --system modal + + - name: Run tests + run: | + modal run -m ci.accelerate diff --git a/.github/workflows/modal-torch-latest.yml b/.github/workflows/modal-torch-latest.yml new file mode 100644 index 000000000000..5305f10aea54 --- /dev/null +++ b/.github/workflows/modal-torch-latest.yml @@ -0,0 +1,99 @@ +name: modal-torch-latest + +# This CI is running on modal.com's GPUs. +# +# It's set up here on github actions and then the cloned repo is sent to modal and everything +# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are +# run. +# +# Both files are annotated to explain what's important and how one might change or update things if needed. +# +# Note that since this is a Required job we can't use `on.push.paths` file filter - we are using +# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the +# Required status for PRs to pass. 
+# + + +on: + workflow_dispatch: + push: + branches: + - master + + pull_request: + paths-ignore: + - 'docs/**' + - 'blogs/**' + - 'deepspeed/inference/v2/**' + - 'tests/unit/inference/v2/**' + types: [draft, opened, ready_for_review, synchronize] + branches: + - master + +concurrency: + group: ${{ github.workflow }}-${{ github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + collect-tests: + name: Collect tests to run + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + deepspeed: ${{ steps.filter.outputs.deepspeed }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + lfs: true + + - name: Filter changed files + uses: dorny/paths-filter@v2 + id: filter + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: | + deepspeed: + - 'deepspeed/**' + - '.github/workflows/modal*.yml' + - 'ci/**' + - 'tests/unit/**' + - 'csrc/**' + + deploy: + name: DeepSpeedAI CI + runs-on: ubuntu-latest + needs: collect-tests + env: + # these are created at https://modal.com/settings/deepspeedai/tokens + # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + # this one comes from https://huggingface.co/settings/profile of the bot user + # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions + HF_TOKEN: ${{ secrets.HF_TOKEN }} + + if: needs.collect-tests.outputs.deepspeed == 'true' + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + with: + lfs: true + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: 'pip' # caching pip dependencies + + - name: Install build dependencies + run: | + pip install uv # much faster than pip + uv pip install --system modal + + - name: Run tests + run: | + modal run -m ci.torch_latest diff --git a/ci/__init__.py 
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""Modal app that runs Hugging Face accelerate's DeepSpeed tests on a GPU.

Launched from the ``modal-accelerate`` GitHub workflow with
``modal run -m ci.accelerate``.
"""

from pathlib import Path

import modal

# Repository root; this module sits one level below it, in ``ci/``.
ROOT_PATH = Path(__file__).parents[1]

# yapf: disable
image = (modal.Image
         .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
         .run_commands("apt update && apt install -y libaio-dev")
         .apt_install("git")
         # NOTE(review): the next two steps assume `uv` is already available in the image - confirm.
         .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
         .run_commands(
             # Adjacent literals are joined by the compiler, so the shell command no
             # longer carries the source indentation that a backslash continuation
             # inside a single string literal would embed.
             "git clone https://github.com/huggingface/accelerate && "
             "uv pip install --system --compile-bytecode ./accelerate[testing]"
         )
         .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
         .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
         # copy=True puts the checkout into the image itself; presumably required so the
         # `pip install /root` build step below can see it - verify against Modal's docs.
         .add_local_dir(ROOT_PATH, remote_path="/root/", copy=True)
         .run_commands("pip install /root")
         .add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
         .add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
         .add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
         )

app = modal.App("deepspeedai-accelerate-ci", image=image)


@app.function(
    gpu="l40s:1",
    timeout=1800,
)
def pytest():
    """Run accelerate's DeepSpeed test directory with pytest.

    The tests are read from ``/accelerate/tests/deepspeed``, i.e. the clone
    made while building ``image`` (assumes the clone lands in ``/`` - TODO confirm).

    Raises:
        subprocess.CalledProcessError: if pytest exits with a non-zero status
            (``check=True``).
    """
    import subprocess

    # Explicit argv list instead of splitting a command string on whitespace.
    command = ["pytest", "/accelerate/tests/deepspeed"]
    # `ROOT_PATH / "."` names the same directory as ROOT_PATH, so pass it directly.
    subprocess.run(command, check=True, cwd=ROOT_PATH)
# Copyright (c) Snowflake.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""Modal app that runs a subset of DeepSpeed's unit tests on GPUs.

Launched from the ``modal-torch-latest`` GitHub workflow with
``modal run -m ci.torch_latest``.
"""

from pathlib import Path

import modal

# Repository root; this module sits one level below it, in ``ci/``.
ROOT_PATH = Path(__file__).parents[1]

# yapf: disable
image = (
    modal.Image
    .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
    .run_commands("apt update && apt install -y libaio-dev")
    .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
    .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
    # The checkout is copied into the image and installed from there.
    .add_local_dir(ROOT_PATH, remote_path="/root/", copy=True)
    .run_commands("pip install /root")
    # These three source trees are additionally placed under the deepspeed package path.
    .add_local_dir(ROOT_PATH / "accelerator", remote_path="/root/deepspeed/accelerator")
    .add_local_dir(ROOT_PATH / "csrc", remote_path="/root/deepspeed/ops/csrc")
    .add_local_dir(ROOT_PATH / "op_builder", remote_path="/root/deepspeed/ops/op_builder")
)


app = modal.App("deepspeedai-torch-latest-ci", image=image)


@app.function(gpu="l40s:2", timeout=1800)
def pytest():
    """Run the ZeRO and bf16 unit tests with pytest from the repository root.

    Raises:
        subprocess.CalledProcessError: if pytest exits with a non-zero status
            (``check=True``).
    """
    import subprocess

    # Kept as one string and split on whitespace, exactly as pytest is invoked on a shell line.
    command_line = "pytest -n 4 --verbose tests/unit/runtime/zero/test_zero.py tests/unit/runtime/half_precision/test_bf16.py --torch_ver=2.6 --cuda_ver=12.4"
    argv = command_line.split()

    # The test paths above are relative, so the working directory must be the repo root.
    subprocess.run(argv, check=True, cwd=ROOT_PATH)
reduce_scatter: bool, param_sz: int = 8100) -> None: @@ -997,7 +997,7 @@ def forward(self, x: Tensor) -> Tensor: class TestZero3InitForParentWeightInitialization(DistributedTest): - world_size = 4 + world_size = 2 def test(self):