diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml
new file mode 100644
index 000000000000..078cd61801a4
--- /dev/null
+++ b/.github/workflows/xpu-max1100.yml
@@ -0,0 +1,78 @@
+# Runs a subset of the DeepSpeed unit tests inside an Intel XPU container on a self-hosted runner.
+name: xpu-max1100
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Every day at 00:00 (GitHub evaluates cron schedules in UTC).
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      # This workflow's own file, so that edits to it trigger a run.
+      - ".github/workflows/xpu-max1100.yml"
+      - "accelerator/xpu_accelerator.py"
+      - "accelerator/abstract_accelerator.py"
+      - "accelerator/cpu_accelerator.py"
+      - "accelerator/real_accelerator.py"
+      - "deepspeed/runtime/engine.py"
+      - "deepspeed/runtime/bf16_optimizer.py"
+      - "deepspeed/runtime/zero/stage_1_and_2.py"
+      - "deepspeed/runtime/zero/stage3.py"
+      - "deepspeed/runtime/zero/partition_parameters.py"
+      - "deepspeed/runtime/zero/partitioned_param_coordinator.py"
+      - "deepspeed/runtime/zero/parameter_offload.py"
+      - "deepspeed/runtime/pipe/engine.py"
+      - "deepspeed/runtime/utils.py"
+      - "deepspeed/inference/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, intel, xpu]
+    container:
+      image: intel/intel-extension-for-pytorch:2.1.20-xpu
+      ports:
+        - 80
+      options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check container state
+        shell: bash
+        run: |
+          ldd --version
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
+
+      - name: Install deepspeed
+        run: |
+          pip install py-cpuinfo
+          pip install .[dev,autotuning]
+          ds_report
+          python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
+
+      - name: Python environment
+        run: |
+          pip list
+
+      - name: Unit tests
+        run: |
+          pip install pytest pytest-timeout tabulate
+          cd tests/unit
+          pytest --verbose accelerator/*
+          pytest --verbose autotuning/*
+          pytest --verbose checkpoint/test_reshape_checkpoint.py
+          pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
+          pytest --verbose runtime/test_ds_config_model.py
+          pytest --verbose runtime/pipe/test_pipe_schedule.py
+          pytest --verbose runtime/zero/test_zero_config.py
+          pytest --verbose runtime/zero/test_zero_tiled.py
+          pytest --verbose runtime/zero/test_zeropp.py
diff --git a/tests/unit/util.py b/tests/unit/util.py
index e8e0f476371b..5acf580ed3c8 100644
--- a/tests/unit/util.py
+++ b/tests/unit/util.py
@@ -48,6 +48,7 @@ def bf16_required_version_check(accelerator_check=True):
     nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)
     npu_available = get_accelerator().device_name() == 'npu'
     hpu_available = get_accelerator().device_name() == 'hpu'
+    xpu_available = get_accelerator().device_name() == 'xpu'
 
     if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass:
         return True
@@ -55,6 +56,8 @@ def bf16_required_version_check(accelerator_check=True):
         return True
     elif hpu_available:
         return True
+    elif xpu_available:
+        return True
     else:
         return False
 