From 873738897805f8bff31fd8145bf67ef9e9379095 Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Wed, 10 Apr 2024 01:17:25 +0800 Subject: [PATCH] Add CI for Intel XPU/Max1100 (#5376) Add basic workflow for tests on intel xpu. Currently we have part of tests enabled. We will add more tests in later PRs. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .github/workflows/xpu-max1100.yml | 75 +++++++++++++++++++++++++++++++ tests/unit/util.py | 3 ++ 2 files changed, 78 insertions(+) create mode 100644 .github/workflows/xpu-max1100.yml diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml new file mode 100644 index 000000000000..078cd61801a4 --- /dev/null +++ b/.github/workflows/xpu-max1100.yml @@ -0,0 +1,75 @@ +name: xpu-max1100 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-max1100.yml" + - "accelerator/xpu_accelerator.py" + - "accelerator/abstract_accelerator.py" + - "accelerator/cpu_accelerator.py" + - "accelerator/real_accelerator.py" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + + +jobs: + unit-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/intel-extension-for-pytorch:2.1.20-xpu + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v3 + - name: Check container 
state + shell: bash + run: | + ldd --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + + - name: Install deepspeed + run: | + pip install py-cpuinfo + pip install .[dev,autotuning] + ds_report + python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + pip install pytest pytest-timeout tabulate + cd tests/unit + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py diff --git a/tests/unit/util.py b/tests/unit/util.py index e8e0f476371b..5acf580ed3c8 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -48,6 +48,7 @@ def bf16_required_version_check(accelerator_check=True): nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' hpu_available = get_accelerator().device_name() == 'hpu' + xpu_available = get_accelerator().device_name() == 'xpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True @@ -55,6 +56,8 @@ def bf16_required_version_check(accelerator_check=True): return True elif hpu_available: return True + elif xpu_available: + return True else: return False