diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -42,23 +42,6 @@ jobs:
   server-metal:
     runs-on: [self-hosted, llama-server, macOS, ARM64]
 
-    name: server-metal (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name: "GPUx1, backend-sampling"
-          - build_type: Release
-            extra_args: "GGML_METAL_DEVICES=2"
-            wf_name: "GPUx2"
-          - build_type: Release
-            extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name: "GPUx2, backend-sampling"
-      fail-fast: false
-
     steps:
       - name: Clone
         id: checkout
@@ -67,44 +50,60 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
       - name: Build
         id: cmake_build
         run: |
           cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server
 
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+      - name: Python setup
+        id: setup_python
         run: |
           cd tools/server/tests
           python3 -m venv venv
           source venv/bin/activate
           pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
+
+      # Run every configuration whenever setup succeeded, even if an earlier one failed
+      # (the replaced matrix used fail-fast: false and its PR condition was always true).
+      - name: Tests (GPUx1)
+        id: server_integration_tests
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx1, backend-sampling)
+        id: server_integration_tests_backend_sampling
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export LLAMA_ARG_BACKEND_SAMPLING=1
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx2)
+        id: server_integration_tests_gpu2
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export GGML_METAL_DEVICES=2
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx2, backend-sampling)
+        id: server_integration_tests_gpu2_backend_sampling
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
           pytest -v -x -m "not slow"
 
   server-cuda:
     runs-on: [self-hosted, llama-server, Linux, NVIDIA]
 
-    name: server-cuda (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name: "GPUx1, backend-sampling"
-      fail-fast: false
-
     steps:
       - name: Clone
         id: checkout
@@ -117,32 +116,38 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake --build build --config Release -j $(nproc) --target llama-server
 
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+      - name: Python setup
+        id: setup_python
         run: |
           cd tools/server/tests
           python3 -m venv venv
           source venv/bin/activate
           pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
+
+      # Run every configuration whenever setup succeeded, even if an earlier one failed
+      # (the replaced matrix used fail-fast: false and its PR condition was always true).
+      - name: Tests (GPUx1)
+        id: server_integration_tests
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx1, backend-sampling)
+        id: server_integration_tests_backend_sampling
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export LLAMA_ARG_BACKEND_SAMPLING=1
           pytest -v -x -m "not slow"
 
   server-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x
 
-    name: server-kleidiai (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        include:
-          - build_type: Release
-            extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
-            extra_args: ""
-            wf_name: "CPUx1, kleidiai"
-      fail-fast: false
-
     steps:
       - name: Clone
         id: checkout
@@ -181,16 +186,20 @@ jobs:
       - name: Build
         id: cmake_build
         run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
+          cmake --build build --config Release -j $(nproc) --target llama-server
 
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+      - name: Python setup
+        id: setup_python
         run: |
           cd tools/server/tests
           python3 -m venv venv
           source venv/bin/activate
           pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
+
+      - name: Tests
+        id: server_integration_tests
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
           pytest -v -x -m "not slow"