From b61cc74db0954a5c335b554dc66a739871a99632 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 2 Jun 2026 10:39:57 +0300
Subject: [PATCH] ci(self-hosted) : reduce server workflow jobs

Reduce the number of parallel jobs in server-self-hosted.yml by stacking
test configurations as sequential steps within a single job, following
the pattern from #23927.

- server-metal: 4 matrix jobs -> 1 job with 4 sequential test steps
- server-cuda: 2 matrix jobs -> 1 job with 2 sequential test steps
- server-kleidiai: removed unnecessary single-entry matrix
- removed unused Setup Node.js step from server-metal
- test steps are gated on the Python setup step instead of the event
  type, so they are not skipped on pull_request events and a failing
  configuration does not hide the results of the following ones
  (same reporting as the former `fail-fast: false` matrix)

Total: 7 parallel jobs -> 3 parallel jobs

Assisted-by: llama.cpp:local pi
---
 .github/workflows/server-self-hosted.yml | 128 ++++++++++++-----------
 1 file changed, 67 insertions(+), 61 deletions(-)

diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index b9baede58b3e..2dcd6d7425aa 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -42,23 +42,6 @@ jobs:
   server-metal:
     runs-on: [self-hosted, llama-server, macOS, ARM64]
 
-    name: server-metal (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name: "GPUx1, backend-sampling"
-          - build_type: Release
-            extra_args: "GGML_METAL_DEVICES=2"
-            wf_name: "GPUx2"
-          - build_type: Release
-            extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name: "GPUx2, backend-sampling"
-      fail-fast: false
-
     steps:
       - name: Clone
         id: checkout
@@ -67,44 +50,58 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
       - name: Build
         id: cmake_build
         run: |
           cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server
 
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+      - name: Python setup
+        id: setup_python
         run: |
           cd tools/server/tests
           python3 -m venv venv
           source venv/bin/activate
           pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
+
+      - name: Tests (GPUx1)
+        id: server_integration_tests
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx1, backend-sampling)
+        id: server_integration_tests_backend_sampling
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export LLAMA_ARG_BACKEND_SAMPLING=1
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx2)
+        id: server_integration_tests_gpu2
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export GGML_METAL_DEVICES=2
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx2, backend-sampling)
+        id: server_integration_tests_gpu2_backend_sampling
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
           pytest -v -x -m "not slow"
 
   server-cuda:
     runs-on: [self-hosted, llama-server, Linux, NVIDIA]
 
-    name: server-cuda (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name: "GPUx1, backend-sampling"
-      fail-fast: false
-
     steps:
       - name: Clone
         id: checkout
@@ -117,32 +114,36 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake --build build --config Release -j $(nproc) --target llama-server
 
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+      - name: Python setup
+        id: setup_python
         run: |
           cd tools/server/tests
           python3 -m venv venv
           source venv/bin/activate
           pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
+
+      - name: Tests (GPUx1)
+        id: server_integration_tests
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          pytest -v -x -m "not slow"
+
+      - name: Tests (GPUx1, backend-sampling)
+        id: server_integration_tests_backend_sampling
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
+          export LLAMA_ARG_BACKEND_SAMPLING=1
           pytest -v -x -m "not slow"
 
   server-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x
 
-    name: server-kleidiai (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        include:
-          - build_type: Release
-            extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
-            extra_args: ""
-            wf_name: "CPUx1, kleidiai"
-      fail-fast: false
-
     steps:
       - name: Clone
         id: checkout
@@ -181,16 +182,21 @@ jobs:
       - name: Build
         id: cmake_build
         run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
+          cmake --build build --config Release -j $(nproc) --target llama-server
 
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+      - name: Python setup
+        id: setup_python
         run: |
           cd tools/server/tests
           python3 -m venv venv
           source venv/bin/activate
           pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ !cancelled() && steps.setup_python.outcome == 'success' }}
+        run: |
+          cd tools/server/tests
+          source venv/bin/activate
           pytest -v -x -m "not slow"