Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 67 additions & 61 deletions .github/workflows/server-self-hosted.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,6 @@ jobs:
server-metal:
runs-on: [self-hosted, llama-server, macOS, ARM64]

name: server-metal (${{ matrix.wf_name }})
strategy:
matrix:
build_type: [Release]
wf_name: ["GPUx1"]
include:
- build_type: Release
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx1, backend-sampling"
- build_type: Release
extra_args: "GGML_METAL_DEVICES=2"
wf_name: "GPUx2"
- build_type: Release
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx2, backend-sampling"
fail-fast: false

steps:
- name: Clone
id: checkout
Expand All @@ -67,44 +50,58 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"

- name: Build
id: cmake_build
run: |
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server

- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
- name: Python setup
id: setup_python
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}

- name: Tests (GPUx1)
id: server_integration_tests
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
pytest -v -x -m "not slow"

- name: Tests (GPUx1, backend-sampling)
id: server_integration_tests_backend_sampling
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
export LLAMA_ARG_BACKEND_SAMPLING=1
pytest -v -x -m "not slow"

- name: Tests (GPUx2)
id: server_integration_tests_gpu2
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
export GGML_METAL_DEVICES=2
pytest -v -x -m "not slow"

- name: Tests (GPUx2, backend-sampling)
id: server_integration_tests_gpu2_backend_sampling
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
pytest -v -x -m "not slow"

server-cuda:
runs-on: [self-hosted, llama-server, Linux, NVIDIA]

name: server-cuda (${{ matrix.wf_name }})
strategy:
matrix:
build_type: [Release]
wf_name: ["GPUx1"]
include:
- build_type: Release
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx1, backend-sampling"
fail-fast: false

steps:
- name: Clone
id: checkout
Expand All @@ -117,32 +114,36 @@ jobs:
id: cmake_build
run: |
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake --build build --config Release -j $(nproc) --target llama-server

- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
- name: Python setup
id: setup_python
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}

- name: Tests (GPUx1)
id: server_integration_tests
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
pytest -v -x -m "not slow"

- name: Tests (GPUx1, backend-sampling)
id: server_integration_tests_backend_sampling
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
export LLAMA_ARG_BACKEND_SAMPLING=1
pytest -v -x -m "not slow"

server-kleidiai:
runs-on: ah-ubuntu_22_04-c8g_8x

name: server-kleidiai (${{ matrix.wf_name }})
strategy:
matrix:
include:
- build_type: Release
extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
extra_args: ""
wf_name: "CPUx1, kleidiai"
fail-fast: false

steps:
- name: Clone
id: checkout
Expand Down Expand Up @@ -181,16 +182,21 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
cmake --build build --config Release -j $(nproc) --target llama-server

- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
- name: Python setup
id: setup_python
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}

- name: Tests
id: server_integration_tests
if: ${{ !github.event.pull_request }}
run: |
cd tools/server/tests
source venv/bin/activate
pytest -v -x -m "not slow"