ggml-org · ggerganov · Jun 1, 2026 · May 30, 2026 · May 30, 2026 · May 31, 2026
@@ -109,40 +109,6 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
-  macos-latest-ios:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # TODO: this likely does not do anything - if yes, remove it
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: apple-ios
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_APP=OFF \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
   macos-latest-ios-xcode:
     runs-on: macos-latest
 

@@ -35,24 +35,12 @@ env:
 
 jobs:
   ubuntu-24-openvino:
-    name: ubuntu-24-openvino-${{ matrix.openvino_device }}
+    runs-on: [self-hosted, Linux, Intel, OpenVINO]
 
     concurrency:
-      group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
+      group: openvino-gpu-${{ github.head_ref || github.ref }}
       cancel-in-progress: false
 
-    strategy:
-      matrix:
-        include:
-          - variant: cpu
-            runner: '"ubuntu-24.04"'
-            openvino_device: "CPU"
-          - variant: gpu
-            runner: '["self-hosted","Linux","Intel","OpenVINO"]'
-            openvino_device: "GPU"
-
-    runs-on: ${{ fromJSON(matrix.runner) }}
-
     env:
       # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
       OPENVINO_VERSION_MAJOR: "2026.0"
@@ -63,31 +51,14 @@ jobs:
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        if: runner.environment == 'github-hosted'
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: openvino-ubuntu-24.04-${{ matrix.variant }}-no-preset-v1
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
           sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
           sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
 
-      - name: Use OpenVINO Toolkit Cache
-        if: runner.environment == 'github-hosted'
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
       - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
         uses: ./.github/actions/linux-setup-openvino
         with:
           path: ./openvino_toolkit
@@ -109,12 +80,17 @@ jobs:
             -DGGML_OPENVINO=ON
           time cmake --build build/ReleaseOV --config Release -j $(nproc)
 
-      - name: Test
-        id: cmake_test
+      - name: Test (CPU)
+        id: cmake_test_cpu
+        # TODO: fix and re-enable the `test-llama-archs` test below
+        run: |
+          cd ${{ github.workspace }}
+          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
+
+      - name: Test (GPU)
+        id: cmake_test_gpu
         # TODO: fix and re-enable the `test-llama-archs` test below
         run: |
           cd ${{ github.workspace }}
-          if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
-            export GGML_OPENVINO_DEVICE=GPU
-          fi
+          export GGML_OPENVINO_DEVICE=GPU
           ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
@@ -34,8 +34,8 @@ env:
   LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
-  ubuntu-latest-rpc:
-    runs-on: ubuntu-latest
+  ubuntu-24-rpc:
+    runs-on: ubuntu-24.04-arm  # x64 alternative: ubuntu-24.04
 
     continue-on-error: true
 

@@ -210,7 +210,7 @@ jobs:
           GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
             bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  gpu-vulkan:
+  gpu-vulkan-apple:
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -261,7 +261,7 @@ jobs:
           # a valid python environment for testing
           LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
 
-  cpu-openvino-low-perf:
+  gpu-openvino-low-perf:
     runs-on: [self-hosted, Linux, Intel, OpenVINO]
 
     concurrency:
@@ -297,8 +297,8 @@ jobs:
           source ./openvino_toolkit/setupvars.sh
           GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  cpu-any-low-perf:
-    runs-on: [self-hosted, CPU]
+  cpu-x64-high-perf:
+    runs-on: [self-hosted, X64]
 
     steps:
       - name: Clone
@@ -308,22 +308,9 @@ jobs:
       - name: Test
         id: ggml-ci
         run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
-  cpu-any-high-perf:
-    runs-on: [self-hosted, CPU]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  cpu-arm64-graviton4:
+  cpu-arm64-high-perf-graviton4:
     runs-on: ah-ubuntu_22_04-c8g_8x
 
     steps:
@@ -360,7 +347,7 @@ jobs:
       - name: Test
         id: ggml-ci
         run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   cpu-arm64-graviton4-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x

@@ -36,16 +36,8 @@ env:
   LLAMA_ARG_LOG_TIMESTAMPS: 1
 
 jobs:
-  ubuntu:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-24.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
-
-    runs-on: ${{ matrix.os }}
+  ubuntu-arm64:
+    runs-on: ubuntu-24.04-arm
 
     steps:
       - name: Clone
@@ -63,7 +55,7 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: vulkan-${{ matrix.os }}-new
+          key: vulkan-ubuntu-24.04-arm-new
           variant: ccache
           evict-old-files: 1d
           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -130,15 +130,7 @@ jobs:
           ctest -L main -E test-backend-ops --verbose --timeout 900
 
   ubuntu-wasm:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-24.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
-
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-24.04-arm
 
     steps:
       - name: Clone
@@ -148,7 +140,7 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: webgpu-${{ matrix.os }}-wasm
+          key: webgpu-ubuntu-24.04-arm-wasm
           evict-old-files: 1d
           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 

@@ -55,21 +55,7 @@ concurrency:
 
 jobs:
   ubuntu:
-    runs-on: ubuntu-24.04
-
-    name: ubuntu (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["default"]
-        include:
-          - build_type: Release
-            extra_args: ""
-            wf_name:    "default"
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name:    "backend-sampling"
-      fail-fast: false
+    runs-on: ubuntu-24.04-arm
 
     steps:
       - name: Dependencies
@@ -96,7 +82,7 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: server-ubuntu-24.04-x64
+          key: server-ubuntu-24.04-arm
           evict-old-files: 1d
           save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 
@@ -105,7 +91,7 @@ jobs:
         run: |
           cmake -B build \
             -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake --build build --config Release -j $(nproc) --target llama-server
 
       - name: Python setup
         id: setup_python
@@ -116,18 +102,32 @@ jobs:
 
       - name: Tests
         id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        if: ${{ success() }}  # default check: keep running these tests on pull requests, as before
         run: |
           cd tools/server/tests
-          export ${{ matrix.extra_args }}
           pytest -v -x -m "not slow"
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
         run: |
           cd tools/server/tests
-          export ${{ matrix.extra_args }}
+          SLOW_TESTS=1 pytest -v -x
+
+      - name: Tests (Backend sampling)
+        id: server_integration_tests_backend_sampling
+        if: ${{ success() }}  # default check: the backend-sampling variant also ran on pull requests
+        run: |
+          cd tools/server/tests
+          export LLAMA_ARG_BACKEND_SAMPLING=1
+          pytest -v -x -m "not slow"
+
+      - name: Slow tests (Backend sampling)
+        id: server_integration_tests_slow_backend_sampling
+        if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
+        run: |
+          cd tools/server/tests
+          export LLAMA_ARG_BACKEND_SAMPLING=1
           SLOW_TESTS=1 pytest -v -x
 
   windows:
@@ -169,15 +169,15 @@ jobs:
 
       - name: Tests
         id: server_integration_tests
-        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
+        if: ${{ success() }}  # default check; NOTE(review): confirm no windows matrix entry set disabled_on_pr
         run: |
           cd tools/server/tests
           $env:PYTHONIOENCODING = ":replace"
           pytest -v -x -m "not slow"
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
         run: |
           cd tools/server/tests
           $env:SLOW_TESTS = "1"