Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
62dbea1
working unified delta net
ymcki Jan 27, 2026
240bd4b
working unified delta net
ymcki Jan 27, 2026
4dea644
Merge branch 'delta_net' of github.com:ymcki/llama.cpp into delta_net
ymcki Jan 27, 2026
15818ac
ci: add test-backend-ops test for CPU (#19268)
am17an Feb 2, 2026
a3fa035
server: print actual model name in "model not found" error (#19117)
teto Feb 2, 2026
9f682fb
ggml-cpu: FA split across kv for faster TG (#19209)
am17an Feb 2, 2026
07a7412
mtmd: add min/max pixels gguf metadata (#19273)
ngxson Feb 2, 2026
0dfcd3b
jinja : add missing 'in' test to template engine (#19004) (#19239)
sidmohan0 Feb 2, 2026
91ea44e
opencl: refactor some ops, concat, repeat, tanh and scale (#19226)
lhez Feb 2, 2026
aeb827a
spec : simplify time measurement using common_time_meas (#19262)
ggerganov Feb 3, 2026
1efb5f7
vocab: add Falcon-H1-Tiny-Coder FIM tokens (#19249)
vhsw Feb 3, 2026
41e3f02
cuda : revert CUDA_SCALE_LAUNCH_QUEUES override until investigated (#…
gaugarg-nv Feb 3, 2026
e9a859d
ggml: added cleanups in ggml_quantize_free (#19278)
noctrex Feb 3, 2026
1f1e57f
CUDA: Fix loop unrolling for BW in mul_mat_q_stream_k_fixup (#19053)
ORippler Feb 3, 2026
c55bce4
metal : minor cleanup (#19251)
ggerganov Feb 3, 2026
a6fd8ca
models : remove unnecessary cont in openelm (#19289)
CISC Feb 3, 2026
8bece2e
CUDA: use mmvq for mul-mat-id for small batch sizes (#18958)
am17an Feb 3, 2026
32b17ab
vulkan: disable coopmat1 fa on Nvidia Turing (#19290)
0cc4m Feb 3, 2026
faa1bc2
sampling : delegate input allocation to the scheduler (#19266)
ggerganov Feb 3, 2026
6a9bf2f
ci : add sanitizer runs for server (#19291)
ggerganov Feb 3, 2026
44008ce
metal : add solve_tri (#19302)
ggerganov Feb 3, 2026
2ceda3f
ggml-cpu: use LUT for converting e8->f32 scales on x86 (#19288)
am17an Feb 4, 2026
015deb9
ggml-virtgpu: make the code thread safe (#19204)
kpouget Feb 4, 2026
25f40ca
completion : simplify batch (embd) processing (#19286)
danbev Feb 4, 2026
d838c22
spec : fix the check-rate logic of ngram-simple (#19261)
ggerganov Feb 4, 2026
6ab881b
model-conversion : add tensor-info.py utility (#18954)
danbev Feb 4, 2026
eaba92c
tests : add non-cont, inplace rope tests (#19296)
ggerganov Feb 4, 2026
8abcc70
model: (qwen3next) correct vectorized key_gdiff calculation (#19324)
ngxson Feb 4, 2026
423bee4
ci : fix sanitize workflow to enable ggml sanitizers too (#19323)
ggerganov Feb 4, 2026
e0c93af
debug: make common_debug_print_tensor readable (#19331)
ngxson Feb 4, 2026
b536eb0
codeowners : add danbev for examples/debug (#19332)
danbev Feb 4, 2026
e6e934c
vendor: update cpp-httplib version (#19313)
taronaeo Feb 4, 2026
11fb327
vendor : add missing llama_add_compile_flags (#19322)
CISC Feb 5, 2026
af252d0
metal : add missing includes (#19348)
will-lms Feb 5, 2026
c342c3b
vulkan: fix non-contig rope (#19299)
jeffbolznv Feb 5, 2026
3409ab8
vulkan: Set k_load_shmem to false when K is too large (#19301)
jeffbolznv Feb 5, 2026
a498c75
vulkan: fix GPU deduplication logic. (#19222)
okuvshynov Feb 5, 2026
7a4f97d
metal : add diag (#19330)
ggerganov Feb 5, 2026
a4ea7a1
vendor : update BoringSSL to 0.20260204.0 (#19333)
angt Feb 5, 2026
b828e18
docker : fix vulkan build (#19352)
CISC Feb 5, 2026
3795cc1
benches : update models + numbers (#19359)
ggerganov Feb 5, 2026
449ec2a
vulkan: Preprocess FA mask to detect all-neg-inf and all-zero. (#19281)
jeffbolznv Feb 5, 2026
22cae83
metal : adaptive CPU/GPU interleave based on number of nodes (#19369)
ggerganov Feb 5, 2026
3e21647
cuda : cuda graphs now compare all node params (#19383)
ggerganov Feb 6, 2026
e696cfc
llama : rename llama-sampling to llama-sampler (#19363)
danbev Feb 6, 2026
7fcf1ef
metal : skip loading all-zero mask (#19337)
ggerganov Feb 6, 2026
f9bd518
vulkan: make FA mask/softcap enables spec constants (#19309)
jeffbolznv Feb 6, 2026
1946e46
vulkan: For coopmat2 FA, use fp16 accumulators for the final result (…
jeffbolznv Feb 6, 2026
3688c4f
Kimi-Linear support (backend agnostic + MLA KV cache) (#18755)
ymcki Feb 6, 2026
86e31e4
replace parent class of kimi-linear and qwen3next with llm_graph_cont…
ymcki Feb 6, 2026
cff950b
sync to b7841
ymcki Feb 6, 2026
15cbcf1
Merge branch 'master' into delta_net
pwilkin Feb 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .devops/vulkan.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ RUN apt-get update \
build-essential \
git \
python3 \
python3-dev \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)

Expand All @@ -303,6 +304,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
Expand Down Expand Up @@ -466,7 +468,7 @@ jobs:
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4200
ctest -L main --verbose --timeout 4800

ubuntu-24-cmake-webgpu:
runs-on: ubuntu-24.04
Expand Down Expand Up @@ -1532,7 +1534,7 @@ jobs:
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

ggml-ci-arm64-cpu-high-perf:
runs-on: ubuntu-22.04-arm
Expand All @@ -1558,7 +1560,7 @@ jobs:
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

ggml-ci-arm64-cpu-high-perf-sve:
runs-on: ubuntu-22.04-arm
Expand Down
16 changes: 12 additions & 4 deletions .github/workflows/server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:

strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
build_type: [RelWithDebInfo]
include:
- build_type: Release
Expand All @@ -45,7 +45,7 @@ jobs:
- build_type: Release
sanitizer: ""
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
fail-fast: false

steps:
- name: Dependencies
Expand All @@ -72,7 +72,15 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
cmake -B build \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_SCHED_NO_REALLOC=ON \
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

- name: Python setup
Expand All @@ -88,7 +96,7 @@ jobs:

- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) && matrix.build_type == 'Release' }}
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
Expand Down
23 changes: 0 additions & 23 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -164,29 +164,6 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)

if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")

add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()

if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")

add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()

if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")

add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()

include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")

Expand Down
1 change: 1 addition & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
/examples/debug/ @danbev @pwilkin
/examples/deprecation-warning/ @ggerganov
/examples/diffusion/ @am17an
/examples/embedding/ @ggerganov
Expand Down
Loading