From 1ad258ca31869f2af6479bffef4b8097da8d341d Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Jun 2025 14:16:31 +0200 Subject: [PATCH 0001/1348] stream : add nullptr check of whisper_context (#3283) * stream : add nullptr check of whisper_context This commit adds a check to ensure that the `whisper_context` is not null after initialization. The motivation for this is that currently, if the initialization fails, the program continues to run leading to a segmentation fault. This sort of check is performed by others examples like whisper-cli. Refs: https://github.com/ggml-org/whisper.cpp/issues/3280#issuecomment-3003778035 * examples : add nullptr check for whisper_context --- examples/bench/bench.cpp | 4 ++++ examples/command/command.cpp | 4 ++++ examples/stream/stream.cpp | 4 ++++ examples/vad-speech-segments/speech.cpp | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp index 979dca8ee47..1512f316e54 100644 --- a/examples/bench/bench.cpp +++ b/examples/bench/bench.cpp @@ -67,6 +67,10 @@ static int whisper_bench_full(const whisper_params & params) { cparams.flash_attn = params.flash_attn; struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); + if (ctx == nullptr) { + fprintf(stderr, "error: failed to initialize whisper context\n"); + return 2; + } { fprintf(stderr, "\n"); diff --git a/examples/command/command.cpp b/examples/command/command.cpp index 7a09d30a319..0f87710cefa 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -709,6 +709,10 @@ int main(int argc, char ** argv) { cparams.flash_attn = params.flash_attn; struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); + if (ctx == nullptr) { + fprintf(stderr, "error: failed to initialize whisper context\n"); + return 2; + } // print some info about the processing { diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index bc6f13fb267..37b23886821 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -163,6 +163,10 @@ int main(int argc, char ** argv) { cparams.flash_attn = params.flash_attn; struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); + if (ctx == nullptr) { + fprintf(stderr, "error: failed to initialize whisper context\n"); + return 2; + } std::vector pcmf32 (n_samples_30s, 0.0f); std::vector pcmf32_old; diff --git a/examples/vad-speech-segments/speech.cpp b/examples/vad-speech-segments/speech.cpp index 26241d950a0..a22425c4b8c 100644 --- a/examples/vad-speech-segments/speech.cpp +++ b/examples/vad-speech-segments/speech.cpp @@ -111,6 +111,10 @@ int main(int argc, char ** argv) { struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params( cli_params.vad_model.c_str(), ctx_params); + if (vctx == nullptr) { + fprintf(stderr, "error: failed to initialize whisper context\n"); + return 2; + } // Detect speech in the input audio file. if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) { From dc8dda60eed32ba3e0a027b729ea5c047c79d9e8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 25 Jun 2025 15:59:23 +0300 Subject: [PATCH 0002/1348] bench : print system info before ctx check --- examples/bench/bench.cpp | 7 +------ scripts/bench-all.sh | 2 ++ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp index 1512f316e54..4dbc1eb9b2a 100644 --- a/examples/bench/bench.cpp +++ b/examples/bench/bench.cpp @@ -66,17 +66,12 @@ static int whisper_bench_full(const whisper_params & params) { cparams.use_gpu = params.use_gpu; cparams.flash_attn = params.flash_attn; - struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); - if (ctx == nullptr) { - fprintf(stderr, "error: failed to initialize whisper context\n"); - return 2; - } - { fprintf(stderr, "\n"); fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info()); } + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); if (ctx == nullptr) { fprintf(stderr, "error: failed to initialize whisper context\n"); return 2; diff --git a/scripts/bench-all.sh b/scripts/bench-all.sh index 01d6143ce31..4c1a7a101fd 100755 --- a/scripts/bench-all.sh +++ b/scripts/bench-all.sh @@ -104,6 +104,8 @@ for model in "${models[@]}"; do if [[ $system_info == *"METAL = 1"* ]]; then config="$config METAL" + elif [[ $system_info == *"Metal : EMBED_LIBRARY = 1"* ]]; then + config="$config METAL" fi commit=$(git rev-parse --short HEAD) From 06bdaa6c0c0e14641404634fb635086c026e76bd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 25 Jun 2025 16:45:19 +0300 Subject: [PATCH 0003/1348] bench : update benches --- scripts/bench-all-gg.txt | 199 ++++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 99 deletions(-) diff --git a/scripts/bench-all-gg.txt b/scripts/bench-all-gg.txt index ad543c22313..eb1c56bed63 100644 --- a/scripts/bench-all-gg.txt +++ b/scripts/bench-all-gg.txt @@ -111,62 +111,61 @@ make -j && ./scripts/bench-all.sh 1 1 0 | CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| M2 ULTRA | METAL | tiny | 1 | 0 | 8.74 | 1.20 | 0.36 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | tiny-q5_0 | 1 | 0 | 10.30 | 1.15 | 0.38 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | tiny-q5_1 | 1 | 0 | 10.71 | 1.13 | 0.38 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | tiny-q8_0 | 1 | 0 | 9.97 | 1.12 | 0.37 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | base | 1 | 0 | 16.77 | 1.71 | 0.44 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | base-q5_0 | 1 | 0 | 16.92 | 1.63 | 0.44 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | base-q5_1 | 1 | 0 | 16.84 | 1.63 | 0.44 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | base-q8_0 | 1 | 0 | 16.12 | 1.63 | 0.44 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | small | 1 | 0 | 45.29 | 3.44 | 0.92 | 0.05 | ad4e350 | -| M2 ULTRA | METAL | small-q5_0 | 1 | 0 | 50.43 | 3.34 | 0.94 | 0.06 | ad4e350 | -| M2 ULTRA | METAL | small-q5_1 | 1 | 0 | 50.49 | 3.35 | 0.93 | 0.06 | ad4e350 | -| M2 ULTRA | METAL | small-q8_0 | 1 | 0 | 47.37 | 3.20 | 0.91 | 0.05 | ad4e350 | -| M2 ULTRA | METAL | medium | 1 | 0 | 122.81 | 7.39 | 1.99 | 0.12 | ad4e350 | -| M2 ULTRA | METAL | medium-q5_0 | 1 | 0 | 140.62 | 6.73 | 2.03 | 0.14 | ad4e350 | -| M2 ULTRA | METAL | medium-q5_1 | 1 | 0 | 140.44 | 6.74 | 2.04 | 0.14 | ad4e350 | -| M2 ULTRA | METAL | medium-q8_0 | 1 | 0 | 131.05 | 6.54 | 1.95 | 0.13 | ad4e350 | -| M2 ULTRA | METAL | medium-dis | 1 | 0 | 110.95 | 0.99 | 0.24 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | large-v2 | 1 | 0 | 222.19 | 10.93 | 3.01 | 0.21 | ad4e350 | -| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 0 | 258.47 | 9.75 | 3.01 | 0.25 | ad4e350 | -| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 0 | 258.40 | 9.85 | 3.01 | 0.24 | ad4e350 | -| M2 ULTRA | METAL | large-v2-q8_0 | 1 | 0 | 236.68 | 9.61 | 2.85 | 0.23 | ad4e350 | -| M2 ULTRA | METAL | large-v2-dis | 1 | 0 | 199.28 | 1.12 | 0.27 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | large-v3-turbo | 1 | 0 | 201.49 | 1.76 | 0.45 | 0.03 | ad4e350 | -| M2 ULTRA | METAL | large-v3-turbo-q5_0 | 1 | 0 | 233.70 | 1.55 | 0.46 | 0.04 | ad4e350 | -| M2 ULTRA | METAL | large-v3-turbo-q8_0 | 1 | 0 | 214.20 | 1.51 | 0.44 | 0.04 | ad4e350 | - +| M2 ULTRA | METAL | tiny | 1 | 0 | 10.15 | 1.20 | 0.36 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | tiny-q5_0 | 1 | 0 | 10.21 | 1.15 | 0.39 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | tiny-q5_1 | 1 | 0 | 9.26 | 1.15 | 0.38 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | tiny-q8_0 | 1 | 0 | 9.00 | 1.12 | 0.37 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | base | 1 | 0 | 15.77 | 1.73 | 0.45 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | base-q5_0 | 1 | 0 | 16.90 | 1.63 | 0.44 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | base-q5_1 | 1 | 0 | 16.93 | 1.64 | 0.44 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | base-q8_0 | 1 | 0 | 16.13 | 1.63 | 0.43 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | small | 1 | 0 | 45.15 | 3.45 | 0.92 | 0.05 | dc8dda60 | +| M2 ULTRA | METAL | small-q5_0 | 1 | 0 | 50.63 | 3.36 | 0.94 | 0.06 | dc8dda60 | +| M2 ULTRA | METAL | small-q5_1 | 1 | 0 | 50.56 | 3.36 | 0.94 | 0.06 | dc8dda60 | +| M2 ULTRA | METAL | small-q8_0 | 1 | 0 | 47.52 | 3.20 | 0.92 | 0.05 | dc8dda60 | +| M2 ULTRA | METAL | medium | 1 | 0 | 122.55 | 7.38 | 1.95 | 0.12 | dc8dda60 | +| M2 ULTRA | METAL | medium-q5_0 | 1 | 0 | 140.61 | 6.73 | 2.02 | 0.14 | dc8dda60 | +| M2 ULTRA | METAL | medium-q5_1 | 1 | 0 | 140.48 | 6.76 | 2.04 | 0.14 | dc8dda60 | +| M2 ULTRA | METAL | medium-q8_0 | 1 | 0 | 131.00 | 6.57 | 1.96 | 0.13 | dc8dda60 | +| M2 ULTRA | METAL | medium-dis | 1 | 0 | 110.85 | 1.00 | 0.24 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | large-v2 | 1 | 0 | 222.28 | 10.96 | 3.03 | 0.21 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 0 | 258.64 | 9.79 | 3.04 | 0.25 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 0 | 258.32 | 9.87 | 3.05 | 0.24 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-q8_0 | 1 | 0 | 236.55 | 9.61 | 2.87 | 0.23 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-dis | 1 | 0 | 199.84 | 1.14 | 0.27 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | large-v3-turbo | 1 | 0 | 201.52 | 1.77 | 0.45 | 0.03 | dc8dda60 | +| M2 ULTRA | METAL | large-v3-turbo-q5_0 | 1 | 0 | 233.14 | 1.56 | 0.47 | 0.04 | dc8dda60 | +| M2 ULTRA | METAL | large-v3-turbo-q8_0 | 1 | 0 | 214.23 | 1.53 | 0.44 | 0.04 | dc8dda60 | make -j && ./scripts/bench-all.sh 1 1 1 | CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| M2 ULTRA | METAL | tiny | 1 | 1 | 7.82 | 1.31 | 0.35 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | tiny-q5_0 | 1 | 1 | 8.32 | 1.28 | 0.37 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | tiny-q5_1 | 1 | 1 | 8.21 | 1.28 | 0.37 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | tiny-q8_0 | 1 | 1 | 7.97 | 1.23 | 0.36 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | base | 1 | 1 | 13.96 | 1.80 | 0.42 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | base-q5_0 | 1 | 1 | 15.19 | 1.75 | 0.42 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | base-q5_1 | 1 | 1 | 15.09 | 1.75 | 0.42 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | base-q8_0 | 1 | 1 | 14.45 | 1.70 | 0.41 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | small | 1 | 1 | 40.08 | 3.54 | 0.86 | 0.05 | ad4e350 | -| M2 ULTRA | METAL | small-q5_0 | 1 | 1 | 45.07 | 3.51 | 0.88 | 0.05 | ad4e350 | -| M2 ULTRA | METAL | small-q5_1 | 1 | 1 | 45.05 | 3.52 | 0.88 | 0.05 | ad4e350 | -| M2 ULTRA | METAL | small-q8_0 | 1 | 1 | 42.04 | 3.34 | 0.85 | 0.05 | ad4e350 | -| M2 ULTRA | METAL | medium | 1 | 1 | 107.20 | 7.28 | 1.79 | 0.11 | ad4e350 | -| M2 ULTRA | METAL | medium-q5_0 | 1 | 1 | 125.02 | 6.67 | 1.83 | 0.12 | ad4e350 | -| M2 ULTRA | METAL | medium-q5_1 | 1 | 1 | 124.83 | 6.70 | 1.84 | 0.12 | ad4e350 | -| M2 ULTRA | METAL | medium-q8_0 | 1 | 1 | 114.56 | 6.53 | 1.79 | 0.11 | ad4e350 | -| M2 ULTRA | METAL | medium-dis | 1 | 1 | 95.96 | 1.01 | 0.23 | 0.01 | ad4e350 | -| M2 ULTRA | METAL | large-v2 | 1 | 1 | 194.29 | 10.57 | 2.67 | 0.20 | ad4e350 | -| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 1 | 230.74 | 9.57 | 2.73 | 0.23 | ad4e350 | -| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 1 | 229.97 | 9.69 | 2.74 | 0.23 | ad4e350 | -| M2 ULTRA | METAL | large-v2-q8_0 | 1 | 1 | 208.11 | 9.37 | 2.60 | 0.21 | ad4e350 | -| M2 ULTRA | METAL | large-v2-dis | 1 | 1 | 172.72 | 1.12 | 0.26 | 0.02 | ad4e350 | -| M2 ULTRA | METAL | large-v3-turbo | 1 | 1 | 174.46 | 1.74 | 0.42 | 0.03 | ad4e350 | -| M2 ULTRA | METAL | large-v3-turbo-q5_0 | 1 | 1 | 205.78 | 1.54 | 0.42 | 0.04 | ad4e350 | -| M2 ULTRA | METAL | large-v3-turbo-q8_0 | 1 | 1 | 186.33 | 1.50 | 0.40 | 0.03 | ad4e350 | +| M2 ULTRA | METAL | tiny | 1 | 1 | 7.72 | 1.05 | 0.32 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | tiny-q5_0 | 1 | 1 | 8.20 | 0.98 | 0.31 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | tiny-q5_1 | 1 | 1 | 8.13 | 0.99 | 0.31 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | tiny-q8_0 | 1 | 1 | 7.96 | 0.93 | 0.30 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | base | 1 | 1 | 13.52 | 1.39 | 0.35 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | base-q5_0 | 1 | 1 | 14.88 | 1.31 | 0.34 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | base-q5_1 | 1 | 1 | 14.76 | 1.33 | 0.34 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | base-q8_0 | 1 | 1 | 14.04 | 1.28 | 0.34 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | small | 1 | 1 | 38.78 | 2.72 | 0.67 | 0.04 | dc8dda60 | +| M2 ULTRA | METAL | small-q5_0 | 1 | 1 | 44.01 | 2.64 | 0.69 | 0.05 | dc8dda60 | +| M2 ULTRA | METAL | small-q5_1 | 1 | 1 | 44.02 | 2.66 | 0.69 | 0.05 | dc8dda60 | +| M2 ULTRA | METAL | small-q8_0 | 1 | 1 | 40.79 | 2.49 | 0.67 | 0.05 | dc8dda60 | +| M2 ULTRA | METAL | medium | 1 | 1 | 104.48 | 5.57 | 1.61 | 0.10 | dc8dda60 | +| M2 ULTRA | METAL | medium-q5_0 | 1 | 1 | 122.24 | 5.00 | 1.58 | 0.12 | dc8dda60 | +| M2 ULTRA | METAL | medium-q5_1 | 1 | 1 | 121.99 | 5.02 | 1.59 | 0.12 | dc8dda60 | +| M2 ULTRA | METAL | medium-q8_0 | 1 | 1 | 111.68 | 4.99 | 1.52 | 0.11 | dc8dda60 | +| M2 ULTRA | METAL | medium-dis | 1 | 1 | 93.23 | 0.87 | 0.21 | 0.01 | dc8dda60 | +| M2 ULTRA | METAL | large-v2 | 1 | 1 | 189.82 | 8.36 | 2.35 | 0.19 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-q5_0 | 1 | 1 | 225.73 | 7.34 | 2.40 | 0.22 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-q5_1 | 1 | 1 | 225.88 | 7.60 | 2.40 | 0.22 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-q8_0 | 1 | 1 | 203.55 | 7.32 | 2.26 | 0.20 | dc8dda60 | +| M2 ULTRA | METAL | large-v2-dis | 1 | 1 | 168.20 | 0.98 | 0.24 | 0.02 | dc8dda60 | +| M2 ULTRA | METAL | large-v3-turbo | 1 | 1 | 170.22 | 1.46 | 0.37 | 0.03 | dc8dda60 | +| M2 ULTRA | METAL | large-v3-turbo-q5_0 | 1 | 1 | 201.88 | 1.27 | 0.38 | 0.04 | dc8dda60 | +| M2 ULTRA | METAL | large-v3-turbo-q8_0 | 1 | 1 | 182.37 | 1.24 | 0.36 | 0.03 | dc8dda60 | ## M4 Max @@ -219,67 +218,69 @@ make -j && ./scripts/bench-all.sh 1 1 0 | CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| M4 Max | METAL | tiny | 1 | 0 | 13.12 | 0.87 | 0.29 | 0.01 | ad4e3509 | -| M4 Max | METAL | tiny-q8_0 | 1 | 0 | 15.90 | 0.88 | 0.31 | 0.01 | ad4e3509 | -| M4 Max | METAL | base | 1 | 0 | 23.10 | 1.42 | 0.34 | 0.02 | ad4e3509 | -| M4 Max | METAL | base-q8_0 | 1 | 0 | 27.25 | 1.31 | 0.34 | 0.02 | ad4e3509 | -| M4 Max | METAL | small | 1 | 0 | 71.76 | 3.02 | 0.70 | 0.06 | ad4e3509 | -| M4 Max | METAL | small-q8_0 | 1 | 0 | 73.88 | 2.60 | 0.71 | 0.06 | ad4e3509 | -| M4 Max | METAL | medium | 1 | 0 | 208.22 | 6.94 | 1.55 | 0.16 | ad4e3509 | -| M4 Max | METAL | medium-q8_0 | 1 | 0 | 214.65 | 5.90 | 1.57 | 0.17 | ad4e3509 | -| M4 Max | METAL | large-v2 | 1 | 0 | 381.72 | 11.28 | 2.51 | 0.29 | ad4e3509 | -| M4 Max | METAL | large-v2-q8_0 | 1 | 0 | 394.97 | 8.90 | 2.45 | 0.30 | ad4e3509 | +| M4 Max | METAL | tiny | 1 | 0 | 12.83 | 0.94 | 0.30 | 0.01 | dc8dda60 | +| M4 Max | METAL | tiny-q8_0 | 1 | 0 | 12.95 | 0.80 | 0.31 | 0.01 | dc8dda60 | +| M4 Max | METAL | base | 1 | 0 | 23.54 | 1.37 | 0.33 | 0.02 | dc8dda60 | +| M4 Max | METAL | base-q8_0 | 1 | 0 | 24.14 | 1.24 | 0.33 | 0.02 | dc8dda60 | +| M4 Max | METAL | small | 1 | 0 | 71.59 | 3.02 | 0.71 | 0.06 | dc8dda60 | +| M4 Max | METAL | small-q8_0 | 1 | 0 | 73.34 | 2.65 | 0.72 | 0.06 | dc8dda60 | +| M4 Max | METAL | medium | 1 | 0 | 208.53 | 7.02 | 1.58 | 0.16 | dc8dda60 | +| M4 Max | METAL | medium-q8_0 | 1 | 0 | 212.87 | 6.00 | 1.58 | 0.17 | dc8dda60 | +| M4 Max | METAL | large-v2 | 1 | 0 | 379.84 | 11.47 | 2.52 | 0.29 | dc8dda60 | +| M4 Max | METAL | large-v2-q8_0 | 1 | 0 | 390.45 | 9.19 | 2.48 | 0.29 | dc8dda60 | +| M4 Max | METAL | large-v3-turbo | 1 | 0 | 345.74 | 1.99 | 0.44 | 0.05 | dc8dda60 | make -j && ./scripts/bench-all.sh 1 1 1 | CPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| M4 Max | METAL | tiny | 1 | 1 | 15.22 | 0.89 | 0.26 | 0.01 | ad4e3509 | -| M4 Max | METAL | tiny-q8_0 | 1 | 1 | 14.70 | 0.86 | 0.26 | 0.01 | ad4e3509 | -| M4 Max | METAL | base | 1 | 1 | 25.33 | 1.36 | 0.30 | 0.02 | ad4e3509 | -| M4 Max | METAL | base-q8_0 | 1 | 1 | 21.27 | 1.31 | 0.30 | 0.02 | ad4e3509 | -| M4 Max | METAL | small | 1 | 1 | 58.43 | 2.78 | 0.60 | 0.05 | ad4e3509 | -| M4 Max | METAL | small-q8_0 | 1 | 1 | 60.26 | 2.39 | 0.60 | 0.05 | ad4e3509 | -| M4 Max | METAL | medium | 1 | 1 | 169.73 | 6.03 | 1.31 | 0.14 | ad4e3509 | -| M4 Max | METAL | medium-q8_0 | 1 | 1 | 176.61 | 4.99 | 1.31 | 0.14 | ad4e3509 | -| M4 Max | METAL | large-v2 | 1 | 1 | 316.18 | 9.60 | 2.08 | 0.24 | ad4e3509 | -| M4 Max | METAL | large-v2-q8_0 | 1 | 1 | 329.59 | 7.55 | 2.08 | 0.25 | ad4e3509 | +| M4 Max | METAL | tiny | 1 | 1 | 11.70 | 0.74 | 0.23 | 0.01 | dc8dda60 | +| M4 Max | METAL | tiny-q8_0 | 1 | 1 | 12.36 | 0.67 | 0.23 | 0.01 | dc8dda60 | +| M4 Max | METAL | base | 1 | 1 | 21.76 | 1.12 | 0.25 | 0.02 | dc8dda60 | +| M4 Max | METAL | base-q8_0 | 1 | 1 | 22.60 | 0.94 | 0.26 | 0.02 | dc8dda60 | +| M4 Max | METAL | small | 1 | 1 | 67.26 | 2.27 | 0.50 | 0.06 | dc8dda60 | +| M4 Max | METAL | small-q8_0 | 1 | 1 | 68.67 | 1.93 | 0.53 | 0.06 | dc8dda60 | +| M4 Max | METAL | medium | 1 | 1 | 193.58 | 5.31 | 1.20 | 0.16 | dc8dda60 | +| M4 Max | METAL | medium-q8_0 | 1 | 1 | 198.60 | 4.31 | 1.21 | 0.16 | dc8dda60 | +| M4 Max | METAL | large-v2 | 1 | 1 | 357.54 | 8.73 | 1.99 | 0.27 | dc8dda60 | +| M4 Max | METAL | large-v2-q8_0 | 1 | 1 | 363.98 | 6.43 | 1.99 | 0.28 | dc8dda60 | +| M4 Max | METAL | large-v3-turbo | 1 | 1 | 322.32 | 1.66 | 0.37 | 0.05 | dc8dda60 | # V100 -WHISPER_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 0 +GGML_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 0 | GPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| V100 | AVX2 CUDA | tiny | 8 | 0 | 6.15 | 1.02 | 0.30 | 0.01 | ad4e3509 | -| V100 | AVX2 CUDA | tiny-q5_1 | 8 | 0 | 5.92 | 0.96 | 0.25 | 0.01 | ad4e3509 | -| V100 | AVX2 CUDA | base | 8 | 0 | 10.60 | 1.43 | 0.43 | 0.02 | ad4e3509 | -| V100 | AVX2 CUDA | base-q5_1 | 8 | 0 | 10.80 | 1.37 | 0.36 | 0.02 | ad4e3509 | -| V100 | AVX2 CUDA | small | 8 | 0 | 31.83 | 2.82 | 0.87 | 0.04 | ad4e3509 | -| V100 | AVX2 CUDA | small-q5_1 | 8 | 0 | 31.88 | 2.68 | 0.72 | 0.04 | ad4e3509 | -| V100 | AVX2 CUDA | medium | 8 | 0 | 81.30 | 6.02 | 1.81 | 0.09 | ad4e3509 | -| V100 | AVX2 CUDA | medium-q5_0 | 8 | 0 | 83.21 | 5.44 | 1.41 | 0.10 | ad4e3509 | -| V100 | AVX2 CUDA | large-v2 | 8 | 0 | 134.81 | 8.64 | 2.69 | 0.14 | ad4e3509 | -| V100 | AVX2 CUDA | large-v2-q5_0 | 8 | 0 | 138.95 | 7.57 | 2.04 | 0.15 | ad4e3509 | -| V100 | AVX2 CUDA | large-v3-turbo | 8 | 0 | 124.42 | 1.37 | 0.43 | 0.02 | ad4e3509 | -| V100 | AVX2 CUDA | large-v3-turbo-q5_0 | 8 | 0 | 127.81 | 1.13 | 0.32 | 0.03 | ad4e3509 | - - -WHISPER_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 1 +| V100 | AVX2 CUDA | tiny | 8 | 0 | 5.99 | 1.01 | 0.30 | 0.01 | dc8dda60 | +| V100 | AVX2 CUDA | tiny-q5_1 | 8 | 0 | 6.07 | 1.00 | 0.26 | 0.01 | dc8dda60 | +| V100 | AVX2 CUDA | base | 8 | 0 | 10.96 | 1.44 | 0.43 | 0.02 | dc8dda60 | +| V100 | AVX2 CUDA | base-q5_1 | 8 | 0 | 11.11 | 1.41 | 0.37 | 0.02 | dc8dda60 | +| V100 | AVX2 CUDA | small | 8 | 0 | 31.04 | 2.84 | 0.86 | 0.04 | dc8dda60 | +| V100 | AVX2 CUDA | small-q5_1 | 8 | 0 | 31.69 | 2.82 | 0.71 | 0.04 | dc8dda60 | +| V100 | AVX2 CUDA | medium | 8 | 0 | 83.95 | 6.05 | 1.82 | 0.09 | dc8dda60 | +| V100 | AVX2 CUDA | medium-q5_0 | 8 | 0 | 85.86 | 5.58 | 1.45 | 0.10 | dc8dda60 | +| V100 | AVX2 CUDA | large-v2 | 8 | 0 | 138.50 | 8.70 | 2.71 | 0.15 | dc8dda60 | +| V100 | AVX2 CUDA | large-v2-q5_0 | 8 | 0 | 142.31 | 7.82 | 2.03 | 0.16 | dc8dda60 | +| V100 | AVX2 CUDA | large-v3-turbo | 8 | 0 | 128.39 | 1.42 | 0.44 | 0.02 | dc8dda60 | +| V100 | AVX2 CUDA | large-v3-turbo-q5_0 | 8 | 0 | 131.24 | 1.17 | 0.33 | 0.03 | dc8dda60 | + + +GGML_CUDA=1 make -j && ./scripts/bench-all.sh 8 1 1 | GPU | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| V100 | AVX2 CUDA | tiny | 8 | 1 | 4.01 | 0.90 | 0.25 | 0.01 | ad4e3509 | -| V100 | AVX2 CUDA | tiny-q5_1 | 8 | 1 | 4.12 | 0.88 | 0.18 | 0.01 | ad4e3509 | -| V100 | AVX2 CUDA | base | 8 | 1 | 7.00 | 1.30 | 0.35 | 0.01 | ad4e3509 | -| V100 | AVX2 CUDA | base-q5_1 | 8 | 1 | 7.22 | 1.21 | 0.26 | 0.02 | ad4e3509 | -| V100 | AVX2 CUDA | small | 8 | 1 | 18.68 | 2.39 | 0.69 | 0.03 | ad4e3509 | -| V100 | AVX2 CUDA | small-q5_1 | 8 | 1 | 19.38 | 2.32 | 0.51 | 0.03 | ad4e3509 | -| V100 | AVX2 CUDA | medium | 8 | 1 | 53.17 | 5.15 | 1.45 | 0.06 | ad4e3509 | -| V100 | AVX2 CUDA | medium-q5_0 | 8 | 1 | 55.09 | 4.64 | 1.05 | 0.07 | ad4e3509 | -| V100 | AVX2 CUDA | large-v2 | 8 | 1 | 85.77 | 7.57 | 2.19 | 0.10 | ad4e3509 | -| V100 | AVX2 CUDA | large-v2-q5_0 | 8 | 1 | 89.24 | 6.48 | 1.48 | 0.11 | ad4e3509 | -| V100 | AVX2 CUDA | large-v3-turbo | 8 | 1 | 75.56 | 1.25 | 0.37 | 0.02 | ad4e3509 | -| V100 | AVX2 CUDA | large-v3-turbo-q5_0 | 8 | 1 | 78.48 | 1.01 | 0.24 | 0.02 | ad4e3509 | +| V100 | AVX2 CUDA | tiny | 8 | 1 | 4.85 | 0.97 | 0.26 | 0.01 | dc8dda60 | +| V100 | AVX2 CUDA | tiny-q5_1 | 8 | 1 | 4.97 | 0.89 | 0.19 | 0.01 | dc8dda60 | +| V100 | AVX2 CUDA | base | 8 | 1 | 7.23 | 1.28 | 0.35 | 0.02 | dc8dda60 | +| V100 | AVX2 CUDA | base-q5_1 | 8 | 1 | 7.38 | 1.24 | 0.26 | 0.02 | dc8dda60 | +| V100 | AVX2 CUDA | small | 8 | 1 | 20.87 | 2.44 | 0.71 | 0.03 | dc8dda60 | +| V100 | AVX2 CUDA | small-q5_1 | 8 | 1 | 19.80 | 2.35 | 0.51 | 0.03 | dc8dda60 | +| V100 | AVX2 CUDA | medium | 8 | 1 | 54.56 | 5.31 | 1.46 | 0.06 | dc8dda60 | +| V100 | AVX2 CUDA | medium-q5_0 | 8 | 1 | 56.09 | 4.67 | 1.05 | 0.07 | dc8dda60 | +| V100 | AVX2 CUDA | large-v2 | 8 | 1 | 87.05 | 7.65 | 2.16 | 0.10 | dc8dda60 | +| V100 | AVX2 CUDA | large-v2-q5_0 | 8 | 1 | 94.65 | 6.60 | 1.47 | 0.11 | dc8dda60 | +| V100 | AVX2 CUDA | large-v3-turbo | 8 | 1 | 76.46 | 1.29 | 0.37 | 0.02 | dc8dda60 | +| V100 | AVX2 CUDA | large-v3-turbo-q5_0 | 8 | 1 | 79.62 | 1.03 | 0.23 | 0.02 | dc8dda60 | From a8d002cfd879315632a579e73f0148d06959de36 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 25 Jun 2025 16:47:03 +0300 Subject: [PATCH 0004/1348] release : v1.7.6 --- CMakeLists.txt | 2 +- README.md | 6 ++++-- bindings/javascript/package.json | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf8d3c2003b..36eef350c09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories. project("whisper.cpp" C CXX) -project("whisper.cpp" VERSION 1.7.5) +project("whisper.cpp" VERSION 1.7.6) include(CheckIncludeFileCXX) set(SOVERSION 1) diff --git a/README.md b/README.md index 0298818cc24..2e92a27f07c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp) [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/) -Stable: [v1.7.5](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.7.5) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/) +Stable: [v1.7.6](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.7.6) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/) High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model: @@ -709,7 +709,9 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model ## XCFramework The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS, and macOS. It can be used in Swift projects without the need to compile the -library from source. For examples: +library from source. For example, the v1.7.5 version of the XCFramework can be +used as follows: + ```swift // swift-tools-version: 5.10 // The swift-tools-version declares the minimum version of Swift required to build this package. diff --git a/bindings/javascript/package.json b/bindings/javascript/package.json index df72d132943..3d0e0710519 100644 --- a/bindings/javascript/package.json +++ b/bindings/javascript/package.json @@ -1,6 +1,6 @@ { "name": "whisper.cpp", - "version": "1.7.5", + "version": "1.7.6", "description": "Whisper speech recognition", "main": "whisper.js", "scripts": { From 4daf7050ca2bf17f5166f45ac6da651c4e33f293 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 25 Jun 2025 21:43:58 +0200 Subject: [PATCH 0005/1348] ci : add support for tag-based releases (#3287) This commit modifies the GitHub Actions workflow to support tag-based releases. When a tag is pushed that starts with 'v', the workflow will use that tag name for the release process. I think this was the once the behavior, but it was lost in updates that I've made to the workflow. This commit restores that functionality. --- .github/workflows/build.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3568db32b71..48aa640a185 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,6 +4,8 @@ on: push: branches: - master + tags: + - 'v*' pull_request: types: [opened, synchronize, reopened] workflow_dispatch: @@ -62,8 +64,11 @@ jobs: echo "BRANCH_NAME: ${{ env.BRANCH_NAME }}" echo "CUSTOM_TAG: $CUSTOM_TAG" + if [[ "${{ github.ref_type }}" == "tag" ]]; then + echo "Using pushed tag name" + TAG_NAME="${{ github.ref_name }}" # Use custom tag if provided - if [[ -n "$CUSTOM_TAG" ]]; then + elif [[ -n "$CUSTOM_TAG" ]]; then echo "Using custom tag" TAG_NAME="${CUSTOM_TAG}" elif [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then @@ -1226,7 +1231,8 @@ jobs: ./build/bin/quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0 release: - if: ${{ github.event.inputs.create_release == 'true' || github.event.inputs.pre_release_tag != '' }} + if: ${{ github.event.inputs.create_release == 'true' || github.event.inputs.pre_release_tag != '' || startsWith(github.ref, 'refs/tags/v') }} + runs-on: ubuntu-latest From 897b071dc68f11bc41f55e26b741902e3a4e4c10 Mon Sep 17 00:00:00 2001 From: toboil-features <160222185+toboil-features@users.noreply.github.com> Date: Thu, 26 Jun 2025 14:23:19 +0300 Subject: [PATCH 0006/1348] docs : add cmake "-j" flag in README.md (#3284) Make cmake commands encounter multithreading in README.md file. --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2e92a27f07c..6b81a54f7d8 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Now build the [whisper-cli](examples/cli) example and transcribe an audio file l ```bash # build the project cmake -B build -cmake --build build --config Release +cmake --build build -j --config Release # transcribe an audio file ./build/bin/whisper-cli -f samples/jfk.wav @@ -149,7 +149,7 @@ standard cmake setup with: ```bash # build with GGML_BLAS defined cmake -B build -DGGML_BLAS=1 -cmake --build build --config Release +cmake --build build -j --config Release ./build/bin/whisper-cli [ .. etc .. ] ``` @@ -163,7 +163,7 @@ Here are the steps for creating and using a quantized model: ```bash # quantize a model with Q5_0 method cmake -B build -cmake --build build --config Release +cmake --build build -j --config Release ./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0 # run the examples as usual, specifying the quantized model file @@ -489,7 +489,7 @@ You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installe ```bash cmake -B build -DWHISPER_SDL2=ON -cmake --build build --config Release +cmake --build build -j --config Release ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000 ``` From 35034c5aea48aa0da75c4d853ec7e8a3055adbc5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 26 Jun 2025 16:29:29 +0200 Subject: [PATCH 0007/1348] ci : add should_release variable (#3288) * ci : add should_release variable This commit adds a `should_release` variable to the GitHub Actions workflow to determine if a release should be created based on the tag or branch conditions. The motivation for this that it simplifies the logic for deciding whether to upload artifacts or not, making it easier to maintain if we need to change the conditions in the future. * ci : set release draft to true --- .github/workflows/build.yml | 40 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 48aa640a185..0e80bdfae78 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -43,6 +43,7 @@ jobs: runs-on: ubuntu-latest outputs: tag_name: ${{ steps.tag.outputs.name }} + should_release: ${{ steps.tag.outputs.should_release }} steps: - name: Checkout with full history @@ -57,6 +58,7 @@ jobs: BUILD_NUMBER=$(git rev-list --count HEAD) SHORT_HASH=$(git rev-parse --short=7 HEAD) CUSTOM_TAG="${{ github.event.inputs.pre_release_tag }}" + SHOULD_RELEASE="false" echo "Raw values:" echo "BUILD_NUMBER: $BUILD_NUMBER" @@ -67,21 +69,31 @@ jobs: if [[ "${{ github.ref_type }}" == "tag" ]]; then echo "Using pushed tag name" TAG_NAME="${{ github.ref_name }}" - # Use custom tag if provided + SHOULD_RELEASE="true" elif [[ -n "$CUSTOM_TAG" ]]; then echo "Using custom tag" TAG_NAME="${CUSTOM_TAG}" + SHOULD_RELEASE="true" + elif [[ "${{ github.event.inputs.create_release }}" == "true" ]]; then + echo "Manual release requested" + SHOULD_RELEASE="true" + TAG_NAME="b${BUILD_NUMBER}" elif [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then echo "Using master branch format" TAG_NAME="b${BUILD_NUMBER}" + SHOULD_RELEASE="false" else echo "Using non-master branch format" SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') TAG_NAME="${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" + SHOULD_RELEASE="false" fi echo "Final tag name: $TAG_NAME" + echo "Should release: $SHOULD_RELEASE" echo "name=$TAG_NAME" >> $GITHUB_OUTPUT + echo "should_release=$SHOULD_RELEASE" >> $GITHUB_OUTPUT + ubuntu-22: if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' || @@ -584,6 +596,7 @@ jobs: if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' || github.event.inputs.run_type == 'full-ci' }} runs-on: windows-latest + needs: determine-tag strategy: matrix: @@ -667,9 +680,7 @@ jobs: Compress-Archive -Path "build/bin/${{ matrix.build }}" -DestinationPath "whisper-bin-${{ matrix.arch }}.zip" - name: Upload binaries - if: matrix.sdl2 == 'ON' && ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') || - github.event.inputs.create_release == 'true' || - github.event.inputs.pre_release_tag != '' }} + if: matrix.sdl2 == 'ON' && ${{ needs.determine-tag.outputs.should_release }} uses: actions/upload-artifact@v4 with: name: whisper-bin-${{ matrix.arch }}.zip @@ -755,9 +766,7 @@ jobs: Compress-Archive -Path "build/bin/${{ matrix.build }}" -DestinationPath "whisper-blas-bin-${{ matrix.arch }}.zip" - name: Upload binaries - if: matrix.blas == 'ON' && matrix.sdl2 == 'ON' && ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') || - github.event.inputs.create_release == 'true' || - github.event.inputs.pre_release_tag != '' }} + if: matrix.blas == 'ON' && matrix.sdl2 == 'ON' && ${{ needs.determine-tag.outputs.should_release }} uses: actions/upload-artifact@v4 with: name: whisper-blas-bin-${{ matrix.arch }}.zip @@ -767,6 +776,7 @@ jobs: if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' || github.event.inputs.run_type == 'full-ci' }} runs-on: windows-2022 + needs: determine-tag strategy: fail-fast: false matrix: @@ -965,9 +975,7 @@ jobs: Compress-Archive -Path "build/bin/${{ matrix.build }}" -DestinationPath "whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip" - name: Upload binaries - if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') || - github.event.inputs.create_release == 'true' || - github.event.inputs.pre_release_tag != '' }} + if: ${{ needs.determine-tag.outputs.should_release }} uses: actions/upload-artifact@v4 with: name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip @@ -1044,16 +1052,11 @@ jobs: - name: Pack artifacts id: pack_artifacts - if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') || - github.event.inputs.create_release == 'true' || - github.event.inputs.pre_release_tag != '' }} run: | zip --symlinks -r whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip build-apple/whisper.xcframework - name: Upload artifacts - if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') || - github.event.inputs.create_release == 'true' || - github.event.inputs.pre_release_tag != '' }} + if: ${{ needs.determine-tag.outputs.should_release }} uses: actions/upload-artifact@v4 with: path: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip @@ -1233,7 +1236,6 @@ jobs: release: if: ${{ github.event.inputs.create_release == 'true' || github.event.inputs.pre_release_tag != '' || startsWith(github.ref, 'refs/tags/v') }} - runs-on: ubuntu-latest needs: @@ -1275,6 +1277,7 @@ jobs: with: tag_name: ${{ needs.determine-tag.outputs.tag_name }} prerelease: ${{ github.event.inputs.pre_release_tag != '' }} + draft: true - name: Upload release id: upload_release @@ -1301,7 +1304,8 @@ jobs: coreml-base-en: if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') || github.event.inputs.create_release == 'true' || - github.event.inputs.pre_release_tag != '' }} + github.event.inputs.pre_release_tag != '' || + startsWith(github.ref, 'refs/tags/v') }} runs-on: macos-latest needs: determine-tag From 32cf4e2aba799aff069011f37ca025401433cf9f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 26 Jun 2025 18:09:42 +0200 Subject: [PATCH 0008/1348] whisper : add version function (#3289) * whisper : add version function This commit adds a version function to the whisper API. The motivation for this is that it might be convenient to have a way to programmatically check the version. Example usage: ```c++ printf("Using whisper version: %s\n", whisper_version()); ``` Will output: ```console Using whisper version: 1.7.6 ``` * examples : add version to android example CMakeLists.txt --- CMakeLists.txt | 4 ++++ .../app/src/main/jni/whisper/CMakeLists.txt | 12 ++++++++++++ .../lib/src/main/jni/whisper/CMakeLists.txt | 12 ++++++++++++ include/whisper.h | 2 ++ src/whisper.cpp | 4 ++++ 5 files changed, 34 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 36eef350c09..989e94ba93d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,10 @@ get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS) set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h) install(TARGETS whisper LIBRARY PUBLIC_HEADER) +target_compile_definitions(whisper PRIVATE + WHISPER_VERSION="${PROJECT_VERSION}" +) + configure_package_config_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake diff --git a/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt b/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt index 3514fbe28c7..b22bb15be9f 100644 --- a/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt +++ b/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt @@ -5,6 +5,17 @@ project(whisper.cpp) set(CMAKE_CXX_STANDARD 17) set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../) +# Get whisper.cpp version +file(READ "${WHISPER_LIB_DIR}/CMakeLists.txt" MAIN_CMAKE_CONTENT) +string(REGEX MATCH "project\\(\"whisper\\.cpp\" VERSION ([0-9]+\\.[0-9]+\\.[0-9]+)\\)" VERSION_MATCH "${MAIN_CMAKE_CONTENT}") +if(CMAKE_MATCH_1) + set(WHISPER_VERSION ${CMAKE_MATCH_1} PARENT_SCOPE) +else() + set(WHISPER_VERSION "unknown" PARENT_SCOPE) +endif() + +message(STATUS " Whisper version: ${WHISPER_VERSION}") + set(SOURCE_FILES ${WHISPER_LIB_DIR}/src/whisper.cpp ${CMAKE_SOURCE_DIR}/jni.c @@ -26,6 +37,7 @@ function(build_library target_name) target_link_libraries(${target_name} ${LOG_LIB} android ggml) target_compile_definitions(${target_name} PUBLIC GGML_USE_CPU) + target_compile_definitions(${target_name} PRIVATE WHISPER_VERSION="${WHISPER_VERSION}") if (${target_name} STREQUAL "whisper_v8fp16_va") target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16) diff --git a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt index c986b417d55..a82d0d7b740 100644 --- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt +++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt @@ -5,6 +5,17 @@ project(whisper.cpp) set(CMAKE_CXX_STANDARD 17) set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../..) +# Get whisper.cpp version +file(READ "${WHISPER_LIB_DIR}/CMakeLists.txt" MAIN_CMAKE_CONTENT) +string(REGEX MATCH "project\\(\"whisper\\.cpp\" VERSION ([0-9]+\\.[0-9]+\\.[0-9]+)\\)" VERSION_MATCH "${MAIN_CMAKE_CONTENT}") +if(CMAKE_MATCH_1) + set(WHISPER_VERSION ${CMAKE_MATCH_1} PARENT_SCOPE) +else() + set(WHISPER_VERSION "unknown" PARENT_SCOPE) +endif() + +message(STATUS " Whisper version: ${WHISPER_VERSION}") + # Path to external GGML, otherwise uses the copy in whisper.cpp. option(GGML_HOME "whisper: Path to external GGML source" OFF) @@ -26,6 +37,7 @@ function(build_library target_name) ) target_compile_definitions(${target_name} PUBLIC GGML_USE_CPU) + target_compile_definitions(${target_name} PRIVATE WHISPER_VERSION="${WHISPER_VERSION}") if (${target_name} STREQUAL "whisper_v8fp16_va") target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16) diff --git a/include/whisper.h b/include/whisper.h index 4aeda98f334..fcd756a9fe2 100644 --- a/include/whisper.h +++ b/include/whisper.h @@ -198,6 +198,8 @@ extern "C" { float samples_overlap; // Overlap in seconds when copying audio samples from speech segment. } whisper_vad_params; + WHISPER_API const char * whisper_version(void); + // Various functions for loading a ggml whisper model. // Allocate (almost) all memory needed for the model. // Return NULL on failure diff --git a/src/whisper.cpp b/src/whisper.cpp index fe3e135bee6..347cc178ee7 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -8938,6 +8938,10 @@ void whisper_log_set(ggml_log_callback log_callback, void * user_data) { ggml_log_set(g_state.log_callback, g_state.log_callback_user_data); } +const char * whisper_version(void) { + return WHISPER_VERSION; +} + GGML_ATTRIBUTE_FORMAT(2, 3) static void whisper_log_internal(ggml_log_level level, const char * format, ...) { va_list args; From f8abbeb2346bb2659a9729c69478b548e26cdabe Mon Sep 17 00:00:00 2001 From: KITAITI Makoto Date: Fri, 27 Jun 2025 11:41:26 +0900 Subject: [PATCH 0009/1348] ruby : add Whisper::VERSION (#3292) * Add a test for segment * Check option existence * Use more proper variable to define build option * Assert Core ML enabled * Define Whisper::VERSION * Add test for Whisper::VERSION * Add signature of Whisper::VERSION --- bindings/ruby/ext/options.rb | 9 ++++++--- bindings/ruby/ext/ruby_whisper.c | 1 + bindings/ruby/sig/whisper.rbs | 1 + bindings/ruby/test/test_package.rb | 3 ++- bindings/ruby/test/test_segment.rb | 10 ++++++++++ bindings/ruby/test/test_whisper.rb | 4 ++++ 6 files changed, 24 insertions(+), 4 deletions(-) diff --git a/bindings/ruby/ext/options.rb b/bindings/ruby/ext/options.rb index 30cda0f8018..ede80c0656b 100644 --- a/bindings/ruby/ext/options.rb +++ b/bindings/ruby/ext/options.rb @@ -64,7 +64,7 @@ def configure_metal def configure_coreml if enabled?("WHISPER_COREML") $LDFLAGS << " -framework Foundation -framework CoreML" - $CPPFLAGS << " -DRUBY_WHISPER_USE_COREML" + $defs << "-DRUBY_WHISPER_USE_COREML" end end @@ -73,10 +73,13 @@ def option_name(name) end def enabled?(option) - if @options[option][1].nil? + op = @options[option] + raise "Option not exist: #{option}" unless op + raise "Option not boolean: #{option}(#{op[0]})" unless op[0] == "BOOL" + if op[1].nil? cmake_options[option][1] else - @options[option][1] + op[1] end end end diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c index 35c196abb41..533bda74205 100644 --- a/bindings/ruby/ext/ruby_whisper.c +++ b/bindings/ruby/ext/ruby_whisper.c @@ -148,6 +148,7 @@ void Init_whisper() { mWhisper = rb_define_module("Whisper"); mVAD = rb_define_module_under(mWhisper, "VAD"); + rb_define_const(mWhisper, "VERSION", rb_str_new2(whisper_version())); rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE)); rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO)); rb_define_const(mWhisper, "LOG_LEVEL_WARN", INT2NUM(GGML_LOG_LEVEL_WARN)); diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index e6c0e139492..5966ce31592 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -10,6 +10,7 @@ module Whisper type encoder_begin_callback = ^(Whisper::Context, void, Object user_data) -> void type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish + VERSION: String LOG_LEVEL_NONE: Integer LOG_LEVEL_INFO: Integer LOG_LEVEL_WARN: Integer diff --git a/bindings/ruby/test/test_package.rb b/bindings/ruby/test/test_package.rb index 33cd2a3c195..108f34efbeb 100644 --- a/bindings/ruby/test/test_package.rb +++ b/bindings/ruby/test/test_package.rb @@ -31,10 +31,11 @@ def test_install_with_coreml Dir.mktmpdir do |dir| system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", "--", "--enable-whisper-coreml", exception: true assert_installed dir, gemspec.version + libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib") assert_nothing_raised do - libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib") system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true end + assert_match(/COREML = 1/, `ruby -I #{libdir.shellescape} -r whisper -e 'puts Whisper.system_info_str'`) end end end diff --git a/bindings/ruby/test/test_segment.rb b/bindings/ruby/test/test_segment.rb index 5b63e0c4455..cb4ba9eb705 100644 --- a/bindings/ruby/test/test_segment.rb +++ b/bindings/ruby/test/test_segment.rb @@ -72,6 +72,16 @@ def test_on_new_segment_twice whisper.transcribe(AUDIO, params) end + def test_transcription_after_segment_retrieved + params = Whisper::Params.new + segment = whisper.each_segment.first + assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text) + + whisper.transcribe(AUDIO, Whisper::Params.new(offset: 5000)) + assert_not_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text) + assert_match(/what you can do for your country/i, segment.text) + end + def test_pattern_matching segment = whisper.each_segment.first segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:} diff --git a/bindings/ruby/test/test_whisper.rb b/bindings/ruby/test/test_whisper.rb index e429c54317f..12b82a8de09 100644 --- a/bindings/ruby/test/test_whisper.rb +++ b/bindings/ruby/test/test_whisper.rb @@ -116,6 +116,10 @@ def test_system_info_str assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str) end + def test_version + assert_kind_of String, Whisper::VERSION + end + def test_log_set user_data = Object.new logs = [] From 70693944470a764157f59fa6c222105b4a95ca95 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 27 Jun 2025 09:55:56 +0200 Subject: [PATCH 0010/1348] ci: set fail-fast to false in docker.yml (#3294) * ci: set fail-fast to false in docker.yml This commit modifies the GitHub Actions workflow for Docker builds to disable the fail-fast behavior. The motivation for this is that currently if one of the strategy jobs fails any other job that is in progress will be cancelled. There is no need for this as the jobs are independent. * ci : update docker.yml to use a single build This commit updates the docker job to only build the image once instead of twice (only happens when pushing to the master branch). Instead this will tag the image with the commit SHA when pushing to master. The motivation for this change is to reduce the time it takes to run this job and also it might help with the disk space issues we are experiencing for this job when it runs on pushes to master. --- .github/workflows/docker.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index c5e9e90d71b..6cde0510229 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -15,6 +15,7 @@ jobs: env: COMMIT_SHA: ${{ github.sha }} strategy: + fail-fast: false matrix: config: - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" } @@ -41,21 +42,20 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Build and push Docker image (versioned) - if: github.event_name == 'push' - uses: docker/build-push-action@v5 - with: - context: . - push: true - platforms: ${{ matrix.config.platform }} - tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" - file: ${{ matrix.config.dockerfile }} + - name: Generate tags + id: tags + run: | + TAGS="ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}" + if [ "${{ github.event_name }}" == "push" ]; then + TAGS="$TAGS,ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + fi + echo "tags=$TAGS" >> $GITHUB_OUTPUT - name: Build and push Docker image (tagged) - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: . push: ${{ github.event_name == 'push' }} platforms: ${{ matrix.config.platform }} - tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}" + tags: ${{ steps.tags.outputs.tags }} file: ${{ matrix.config.dockerfile }} From c88ffbf9baeaae8c2cc0a4f496618314bb2ee9e0 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 27 Jun 2025 15:43:56 +0200 Subject: [PATCH 0011/1348] ci : use selective copy for musa image (#3296) This commit modified the musa docker file to selectively copy directories needed for the container image. This commit also added a step to the docker workflow to free up disk space in attempt to make enough room for the large musa build containers. The motivation for this change is to reduce the size of the container image and try to avoid disk usage issues in CI. --- .devops/main-musa.Dockerfile | 7 ++++--- .github/workflows/docker.yml | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/.devops/main-musa.Dockerfile b/.devops/main-musa.Dockerfile index bbc33993881..0c9be7c3e6e 100644 --- a/.devops/main-musa.Dockerfile +++ b/.devops/main-musa.Dockerfile @@ -32,8 +32,9 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/* -COPY --from=build /app /app -RUN du -sh /app/* -RUN find /app -type f -size +100M +COPY --from=build /app/build/bin /app/build/bin +COPY --from=build /app/samples /app/samples +COPY --from=build /app/models /app/models + ENV PATH=/app/build/bin:$PATH ENTRYPOINT [ "bash", "-c" ] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6cde0510229..0e2fb1f2b9e 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -42,6 +42,21 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Free up disk space + run: | + sudo apt-get remove -y '^dotnet-.*' '^llvm-.*' '^mysql-.*' '^postgresql-.*' + sudo apt-get autoremove -y + sudo apt-get autoclean + + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + + docker system prune -af + + df -h + - name: Generate tags id: tags run: | From 515df20351112d4c2722bd0a9094f02ad1488fb6 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 24 Jun 2025 06:10:16 +0200 Subject: [PATCH 0012/1348] ggml-quants : rename best_mad to best_error (ggml/1283) This commit renames the variable `best_mad` to `best_error` in the `make_qkx2_quants` function. The motivation for this is that the name `best_mad` can be somewhat confusing if mean absolute deviation (MAD) is not in use. --- ggml/src/ggml-quants.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e389a46dbed..9a7d1b22d79 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -568,14 +568,14 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co } float iscale = nmax/(max - min); float scale = 1/iscale; - float best_mad = 0; + float best_error = 0; for (int i = 0; i < n; ++i) { int l = nearest_int(iscale*(x[i] - min)); L[i] = MAX(0, MIN(nmax, l)); float diff = scale * L[i] + min - x[i]; diff = use_mad ? fabsf(diff) : diff * diff; float w = weights[i]; - best_mad += w * diff; + best_error += w * diff; } if (nstep < 1) { *the_min = -min; @@ -601,18 +601,18 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co this_min = 0; this_scale = sum_xl / sum_l2; } - float mad = 0; + float cur_error = 0; for (int i = 0; i < n; ++i) { float diff = this_scale * Laux[i] + this_min - x[i]; diff = use_mad ? fabsf(diff) : diff * diff; float w = weights[i]; - mad += w * diff; + cur_error += w * diff; } - if (mad < best_mad) { + if (cur_error < best_error) { for (int i = 0; i < n; ++i) { L[i] = Laux[i]; } - best_mad = mad; + best_error = cur_error; scale = this_scale; min = this_min; } From 9d7cb80f04eea4c73b6aa6a26e919d76e4e0e3ac Mon Sep 17 00:00:00 2001 From: Acly Date: Tue, 1 Jul 2025 09:11:00 +0200 Subject: [PATCH 0013/1348] ggml-cpu : "align corners" for bilinear upscale/downscale (ggml/1285) * add "align corners" mode for bilinear upscale, and allow downscaling * add ggml_interpolate, deprecate ggml_upscale_ext, pass in align-corners as bit-flag * test-backend-ops: replace ggml_upscale_ext with ggml_interpolate, add test cases for downscale and align-corners --- ggml/include/ggml.h | 22 ++++++++++++++++++++-- ggml/src/ggml-cpu/ops.cpp | 19 ++++++++++++------- ggml/src/ggml.c | 39 ++++++++++++++++++++++++--------------- 3 files changed, 56 insertions(+), 24 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9c4e24023b5..4231a668eb0 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1765,6 +1765,12 @@ extern "C" { enum ggml_scale_mode { GGML_SCALE_MODE_NEAREST = 0, GGML_SCALE_MODE_BILINEAR = 1, + + GGML_SCALE_MODE_COUNT + }; + + enum ggml_scale_flag { + GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8) }; // interpolate @@ -1777,14 +1783,26 @@ extern "C" { // interpolate // interpolate scale to specified dimensions - GGML_API struct ggml_tensor * ggml_upscale_ext( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext( struct ggml_context * ctx, struct ggml_tensor * a, int ne0, int ne1, int ne2, int ne3, - enum ggml_scale_mode mode); + enum ggml_scale_mode mode), + "use ggml_interpolate instead"); + + // Up- or downsamples the input to the specified size. + // 2D scale modes (eg. bilinear) are applied to the first two dimensions. + GGML_API struct ggml_tensor * ggml_interpolate( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...] // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] GGML_API struct ggml_tensor * ggml_pad( diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index eff4a53e344..f6ebb4ea763 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6608,12 +6608,13 @@ static void ggml_compute_forward_upscale_f32( GGML_TENSOR_UNARY_OP_LOCALS - const float sf0 = (float)ne0/src0->ne[0]; - const float sf1 = (float)ne1/src0->ne[1]; - const float sf2 = (float)ne2/src0->ne[2]; - const float sf3 = (float)ne3/src0->ne[3]; + float sf0 = (float)ne0/src0->ne[0]; + float sf1 = (float)ne1/src0->ne[1]; + float sf2 = (float)ne2/src0->ne[2]; + float sf3 = (float)ne3/src0->ne[3]; - const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + const int32_t mode_flags = ggml_get_op_params_i32(dst, 0); + const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF); if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -6634,8 +6635,12 @@ static void ggml_compute_forward_upscale_f32( } } } else if (mode == GGML_SCALE_MODE_BILINEAR) { - // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True - const float pixel_offset = 0.5f; + float pixel_offset = 0.5f; + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { + pixel_offset = 0.0f; + sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1); + sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1); + } for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f8e7c595bce..354a805179c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4247,24 +4247,21 @@ struct ggml_tensor * ggml_pool_2d_back( return result; } -// ggml_upscale +// ggml_upscale / ggml_interpolate -static struct ggml_tensor * ggml_upscale_impl( +static struct ggml_tensor * ggml_interpolate_impl( struct ggml_context * ctx, struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3, - enum ggml_scale_mode mode) { - GGML_ASSERT(a->ne[0] <= ne0); - GGML_ASSERT(a->ne[1] <= ne1); - GGML_ASSERT(a->ne[2] <= ne2); - GGML_ASSERT(a->ne[3] <= ne3); - + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode) { + GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); - ggml_set_op_params_i32(result, 0, mode); + ggml_set_op_params_i32(result, 0, (int32_t)mode); result->op = GGML_OP_UPSCALE; result->src[0] = a; @@ -4277,7 +4274,8 @@ struct ggml_tensor * ggml_upscale( struct ggml_tensor * a, int scale_factor, enum ggml_scale_mode mode) { - return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode); + GGML_ASSERT(scale_factor > 1); + return ggml_interpolate_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode); } struct ggml_tensor * ggml_upscale_ext( @@ -4288,7 +4286,18 @@ struct ggml_tensor * ggml_upscale_ext( int ne2, int ne3, enum ggml_scale_mode mode) { - return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3, mode); + return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); +} + +struct ggml_tensor * ggml_interpolate( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode) { + return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); } // ggml_pad From c1418b9906df2abbee25d673eb78c37564cf30ce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 21 Jun 2025 08:04:18 +0300 Subject: [PATCH 0014/1348] metal : fix thread-safety (llama/14300) ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 88 ++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 4e7f373cb43..19f4d59e597 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,22 +48,28 @@ int mtl_device_ref_count; id mtl_library; + NSLock * mtl_lock; + bool has_simdgroup_reduction; bool has_simdgroup_mm; bool has_residency_sets; bool has_bfloat; bool use_bfloat; + size_t max_size; + char name[128]; } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, /*.has_residency_sets =*/ false, /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, + /*.max_size =*/ 0, /*.name =*/ "", }; @@ -71,6 +77,10 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_device_context * ctx) { assert(ctx != NULL); + if (ctx->mtl_lock == nil) { + ctx->mtl_lock = [[NSLock alloc] init]; + } + if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); } @@ -94,6 +104,8 @@ ctx->use_bfloat = false; #endif + ctx->max_size = ctx->mtl_device.maxBufferLength; + strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } @@ -110,6 +122,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_device_ref_count--; if (ctx->mtl_device_ref_count == 0) { + if (ctx->mtl_lock) { + [ctx->mtl_lock release]; + ctx->mtl_lock = nil; + } + if (ctx->mtl_library) { [ctx->mtl_library release]; ctx->mtl_library = nil; @@ -977,7 +994,7 @@ @implementation GGMLMetalClass struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); struct ggml_backend_metal_device_context * ctx_dev = dev->context; - id device = ggml_backend_metal_device_acq(ctx_dev); + id device = ctx_dev->mtl_device; GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); @@ -991,9 +1008,16 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); // load library - if (ctx_dev->mtl_library == nil) { - ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + { + [ctx_dev->mtl_lock lock]; + + if (ctx_dev->mtl_library == nil) { + ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + } + + [ctx_dev->mtl_lock unlock]; } + id metal_library = ctx_dev->mtl_library; if (metal_library == nil) { GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__); @@ -5284,7 +5308,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } ggml_backend_metal_buffer_rset_free(ctx); - ggml_backend_metal_device_rel(buffer->buft->device->context); if (ctx->owned) { #if TARGET_OS_OSX @@ -5393,7 +5416,10 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba } struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context; - id device = ggml_backend_metal_device_acq(ctx_dev); + + GGML_ASSERT(ctx_dev->mtl_device != nil); + + id device = ctx_dev->mtl_device; ctx->all_data = ggml_metal_host_malloc(size_aligned); ctx->all_size = size_aligned; @@ -5416,14 +5442,12 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); - ggml_backend_metal_device_rel(ctx_dev); return NULL; } if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); free(ctx); - ggml_backend_metal_device_rel(ctx_dev); return NULL; } @@ -5434,17 +5458,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; + GGML_UNUSED(buft); } static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - id device = ggml_backend_metal_device_acq(buft->device->context); - const size_t max_size = device.maxBufferLength; - ggml_backend_metal_device_rel(buft->device->context); + const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; return max_size; - - GGML_UNUSED(buft); } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { @@ -5517,7 +5538,10 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz } struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; - id device = ggml_backend_metal_device_acq(ctx_dev); + + GGML_ASSERT(ctx_dev->mtl_device != nil); + + id device = ctx_dev->mtl_device; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { @@ -5573,7 +5597,6 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); free(ctx); - ggml_backend_metal_device_rel(ctx_dev); return NULL; } @@ -5589,10 +5612,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz } static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_backend_metal_context * ctx = backend->context; - struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + struct ggml_backend_metal_context * ctx = backend->context; - ggml_backend_metal_device_rel(ctx_dev); ggml_metal_free(ctx); free(backend); @@ -5732,6 +5753,8 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + GGML_ASSERT(ctx_dev->mtl_device != nil); + return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } @@ -5751,10 +5774,7 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { } static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) { - // acq/rel just to populate ctx->name in case it hasn't been done yet struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context; - ggml_backend_metal_device_acq(ctx_dev); - ggml_backend_metal_device_rel(ctx_dev); return ctx_dev->name; } @@ -5762,12 +5782,10 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { if (@available(macOS 10.12, iOS 16.0, *)) { struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context; - id device = ggml_backend_metal_device_acq(ctx_dev); + id device = ctx_dev->mtl_device; *total = device.recommendedMaxWorkingSetSize; *free = *total - device.currentAllocatedSize; - - ggml_backend_metal_device_rel(ctx_dev); } else { *free = 1; *total = 1; @@ -5845,7 +5863,10 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back } struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context; - id device = ggml_backend_metal_device_acq(ctx_dev); + + GGML_ASSERT(ctx_dev->mtl_device != nil); + + id device = ctx_dev->mtl_device; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { @@ -5901,7 +5922,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); free(ctx); - ggml_backend_metal_device_rel(ctx_dev); return NULL; } @@ -5915,8 +5935,9 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const } static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || - buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; + return + buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; GGML_UNUSED(dev); } @@ -6001,8 +6022,19 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r /* .get_proc_address = */ ggml_backend_metal_get_proc_address, }; +// called upon program exit +static void ggml_metal_cleanup(void) { + ggml_backend_metal_device_rel(&g_ggml_ctx_dev_main); +} + +// TODO: make thread-safe ggml_backend_reg_t ggml_backend_metal_reg(void) { - // TODO: make this thread-safe somehow? + ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main); + + // register cleanup callback + // TODO: not ideal, but not sure if there is a better way to do this in Objective-C + atexit(ggml_metal_cleanup); + { g_ggml_backend_metal_reg = (struct ggml_backend_reg) { /* .api_version = */ GGML_BACKEND_API_VERSION, From e0366767952a928106f82c4dddaff58520b865ab Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Sat, 21 Jun 2025 08:17:12 +0200 Subject: [PATCH 0015/1348] Add support for VK_EXT_debug_utils to add labels to Vulkan objects. (llama/13792) * Add support for VK_EXT_debug_utils to add labels to Vulkan objects. In step 1 compute pipelines are getting labeled. * remove #ifdef for debug utils and add queue marker. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 58 +++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 1375bfeb9dc..99be5e45b2a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1041,6 +1041,14 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { struct vk_instance_t { vk::Instance instance; + bool debug_utils_support = false; // VK_EXT_debug_utils enabled + PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; + PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; + PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; + PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; + PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; + PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; + std::vector device_indices; vk_device devices[GGML_VK_MAX_DEVICES]; }; @@ -1180,6 +1188,14 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin } pipeline->compiled = true; + if (vk_instance.debug_utils_support) { + vk::DebugUtilsObjectNameInfoEXT duoni; + duoni.objectType = vk::ObjectType::ePipeline; + duoni.pObjectName = pipeline->name.c_str(); + duoni.objectHandle = reinterpret_cast(static_cast(pipeline->pipeline)); + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); + } + { std::lock_guard guard(device->mutex); device->pipelines.insert({ pipeline->name, pipeline }); @@ -3561,6 +3577,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions); static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); +static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); + static void ggml_vk_instance_init() { if (vk_instance_initialized) { return; @@ -3581,7 +3599,7 @@ static void ggml_vk_instance_init() { #ifdef __APPLE__ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); #endif - + const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr; std::vector layers; if (validation_ext) { @@ -3596,6 +3614,9 @@ static void ggml_vk_instance_init() { extensions.push_back("VK_KHR_portability_enumeration"); } #endif + if (debug_utils_ext) { + extensions.push_back("VK_EXT_debug_utils"); + } vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); #ifdef __APPLE__ if (portability_enumeration_ext) { @@ -3619,6 +3640,18 @@ static void ggml_vk_instance_init() { vk_instance.instance = vk::createInstance(instance_create_info); vk_instance_initialized = true; + if (debug_utils_ext) { + vk_instance.debug_utils_support = true; + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT"); + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT"); + + } + + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan @@ -9656,6 +9689,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if (vk_instance.debug_utils_support) { + vk::DebugUtilsLabelEXT dul = {}; + dul.pLabelName = "ggml_backend_vk_graph_compute"; + dul.color = std::array{1.0f, 1.0f, 1.0f, 1.0f}; + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast(&dul)); + } + uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false); @@ -10345,6 +10385,22 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve UNUSED(instance_extensions); } +// Extension availability +static bool ggml_vk_instance_debug_utils_ext_available( + const std::vector & instance_extensions) { + // Check for portability enumeration extension for MoltenVK support + for (const auto & properties : instance_extensions) { + if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) { + return true; + } + } + + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl; + return false; + + UNUSED(instance_extensions); +} + static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: From abb650c0ec77a32a971189bb2d10bbb20e394993 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 22 Jun 2025 12:39:54 +0800 Subject: [PATCH 0016/1348] CUDA: add mean operation (llama/14313) * CUDA: add mean operation * add back sum_rows_f32_cuda * Review: early exit if col!=0 --- ggml/src/ggml-cuda/common.cuh | 20 ++++++++++++++++++++ ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++++ ggml/src/ggml-cuda/mean.cu | 19 +++++++++++++++++++ ggml/src/ggml-cuda/mean.cuh | 3 +++ ggml/src/ggml-cuda/sumrows.cu | 23 +++++------------------ ggml/src/ggml-cuda/sumrows.cuh | 1 - 6 files changed, 52 insertions(+), 19 deletions(-) create mode 100644 ggml/src/ggml-cuda/mean.cu create mode 100644 ggml/src/ggml-cuda/mean.cuh diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 364efcaeccc..2f2fce06770 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -362,6 +362,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #endif // FP16_AVAILABLE } +// Row reduction kernel template - compute sum (norm=false) or mean (norm=true) +template +static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col != 0) { + return; + } + + dst[row] = norm ? sum / ncols : sum; +} + template static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 5bab92e347a..c6bdd4fb302 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -37,6 +37,7 @@ #include "ggml-cuda/ssm-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" +#include "ggml-cuda/mean.cuh" #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" @@ -2357,6 +2358,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; + case GGML_OP_MEAN: + ggml_cuda_op_mean(ctx, dst); + break; case GGML_OP_SSM_CONV: ggml_cuda_op_ssm_conv(ctx, dst); break; @@ -3260,6 +3264,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_POOL_2D: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGSORT: case GGML_OP_ACC: return true; diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu new file mode 100644 index 00000000000..4b238a3998b --- /dev/null +++ b/ggml/src/ggml-cuda/mean.cu @@ -0,0 +1,19 @@ +#include "mean.cuh" + +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + reduce_rows_f32<<>>(src0_d, dst_d, ncols); +} diff --git a/ggml/src/ggml-cuda/mean.cuh b/ggml/src/ggml-cuda/mean.cuh new file mode 100644 index 00000000000..2b9b1043343 --- /dev/null +++ b/ggml/src/ggml-cuda/mean.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 38dbf1b5e1f..2eee08fa073 100644 --- a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -1,25 +1,9 @@ #include "sumrows.cuh" -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<>>(x, dst, ncols); + reduce_rows_f32<<>>(x, dst, ncols); } void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ncols = src0->ne[0]; const int64_t nrows = ggml_nrows(src0); - sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + + reduce_rows_f32<<>>(src0_d, dst_d, ncols); } diff --git a/ggml/src/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh index 191db1c1316..3431c599b1b 100644 --- a/ggml/src/ggml-cuda/sumrows.cuh +++ b/ggml/src/ggml-cuda/sumrows.cuh @@ -1,5 +1,4 @@ #include "common.cuh" void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); - void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); From 791201a974a8ced41afdb5728c175114aa29eaad Mon Sep 17 00:00:00 2001 From: uvos Date: Sun, 22 Jun 2025 16:51:23 +0200 Subject: [PATCH 0017/1348] HIP: enable vec fattn on RDNA4 (llama/14323) --- ggml/src/ggml-cuda/common.cuh | 14 ++++++++++++-- ggml/src/ggml-cuda/ggml-cuda.cu | 7 ++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 2f2fce06770..86c4d29a5d2 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -241,8 +241,18 @@ static bool fp16_mma_available(const int cc) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) return false; #else - return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) { + return true; + } else if (GGML_CUDA_CC_IS_RDNA4(cc)) { +#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) + return true; +#else + return false; +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) + } else { + return false; + } #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c6bdd4fb302..462db71e1a6 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -100,8 +100,7 @@ int ggml_cuda_get_device() { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); cudaError_t err; - if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) - { + if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) { err = cudaMallocManaged(ptr, size); #if defined(GGML_USE_HIP) if (err == hipSuccess) { @@ -119,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) err = cudaMalloc(ptr, size); } #endif // defined(GGML_USE_HIP) - } - else - { + } else { err = cudaMalloc(ptr, size); } return err; From 06b01ba87b4198f0520fdae9b7dff00824ec6df1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 23 Jun 2025 13:11:31 +0200 Subject: [PATCH 0018/1348] CUDA: mul_mat_v support for batch sizes > 1 (llama/14262) * CUDA: mul_mat_v support for batch sizes > 1 * use 64 bit math for initial offset calculation --- ggml/src/ggml-cuda/common.cuh | 4 + ggml/src/ggml-cuda/ggml-cuda.cu | 24 ++- ggml/src/ggml-cuda/mmv.cu | 326 +++++++++++++++++++++++--------- ggml/src/ggml-cuda/mmv.cuh | 5 +- 4 files changed, 256 insertions(+), 103 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 86c4d29a5d2..1369bc2d9e5 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -262,6 +262,10 @@ static bool fp16_mma_hardware_available(const int cc) { GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); } +static bool bf16_mma_hardware_available(const int cc) { + return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE; +} + // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. static bool new_mma_available(const int cc) { return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 462db71e1a6..b3e6833c396 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1943,16 +1943,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; - bool any_gpus_with_slow_fp16 = false; - bool any_gpus_without_fp16_mma = false; + bool any_gpus_with_slow_fp16 = false; if (split) { ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; @@ -1963,16 +1961,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor continue; } - const int cc = ggml_cuda_info().devices[id].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); - any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc); + const int cc = ggml_cuda_info().devices[id].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } } else { - const int cc = ggml_cuda_info().devices[ctx.device].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); - any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc); + const int cc = ggml_cuda_info().devices[ctx.device].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } // debug helpers @@ -1983,7 +1981,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index d8c385e2399..1502e9d942f 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -2,25 +2,26 @@ #include "common.cuh" #include "mmv.cuh" -template +template static __global__ void mul_mat_vec( const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, - const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, - const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, - const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel_dst = blockIdx.y; - const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; - const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst; - const int64_t sample_dst = blockIdx.z; - const int64_t sample_x = sample_dst / sample_ratio; - const int64_t sample_y = sample_dst; - const int tid = threadIdx.x; + const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const int row = blockIdx.x; + const int channel_dst = blockIdx.y; + const int channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ids ? channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; - y += sample_y *stride_sample_y + channel_y *stride_channel_y; - dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; + x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; + dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -34,81 +35,108 @@ static __global__ void mul_mat_vec( __syncthreads(); } - float sumf = 0.0f; + float sumf[ncols_dst] = {0.0f}; if constexpr (std::is_same::value) { const float2 * x2 = (const float2 *) x; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = x2[col2]; - const float2 tmpy = y2[col2]; - sumf += tmpx.x*tmpy.x; - sumf += tmpx.y*tmpy.y; + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += tmpx.x*tmpy.x; + sumf[j] += tmpx.y*tmpy.y; + } } } else if constexpr (std::is_same::value) { const half2 * x2 = (const half2 *) x; if (std::is_same::value) { - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); - const float2 tmpy = y2[col2]; - sumf += tmpx.x * tmpy.x; - sumf += tmpx.y * tmpy.y; + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += tmpx.x * tmpy.x; + sumf[j] += tmpx.y * tmpy.y; + } } } else { #ifdef FP16_AVAILABLE - half2 sumh2 = make_half2(0.0f, 0.0f); + half2 sumh2[ncols_dst] = {{0.0f, 0.0f}}; + + for (int col2 = tid; col2 < ncols2; col2 += block_size) { + const half2 tmpx = x2[col2]; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const float2 tmp = y2[col2]; - sumh2 += x2[col2] * make_half2(tmp.x, tmp.y); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y); + } } - sumf = __low2float(sumh2) + __high2float(sumh2); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]); + } #else NO_DEVICE_CODE; #endif // FP16_AVAILABLE } } else if constexpr (std::is_same::value) { const int * x2 = (const int *) x; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const int tmpx = x2[col2]; - const float2 tmpy = y2[col2]; - sumf += float(reinterpret_cast(&tmpx)[0]) * tmpy.x; - sumf += float(reinterpret_cast(&tmpx)[1]) * tmpy.y; + for (int col2 = tid; col2 < ncols2; col2 += block_size) { + const int tmpx = x2[col2]; +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += float(reinterpret_cast(&tmpx)[0]) * tmpy.x; + sumf[j] += float(reinterpret_cast(&tmpx)[1]) * tmpy.y; + } } } else { static_assert(std::is_same::value, "unsupported type"); } - sumf = warp_reduce_sum(sumf); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + sumf[j] = warp_reduce_sum(sumf[j]); - if (block_size > warp_size) { - buf_iw[tid/warp_size] = sumf; - __syncthreads(); - if (tid >= warp_size) { - return; + if (block_size > warp_size) { + buf_iw[tid/warp_size] = sumf[j]; + __syncthreads(); + if (tid < warp_size) { + sumf[j] = buf_iw[tid]; + sumf[j] = warp_reduce_sum(sumf[j]); + } + if (j < ncols_dst) { + __syncthreads(); + } } - sumf = buf_iw[tid]; - sumf = warp_reduce_sum(sumf); } - if (tid != 0) { + if (tid >= ncols_dst) { return; } - dst[row] = sumf; + dst[tid*stride_col_dst + row] = sumf[tid]; } -template +template static void launch_mul_mat_vec_cuda( const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t ncols, const int64_t nrows, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { - GGML_ASSERT(ncols % 2 == 0); - GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(ncols % 2 == 0); + GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(stride_col_y % 2 == 0); GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); GGML_ASSERT( nsamples_dst % nsamples_x == 0); const int64_t channel_ratio = nchannels_dst / nchannels_x; @@ -138,44 +166,52 @@ static void launch_mul_mat_vec_cuda( const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 64: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 160: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -183,23 +219,91 @@ static void launch_mul_mat_vec_cuda( } } +template +static void mul_mat_vec_cuda_switch_ncols_dst( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + switch (ncols_dst) { + case 1: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 2: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 3: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 4: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 5: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 6: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 7: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 8: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + default: + GGML_ABORT("fatal error"); + break; + } +} + template static void mul_mat_vec_cuda( const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { if constexpr(std::is_same::value) { if (prec == GGML_PREC_DEFAULT) { - launch_mul_mat_vec_cuda - (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + mul_mat_vec_cuda_switch_ncols_dst + (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); return; } } - launch_mul_mat_vec_cuda - (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + mul_mat_vec_cuda_switch_ncols_dst + (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } @@ -246,24 +350,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? s11 : s12; - GGML_ASSERT(ncols_dst == 1); + GGML_ASSERT(!ids || ncols_dst == 1); switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; @@ -282,16 +386,19 @@ void ggml_cuda_op_mul_mat_vec( GGML_ASSERT(dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; - GGML_ASSERT(src1_ncols == 1); - - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; // ggml_cuda_op provides single, contiguous matrices const int64_t stride_row = ne00; + const int64_t stride_col_y = ne10; + const int64_t stride_col_dst = id == ctx.device ? ne0 : row_diff; // main device has larger memory buffer const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; const int64_t nchannels_dst = 1; @@ -307,19 +414,19 @@ void ggml_cuda_op_mul_mat_vec( switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; @@ -334,3 +441,48 @@ void ggml_cuda_op_mul_mat_vec( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) { + if (src0_ne[0] % 2 != 0) { + return false; + } + switch (type) { + case GGML_TYPE_F32: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return ne11 <= 8; + } + if (cc >= GGML_CUDA_CC_TURING) { + return ne11 <= 4; + } + return ne11 <= 3; + } + return ne11 <= 8; + case GGML_TYPE_F16: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1); + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return src0_small && ne11 <= 4; + } + if (fp16_mma_hardware_available(cc)) { + return src0_small && ne11 <= 3; + } + return ne11 <= 8; + } + return ne11 <= 8; + case GGML_TYPE_BF16: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1); + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return src0_small && ne11 <= 4; + } + if (bf16_mma_hardware_available(cc)) { + return src0_small && ne11 <= 3; + } + return ne11 <= 8; + } + return ne11 <= 8; + default: + return false; + } +} diff --git a/ggml/src/ggml-cuda/mmv.cuh b/ggml/src/ggml-cuda/mmv.cuh index 756e7e1cc7f..1330bcb6a88 100644 --- a/ggml/src/ggml-cuda/mmv.cuh +++ b/ggml/src/ggml-cuda/mmv.cuh @@ -1,8 +1,5 @@ #include "common.cuh" -// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available -#define MMV_MAX_ROWS 512 - void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( @@ -10,3 +7,5 @@ void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11); From 6a1d12a8ea0eafc63635ceba0576471e36eae05c Mon Sep 17 00:00:00 2001 From: uvos Date: Tue, 24 Jun 2025 01:12:56 +0200 Subject: [PATCH 0019/1348] CUDA/HIP: optimize mmv paths taken for HIP devices (llama/14324) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 6 +++++- ggml/src/ggml-cuda/mmv.cu | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1369bc2d9e5..f6127aeee42 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) { } static bool bf16_mma_hardware_available(const int cc) { - return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE; + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; +} + +static bool fp32_mma_hardware_available(const int cc) { + return GGML_CUDA_CC_IS_CDNA(cc); } // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index 1502e9d942f..e14c93516bd 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -456,6 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return ne11 <= 4; } return ne11 <= 3; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp32_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; } return ne11 <= 8; case GGML_TYPE_F16: @@ -468,6 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 5; + } + return ne11 <= 2; + } + return ne11 <= 8; } return ne11 <= 8; case GGML_TYPE_BF16: @@ -480,6 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (bf16_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; } return ne11 <= 8; default: From acfbf2921badff57bf5d4099baaa1052992db941 Mon Sep 17 00:00:00 2001 From: lhez Date: Tue, 24 Jun 2025 11:46:25 -0700 Subject: [PATCH 0020/1348] opencl: ref count `ggml_backend_opencl_context` and refactor profiling (llama/14254) * Move profiling info into `ggml_backend_opencl_context` * Add `enqueue_ndrange_kernel` to launch kernel --- ggml/src/ggml-opencl/ggml-opencl.cpp | 775 +++++++++------------------ 1 file changed, 240 insertions(+), 535 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 628e574f0f7..96e8a8588dc 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive return { type, major, minor, patch }; } +// Profiling +struct ProfilingInfo { + std::string op_name; + std::string kernel_name; + + cl_kernel kernel; + cl_event evt; + + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong overhead_start; + cl_ulong overhead_end; + // For the times below, see spec for clGetEventProfilingInfo + // The time kernel spent in cmd queue - SUBMIT - QUEUED + cl_ulong cmd_queued_duration_ns; + // The time kernel spent for submission - START - SUBMIT + cl_ulong cmd_submit_duration_ns; + // Kernel execution time in nanoseconds - END - START + cl_ulong cmd_duration_ns; + // The time for the kernel to complete - COMPLETE - END + cl_ulong cmd_complete_duration_ns; + // Total time to finish the kernel - COMPELTE - QUEUED + cl_ulong cmd_total_duration_ns; + // Global and local work sizes. + size_t global_size[3]; + size_t local_size[3]; + // Op output size. + size_t output_size[4]; +}; + +static void populateProfilingInfo( + ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim, + size_t global_size[3], size_t local_size[3], + const ggml_tensor * tensor) { + info.op_name = tensor->name; + info.kernel = kernel; + info.evt = evt; + + // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose + info.local_size[0] = 0; + info.local_size[1] = 0; + info.local_size[2] = 0; + + info.global_size[0] = 0; + info.global_size[1] = 0; + info.global_size[2] = 0; + + if (local_size) { + for (cl_uint i = 0; i < work_dim; ++i) { + info.local_size[i] = local_size[i]; + } + } + + for (cl_uint i = 0; i < work_dim; ++i) { + info.global_size[i] = global_size[i]; + } + + info.output_size[0] = tensor->ne[0]; + info.output_size[1] = tensor->ne[1]; + info.output_size[2] = tensor->ne[2]; + info.output_size[3] = tensor->ne[3]; +} + struct ggml_backend_opencl_context; // backend device context @@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context { // backend context struct ggml_backend_opencl_context { + int ref_count; + cl_device_id device; std::string device_name; @@ -369,6 +436,108 @@ struct ggml_backend_opencl_context { cl_kernel kernel_timestep_embedding; cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat; + std::vector profiling_info; + + void write_profiling_info() { + FILE * fperf = fopen("cl_profiling.csv", "w"); + if (!fperf) { + GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); + return; + } + + // Populate profiling info + for (ProfilingInfo & info : profiling_info) { + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong cmd_complete; + + CL_CHECK(clWaitForEvents(1, &info.evt)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); + CL_CHECK(clReleaseEvent(info.evt)); + + char kernel_name[512]; + CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, + sizeof(kernel_name), kernel_name, NULL)); + info.kernel_name = kernel_name; + + info.cmd_queued = cmd_queued; + info.cmd_submit = cmd_submit; + info.cmd_start = cmd_start; + info.cmd_end = cmd_end; + + info.cmd_queued_duration_ns = cmd_submit - cmd_queued; + info.cmd_submit_duration_ns = cmd_start - cmd_submit; + info.cmd_duration_ns = cmd_end - cmd_start; + info.cmd_complete_duration_ns = cmd_complete - cmd_end; + info.cmd_total_duration_ns = cmd_complete - cmd_queued; + } + + // Dump a csv + float total_kernel_time = 0; + fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); + for (const ProfilingInfo & info : profiling_info) { + total_kernel_time += info.cmd_duration_ns/1.e6f; + fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", + info.op_name.c_str(), info.kernel_name.c_str(), + info.cmd_queued_duration_ns/1.e6f, + info.cmd_submit_duration_ns/1.e6f, + info.cmd_duration_ns/1.e6f, + info.cmd_complete_duration_ns/1.e6f, + info.cmd_total_duration_ns/1.e6f, + info.global_size[0], info.global_size[1], info.global_size[2], + info.local_size[0], info.local_size[1], info.local_size[2], + info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); + } + fclose(fperf); + + GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); + + // Dump a simple chrome trace + FILE* ftrace = fopen("cl_trace.json", "w"); + if (!ftrace) { + GGML_LOG_ERROR("Failed to open cl_trace.json\n"); + return; + } + + fprintf(ftrace, "[\n"); + for (const ProfilingInfo & info : profiling_info) { + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_queued/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_submit/1000); + + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_start/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_end/1000); + } + fclose(ftrace); + } + + void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + profiling_info.emplace_back(); + populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); +#else + GGML_UNUSED(tensor); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } + #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Transpose kernels cl_program program_transpose; @@ -395,46 +564,19 @@ struct ggml_backend_opencl_context { cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; #endif // GGML_OPENCL_USE_ADRENO_KERNELS -}; -// All registered devices with a default device in the front. -static std::vector g_ggml_backend_opencl_devices; - -// Profiling + void free() { + ref_count--; + if (ref_count == 0) { #ifdef GGML_OPENCL_PROFILING -struct ProfilingInfo { - std::string op_name; - std::string kernel_name; - - cl_kernel kernel; - cl_event evt; - - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong overhead_start; - cl_ulong overhead_end; - // For the times below, see spec for clGetEventProfilingInfo - // The time kernel spent in cmd queue - SUBMIT - QUEUED - cl_ulong cmd_queued_duration_ns; - // The time kernel spent for submission - START - SUBMIT - cl_ulong cmd_submit_duration_ns; - // Kernel execution time in nanoseconds - END - START - cl_ulong cmd_duration_ns; - // The time for the kernel to complete - COMPLETE - END - cl_ulong cmd_complete_duration_ns; - // Total time to finish the kernel - COMPELTE - QUEUED - cl_ulong cmd_total_duration_ns; - // Global and local work sizes. - size_t global_size[3]; - size_t local_size[3]; - // Op output size. - size_t output_size[4]; + write_profiling_info(); +#endif + } + } }; -std::vector g_profiling_info; -#endif +// All registered devices with a default device in the front. +static std::vector g_ggml_backend_opencl_devices; inline std::string read_file(const std::string &path) { std::ifstream ifs(path); @@ -1669,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->device = dev_ctx->device; backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + // ref_count get increased in ggml_backend_opencl_device_init + // This function is also used to retrieve backend context, so we don't want + // to increase ref_count for each call. We only want to increase ref_count + // when the associated device is initialized + backend_ctx->ref_count = 0; + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || strstr(dev_ctx->device_name.c_str(), "Qualcomm") || strstr(dev_ctx->device_version.c_str(), "Adreno")) { @@ -1841,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { return dev_ctx->backend_ctx; } -static void ggml_cl2_free(void) { -#ifdef GGML_OPENCL_PROFILING - FILE * fperf = fopen("cl_profiling.csv", "w"); - if (!fperf) { - GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); - return; - } +static void ggml_cl2_free(ggml_backend_t backend) { + ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context; + ctx->free(); - // Populate profiling info - for (ProfilingInfo & info : g_profiling_info) { - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong cmd_complete; - - CL_CHECK(clWaitForEvents(1, &info.evt)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); - CL_CHECK(clReleaseEvent(info.evt)); - - char kernel_name[512]; - CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, - sizeof(kernel_name), kernel_name, NULL)); - info.kernel_name = kernel_name; - - info.cmd_queued = cmd_queued; - info.cmd_submit = cmd_submit; - info.cmd_start = cmd_start; - info.cmd_end = cmd_end; - - info.cmd_queued_duration_ns = cmd_submit - cmd_queued; - info.cmd_submit_duration_ns = cmd_start - cmd_submit; - info.cmd_duration_ns = cmd_end - cmd_start; - info.cmd_complete_duration_ns = cmd_complete - cmd_end; - info.cmd_total_duration_ns = cmd_complete - cmd_queued; - } - - // Dump a csv - float total_kernel_time = 0; - fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); - for (const ProfilingInfo & info : g_profiling_info) { - total_kernel_time += info.cmd_duration_ns/1.e6f; - fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", - info.op_name.c_str(), info.kernel_name.c_str(), - info.cmd_queued_duration_ns/1.e6f, - info.cmd_submit_duration_ns/1.e6f, - info.cmd_duration_ns/1.e6f, - info.cmd_complete_duration_ns/1.e6f, - info.cmd_total_duration_ns/1.e6f, - info.global_size[0], info.global_size[1], info.global_size[2], - info.local_size[0], info.local_size[1], info.local_size[2], - info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); - } - fclose(fperf); - - GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); - - // Dump a simple chrome trace - FILE* ftrace = fopen("cl_trace.json", "w"); - if (!ftrace) { - GGML_LOG_ERROR("Failed to open cl_trace.json\n"); - return; + // The CL context is shared by all backends, release it if all backends have been released + bool should_release_opencl = true; + for (auto device : g_ggml_backend_opencl_devices) { + ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context; + if (ctx_dev->backend_ctx->ref_count > 0) { + should_release_opencl = false; + } } - fprintf(ftrace, "[\n"); - for (const ProfilingInfo & info : g_profiling_info) { - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", - info.kernel_name.c_str(), info.cmd_queued/1000); - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", - info.kernel_name.c_str(), info.cmd_submit/1000); - - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", - info.kernel_name.c_str(), info.cmd_start/1000); - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", - info.kernel_name.c_str(), info.cmd_end/1000); + if (should_release_opencl) { + CL_CHECK(clReleaseContext(ctx->context)); } - fclose(ftrace); -#endif } //------------------------------------------------------------------------------ @@ -2011,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) { } static void ggml_backend_opencl_free(ggml_backend_t backend) { - ggml_cl2_free(); - - GGML_UNUSED(backend); + ggml_cl2_free(backend); } static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -2899,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); + // Getting a new reference to the backend, increase ref_count + backend_ctx->ref_count++; ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_opencl_guid(), @@ -3159,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso #define dump_tensor(tensor) #endif -//------------------------------------------------------------------------------ -// Profiling utility -//------------------------------------------------------------------------------ -#ifdef GGML_OPENCL_PROFILING -static void populateProfilingInfo( - ProfilingInfo& info, cl_event evt, cl_kernel kernel, - size_t global_size[3], size_t local_size[3], - const ggml_tensor * tensor) { - info.op_name = tensor->name; - info.kernel = kernel; - info.evt = evt; - - info.local_size[0] = local_size[0]; - info.local_size[1] = local_size[1]; - info.local_size[2] = local_size[2]; - info.global_size[0] = global_size[0]; - info.global_size[1] = global_size[1]; - info.global_size[2] = global_size[2]; - info.output_size[0] = tensor->ne[0]; - info.output_size[1] = tensor->ne[1]; - info.output_size[2] = tensor->ne[2]; - info.output_size[3] = tensor->ne[3]; -} -#endif - //------------------------------------------------------------------------------ // Ops //------------------------------------------------------------------------------ @@ -3227,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c const cl_ulong nb2 = dst ? dst->nb[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3271,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; size_t local_work_size[] = {1, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3321,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3396,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3461,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3536,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3598,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3661,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3723,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3786,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3821,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3848,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3868,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3895,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3915,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3947,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3967,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3992,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4012,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4044,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4064,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4096,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4116,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4157,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4177,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; //ggml_backend_opencl_device_context * dev_ctx = // (ggml_backend_opencl_device_context *)backend->device->context; @@ -4241,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c // This is local memory - the size depends on subgroup size. CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4261,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4300,15 +4202,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; size_t local_work_size[] = {(size_t)sgs, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4320,7 +4214,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4397,16 +4290,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const } if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return; - -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { @@ -4419,7 +4303,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con UNUSED(src1_shape_def); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; if (backend_ctx->kernel_repeat == nullptr) { GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__); @@ -4467,15 +4350,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = { gws0, gws1, gws2 }; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { @@ -4488,7 +4363,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; if (backend_ctx->kernel_pad == nullptr) { GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__); @@ -4533,15 +4407,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t local_work_size_ptr = nullptr; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { @@ -4553,7 +4419,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg GGML_ASSERT(dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); cl_kernel kernel = nullptr; @@ -4644,17 +4509,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg local_work_size_ptr = nullptr; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - size_t profiling_gws[3] = {global_work_size[0], 1, 1}; - size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1}; - populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4732,7 +4587,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con global_work_size[1] = d_ne1; global_work_size[2] = d_ne2; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } } } else { @@ -4782,7 +4637,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con d_ne2 > 0 ? (size_t)d_ne2 : 1, d_ne3 > 0 ? (size_t)d_ne3 : 1 }; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst); } } @@ -4795,7 +4650,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor GGML_ASSERT(dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; if (backend_ctx->kernel_timestep_embedding == nullptr) { GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__); @@ -4828,17 +4682,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor size_t global_work_size[] = {gws0, gws1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem - - g_profiling_info.emplace_back(); - size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1}; - size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS - populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4853,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -5058,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co static_cast(padded_height_B) }; - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst); - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); } else { // no need to transpose B in other cases // create an image for B from sub_buffer @@ -5188,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co // enqueue kernel with profiling // <--------------------------------------------> // - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); - // enqueue kernel without profiling - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); // <--------------------------------------------> // // deallocate sub buffers and images @@ -5277,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co global_work_size[2] = (size_t)ne12*ne13; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } #else // GGML_OPENCL_SOA_Q @@ -5515,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q4_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q3_K) { @@ -5534,30 +5344,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { int64_t ny = (ne11 + nrows - 1)/nrows; size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -5574,7 +5368,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, GGML_ASSERT(src2->extra); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra; @@ -5680,15 +5473,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123}; size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5701,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; float scale; memcpy(&scale, dst->op_params, sizeof(scale)); @@ -5730,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5775,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -5840,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); } static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5871,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr const int ne02 = src0 ? src0->ne[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5895,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { kernel = backend_ctx->kernel_diag_mask_inf; @@ -5923,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } } @@ -5951,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c } ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6031,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6051,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const GGML_ASSERT(dst->extra); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -6217,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6240,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6309,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC}; size_t local_work_size[] = {256, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6332,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6364,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1}; size_t local_work_size[] = {(size_t)ne00_padded, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6386,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6427,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } //------------------------------------------------------------------------------ From fc2859411283a9e00bc94594f8f03267423fe350 Mon Sep 17 00:00:00 2001 From: Anton Mitkov Date: Wed, 25 Jun 2025 17:09:55 +0100 Subject: [PATCH 0021/1348] sycl: GGML_SYCL_DISABLE_OPT on by default for all Intel Devices (llama/13973) --- ggml/src/ggml-sycl/common.hpp | 25 +------------------------ ggml/src/ggml-sycl/ggml-sycl.cpp | 6 ++---- ggml/src/ggml-sycl/sycl_hw.cpp | 4 +++- ggml/src/ggml-sycl/sycl_hw.hpp | 3 +++ 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 753b4af1436..4e7449d06ec 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -199,7 +199,7 @@ struct sycl_device_info { // size_t smpb; // max. shared memory per block bool vmm; // virtual memory support size_t total_vram; - sycl_hw_info hw_info; + //sycl_hw_info hw_info; \\ device id and aarch, currently not used optimize_feature opt_feature; }; @@ -286,29 +286,6 @@ struct ggml_tensor_extra_gpu { void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); -inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { - optimize_feature opt; - - opt.reorder = - (arch == syclex::architecture::intel_gpu_dg1 || - arch == syclex::architecture::intel_gpu_acm_g10 || - arch == syclex::architecture::intel_gpu_acm_g11 || - arch == syclex::architecture::intel_gpu_acm_g12 || - arch == syclex::architecture::intel_gpu_pvc || - arch == syclex::architecture::intel_gpu_pvc_vg || - arch == syclex::architecture::intel_gpu_mtl_u || - arch == syclex::architecture::intel_gpu_mtl_s || - arch == syclex::architecture::intel_gpu_mtl_h || - arch == syclex::architecture::intel_gpu_arl_u || - arch == syclex::architecture::intel_gpu_arl_s || - arch == syclex::architecture::intel_gpu_arl_h || - arch == syclex::architecture::intel_gpu_bmg_g21 || - arch == syclex::architecture::intel_gpu_lnl_m - ); - - return opt; -} - namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f25a96a625c..9cb36ae99e7 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -83,9 +83,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - info.devices[i].hw_info = get_device_hw_info(&device); - info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch); - + info.devices[i].opt_feature.reorder = !device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); } @@ -195,7 +193,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); + g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); diff --git a/ggml/src/ggml-sycl/sycl_hw.cpp b/ggml/src/ggml-sycl/sycl_hw.cpp index da121ffc261..7041140034b 100644 --- a/ggml/src/ggml-sycl/sycl_hw.cpp +++ b/ggml/src/ggml-sycl/sycl_hw.cpp @@ -1,6 +1,7 @@ #include "sycl_hw.hpp" - +// TODO: currently not used +/* sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { sycl_hw_info res; int32_t id = device_ptr->get_info(); @@ -11,3 +12,4 @@ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { return res; } +*/ diff --git a/ggml/src/ggml-sycl/sycl_hw.hpp b/ggml/src/ggml-sycl/sycl_hw.hpp index bf689450ce6..36b140bf037 100644 --- a/ggml/src/ggml-sycl/sycl_hw.hpp +++ b/ggml/src/ggml-sycl/sycl_hw.hpp @@ -10,6 +10,8 @@ namespace syclex = sycl::ext::oneapi::experimental; +// TODO: currently not used +/* struct sycl_hw_info { syclex::architecture arch; int32_t device_id; @@ -18,6 +20,7 @@ struct sycl_hw_info { bool is_in_vector(std::vector &vec, int item); sycl_hw_info get_device_hw_info(sycl::device *device_ptr); +*/ #endif // SYCL_HW_HPP From 99764f57670202b7fca72d908b36092fe0ea382d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 25 Jun 2025 23:26:51 +0200 Subject: [PATCH 0022/1348] ggml : do not output unprintable characters on GGUF load failure (llama/14381) --- ggml/src/gguf.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index a0a318a29f5..5ffd12b8b27 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par for (uint32_t i = 0; i < magic.size(); i++) { if (magic[i] != GGUF_MAGIC[i]) { - GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]); + char c0 = isprint(magic[0]) ? magic[0] : '?'; + char c1 = isprint(magic[1]) ? magic[1] : '?'; + char c2 = isprint(magic[2]) ? magic[2] : '?'; + char c3 = isprint(magic[3]) ? magic[3] : '?'; + GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3); gguf_free(ctx); return nullptr; } From 16aa7d151dfe18f1a6ec8ee48eeb29de2dc344c1 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 26 Jun 2025 05:49:04 +0800 Subject: [PATCH 0023/1348] ggml-cpu: enable IBM NNPA Vector Intrinsics (llama/14317) * ggml-cpu: add nnpa compile flag Signed-off-by: Aaron Teo (cherry picked from commit 4a9f60c201573128f73a65999b3e5cc497fae5c1) * ggml-cpu: add fp16->fp32 nnpa first Signed-off-by: Aaron Teo (cherry picked from commit 8d4a7987f9c1887f716be96250f2caeee0253929) * ggml-cpu: add fp32->fp16 Signed-off-by: Aaron Teo (cherry picked from commit 0ff0d6516247a41d2ade42b42cf0d676a4dd1627) * ggml-cpu: better variable names Signed-off-by: Aaron Teo (cherry picked from commit 2f58bbcbb89c183340e252362b2a40651f573f1f) * docs: update s390x docs Signed-off-by: Aaron Teo (cherry picked from commit 01b929491b50071a5d0572235dcf5a449da70aa7) * ggml-cpu: add debugging prints to see if dlf16 is correct Signed-off-by: Aaron Teo * ggml-cpu: fix print vs printf Signed-off-by: Aaron Teo * ggml-cpu: fix float placeholder Signed-off-by: Aaron Teo * ggml-cpu: ensure fp16 and fp32 load and stores are called Signed-off-by: Aaron Teo * ggml-cpu: fp16 load ensured to hit Signed-off-by: Aaron Teo * ggml-cpu: remove sigint from fp16 store for some reason, the function is not getting a hit when debugged with gdb. we will need to investigate further Signed-off-by: Aaron Teo * ggml-cpu: activate nnpa for ggml_cpu_fp16_to_fp32 Signed-off-by: Aaron Teo * ggml-cpu: nnpa activate ggml_cpu_fp16_to_fp32 for 8 elements Signed-off-by: Aaron Teo * ggml-cpu: nnpa switch to vec_xst test Signed-off-by: Aaron Teo * ggml-cpu: switch to vec_xst for 4 element loops also Signed-off-by: Aaron Teo * ggml-cpu: rework noop Signed-off-by: Aaron Teo * ggml-cpu: remove noop, general code cleanup Signed-off-by: Aaron Teo * ggml-cpu: clarify variable naming Signed-off-by: Aaron Teo * ggml-cpu: activate nnpa for ggml_cpu_fp32_to_fp16 Signed-off-by: Aaron Teo * ggml-cpu: add breakpoint for debugging Signed-off-by: Aaron Teo * ggml-cpu: test fix for conversion failure Signed-off-by: Aaron Teo * ggml-cpu: disable fp32->fp16 nnpa conversions for now there are some conversion failures in nnpa that requires the eyes of an ibm stsm. will create a separate pr to introduce the fp32->fp16 change. Signed-off-by: Aaron Teo * ggml-cpu: switch to elif macro Signed-off-by: Aaron Teo * ggml-cpu: reattempt fp32->fp16 Signed-off-by: Aaron Teo * ggml-cpu: fix typo Signed-off-by: Aaron Teo * ggml-cpu: reattempt fp32->fp16 Signed-off-by: Aaron Teo * ggml-cpu: fix compiler types Signed-off-by: Aaron Teo * ggml-cpu: change to typedef vector types Signed-off-by: Aaron Teo * ggml-cpu: add 4 element loops for fp32->fp16 Signed-off-by: Aaron Teo * ggml-cpu: clarified vector naming Signed-off-by: Aaron Teo * ggml-cpu: bring back fp32->fp16 store nnpa Signed-off-by: Aaron Teo * ggml-cpu: activate nnpa fp32->fp16 or fp16->fp32 compute Signed-off-by: Aaron Teo * ggml-cpu: add nnpa macro check in ggml-impl Signed-off-by: Aaron Teo * ggml-cpu: add missing __func__ Signed-off-by: Aaron Teo * ggml-cpu: diagnose why __NNPA__ macro is not being defined Signed-off-by: Aaron Teo * ggml-cpu: import vecintrin.h to fix compiler errors Signed-off-by: Aaron Teo * ggml-cpu: update macro tests Signed-off-by: Aaron Teo * ggml-cpu: move s390x typedef to own header file Signed-off-by: Aaron Teo * Revert "ggml-cpu: move s390x typedef to own header file" This reverts commit 157f856c34589566151630e294563a420702db39. Signed-off-by: Aaron Teo * ggml-cpu: switch to importing ggml-cpu-impl instead Signed-off-by: Aaron Teo * ggml-cpu: fix macro declaration Signed-off-by: Aaron Teo * ggml-cpu: test more macros Signed-off-by: Aaron Teo * ggml-cpu: add debug prints Signed-off-by: Aaron Teo * ggml-cpu: bruteforce macro definitions Signed-off-by: Aaron Teo * ggml-cpu: move macro definitions Signed-off-by: Aaron Teo * ggml-cpu: add ggml-impl.h to cmakelists Signed-off-by: Aaron Teo * ggml-cpu: switch to private macros Signed-off-by: Aaron Teo * ggml-cpu: move s390x typedef to own header file Signed-off-by: Aaron Teo (cherry picked from commit 157f856c34589566151630e294563a420702db39) * ggml-cpu: move things around Signed-off-by: Aaron Teo * ggml-cpu: bring back compile macros Signed-off-by: Aaron Teo * ggml-cpu: switch to quotes for import Signed-off-by: Aaron Teo * ggml-cpu: add compiler error macro Signed-off-by: Aaron Teo * ggml-cpu: add s390x detection in ggml-src Signed-off-by: Aaron Teo * ggml-cpu: bring back compile definitions Signed-off-by: Aaron Teo * ggml-cpu: undo cmakelists work Signed-off-by: Aaron Teo * Revert "ggml-cpu: move s390x typedef to own header file" This reverts commit 18d79e1a30b39d9aaa0bd58400c5cf2c32135c9a. Signed-off-by: Aaron Teo * ggml-cpu: remove typedefs.h Signed-off-by: Aaron Teo * ggml-cpu: remove typedef from cmakelists Signed-off-by: Aaron Teo * ggml-cpu: add ggml-impl.h future notes Signed-off-by: Aaron Teo * ggml-cpu: add todo comment for future reference Signed-off-by: Aaron Teo * ggml-cpu: clarify naming of dlf16 Signed-off-by: Aaron Teo * ggml-cpu: remove unnecessary target compile definitions Signed-off-by: Aaron Teo * ggml-cpu: move nnpa fp16->fp32 and fp32->fp16 to simd-mappings Signed-off-by: Aaron Teo * ggml: refactor fp32->fp16 and fp16->fp32 simd to ggml-cpu Signed-off-by: Aaron Teo * docs: update broken huggingface link for s390x Signed-off-by: Aaron Teo * ggml-cpu: fix duplicate func names during compile Signed-off-by: Aaron Teo * Revert "ggml-cpu: fix duplicate func names during compile" This reverts commit fbb733451f27677063b914d4f6c9a9841d45b38d. Signed-off-by: Aaron Teo * Revert "ggml: refactor fp32->fp16 and fp16->fp32 simd to ggml-cpu" This reverts commit bd288e8fa52b5244f65cee21cb61062f1a9e0ca5. Signed-off-by: Aaron Teo * ggml: refactor fp16<->fp32 simd to ggml-cpu Signed-off-by: Aaron Teo * ggml-cpu: fix missing simd-mappings.h import in quants.c Signed-off-by: Aaron Teo * ggml-cpu: fix missing simd-mappings.h within repack Signed-off-by: Aaron Teo * ggml-cpu: fix amx mmq missing simd-mappings.h Signed-off-by: Aaron Teo * ggml-cpu: attempt at fixing loongarch failing build Signed-off-by: Aaron Teo * ggml-cpu: move nnpa together with other fp16<->fp32 simd Signed-off-by: Aaron Teo * ggml-cpu: fix wrong refactor of ggml-base ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164176555 Signed-off-by: Aaron Teo * ggml: remove dependency on ggml-cpu from ggml-base Signed-off-by: Aaron Teo * ggml-cpu: rename all fp16<->fp32 macros to prefix with ggml_cpu ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164449406 Signed-off-by: Aaron Teo * ggml-cpu: remove mistaken fallback macro fallback logic was already implemented but i was too sleepy to realise Signed-off-by: Aaron Teo * ggml: move ggml_table_f32_f16 to ggml-cpu ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164775006 Signed-off-by: Aaron Teo * ggml-cpu: move ggml_table_f32_f16 back to ggml-base due to ci failures Signed-off-by: Aaron Teo * Revert "ggml-cpu: move ggml_table_f32_f16 back to ggml-base due to ci failures" This reverts commit 32a3533564bdb7902cefb9c89b1c9e956a81ce29. Signed-off-by: Aaron Teo * Revert "ggml: move ggml_table_f32_f16 to ggml-cpu" This reverts commit 9e40d984ad27d7b60392fb2b7548885201864fe4. Signed-off-by: Aaron Teo * ggml: move ggml_table_f32_f16 to ggml-cpu ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164775006 Signed-off-by: Aaron Teo (cherry picked from commit 9e40d984ad27d7b60392fb2b7548885201864fe4) * ggml: move ggml_table_f32_f16 to ggml-cpu.c Signed-off-by: Aaron Teo * ggml-cpu: extern c ggml_table_f32_f16 + chore docs Signed-off-by: Aaron Teo * ggml-cpu: dedup ggml_table_f32_f16 from simd-mappings.h we rely on the variable declaration in ggml-cpu.c instead Signed-off-by: Aaron Teo * Revert "ggml-cpu: dedup ggml_table_f32_f16 from simd-mappings.h" This reverts commit f71b21d2f74f5e03ec0c2b4fefd3cbf395aecf16. Signed-off-by: Aaron Teo * ggml-cpu: bring back ggml_table_f32_f16 Signed-off-by: Aaron Teo * Revert "ggml-cpu: bring back ggml_table_f32_f16" This reverts commit 2dce119178bed5ef5c8398c4230ddd14fef80e49. Signed-off-by: Aaron Teo * fix ggml time initialization * fix f32_f16 table init * remove extra line --------- Signed-off-by: Aaron Teo Co-authored-by: slaren --- ggml/CMakeLists.txt | 1 + ggml/include/ggml-cpu.h | 1 + ggml/src/ggml-cpu/CMakeLists.txt | 8 + ggml/src/ggml-cpu/amx/mmq.cpp | 19 +- ggml/src/ggml-cpu/arch/arm/quants.c | 217 +++++++++---------- ggml/src/ggml-cpu/arch/arm/repack.cpp | 25 +-- ggml/src/ggml-cpu/arch/loongarch/quants.c | 105 +++++----- ggml/src/ggml-cpu/arch/powerpc/quants.c | 111 +++++----- ggml/src/ggml-cpu/arch/riscv/quants.c | 83 ++++---- ggml/src/ggml-cpu/arch/riscv/repack.cpp | 47 +++-- ggml/src/ggml-cpu/arch/s390/quants.c | 57 ++--- ggml/src/ggml-cpu/arch/wasm/quants.c | 59 +++--- ggml/src/ggml-cpu/arch/x86/quants.c | 165 +++++++-------- ggml/src/ggml-cpu/arch/x86/repack.cpp | 39 ++-- ggml/src/ggml-cpu/common.h | 5 +- ggml/src/ggml-cpu/ggml-cpu-impl.h | 12 +- ggml/src/ggml-cpu/ggml-cpu.c | 75 +++++-- ggml/src/ggml-cpu/ggml-cpu.cpp | 3 + ggml/src/ggml-cpu/llamafile/sgemm.cpp | 5 +- ggml/src/ggml-cpu/ops.cpp | 96 ++++----- ggml/src/ggml-cpu/quants.c | 49 ++--- ggml/src/ggml-cpu/repack.cpp | 29 +-- ggml/src/ggml-cpu/simd-mappings.h | 244 +++++++++++++++++++--- ggml/src/ggml-cpu/vec.cpp | 4 +- ggml/src/ggml-cpu/vec.h | 90 ++++---- ggml/src/ggml-impl.h | 244 ++++++---------------- ggml/src/ggml.c | 11 - 27 files changed, 963 insertions(+), 841 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4e7399f9e68..215eb234868 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index de77a875ec5..e3b79d09bb6 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -101,6 +101,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_nnpa (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void); diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 71b1d67b8d0..671fad4d228 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") + set(GGML_NNPA OFF) message(STATUS "z15 target") list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") @@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") list(APPEND ARCH_FLAGS -mvx -mzvector) + list(APPEND ARCH_DEFINITIONS GGML_VXE) + endif() + + if (GGML_NNPA) + message(STATUS "NNPA enabled") + list(APPEND ARCH_DEFINITIONS GGML_NNPA) endif() elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") message(STATUS "Wasm detected") diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index cec34eb6416..47c61b88164 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -8,6 +8,7 @@ #include "mmq.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "quants.h" #include "ggml-quants.h" #include @@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_ // Quantize these floats const float iscale = 127.f / amax; - y[i].d = GGML_FP32_TO_FP16(1 / iscale); + y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale); const float id = ( amax != 0.0f ) ? iscale : 0.f; const __m512 vscale = _mm512_set1_ps(id); @@ -1090,7 +1091,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1113,8 +1114,8 @@ struct acc_C { const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half)))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); + const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1137,7 +1138,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s)); + vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d)); + vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s)); } // load b @@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnniqs + 16); float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) }; float32x4_t scale = vld1q_f32(_scale); @@ -274,10 +275,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // dot product sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, svdot_s32(svdup_n_s32(0), qx0ls, qy0l), - svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, svdot_s32(svdup_n_s32(0), qx1ls, qy1l), - svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); @@ -313,9 +314,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // dot product sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); @@ -354,9 +355,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // dot product sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); @@ -404,8 +405,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -423,7 +424,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -464,10 +465,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; float32_t summs_t[4] = { - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s) }; summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); @@ -490,10 +491,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // mmla into int32x4_t float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) }; float32x4_t scale = vld1q_f32(_scale); @@ -539,7 +540,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -562,8 +563,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; @@ -582,7 +583,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -666,10 +667,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -694,7 +695,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -739,8 +740,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); - summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -784,10 +785,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; @@ -812,7 +813,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -864,10 +865,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) }; float32x4_t scale = vld1q_f32(_scale); @@ -934,10 +935,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), - svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), - svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); @@ -960,9 +961,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); @@ -1002,8 +1003,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); // scale creation - const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); - const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); + const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d); // duplicate deq1 in first half of vector and deq2 in second half of vector const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); @@ -1043,11 +1044,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -1059,7 +1060,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -1217,7 +1218,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const int16x8_t ysum0 = vld1q_s16(y[i].bsums); const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; #if defined(__ARM_FEATURE_DOTPROD) sumi0 = vaddq_s32(sumi0, sumi1); @@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); } *s = sumf; @@ -1362,7 +1363,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const int16x8_t ysum0 = vld1q_s16(y[i].bsums); const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; #if defined(__ARM_FEATURE_DOTPROD) sumi0 = vaddq_s32(sumi0, sumi1); @@ -1393,7 +1394,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); sumf += (float) sumi * d; } @@ -1425,9 +1426,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi switch (vector_length) { case 128: for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; @@ -1570,9 +1571,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi case 256: case 512: for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; @@ -1671,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1742,8 +1743,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1805,7 +1806,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; @@ -1981,7 +1982,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].hmask; @@ -2112,7 +2113,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2258,18 +2259,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); const float32x4_t dmins = { - GGML_FP16_TO_FP32(x0->dmin) * y0->d, - GGML_FP16_TO_FP32(x0->dmin) * y1->d, - GGML_FP16_TO_FP32(x1->dmin) * y0->d, - GGML_FP16_TO_FP32(x1->dmin) * y1->d, + GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d, }; vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); const float32x4_t superblock_scale = { - GGML_FP16_TO_FP32(x0->d) * y0->d, - GGML_FP16_TO_FP32(x0->d) * y1->d, - GGML_FP16_TO_FP32(x1->d) * y0->d, - GGML_FP16_TO_FP32(x1->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, }; vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); } @@ -2289,8 +2290,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2377,8 +2378,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2478,9 +2479,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2520,8 +2521,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2630,9 +2631,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2827,10 +2828,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32); const float32x4_t superblock_scale = { - GGML_FP16_TO_FP32(x0->d) * y0->d, - GGML_FP16_TO_FP32(x0->d) * y1->d, - GGML_FP16_TO_FP32(x1->d) * y0->d, - GGML_FP16_TO_FP32(x1->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, }; visum = vsubq_s32(visum, vibias); @@ -2858,7 +2859,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi svuint8_t q6h_1, q6h_2, q6h_3, q6h_4; for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q6 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -3011,7 +3012,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q6 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -3128,7 +3129,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -3199,7 +3200,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; float sumf1 = 0, sumf2 = 0; @@ -3234,7 +3235,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -3284,7 +3285,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint8x8_t scales8 = vld1_u8(x[i].scales); @@ -3329,7 +3330,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3398,7 +3399,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -3458,7 +3459,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -3521,7 +3522,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3557,7 +3558,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3630,7 +3631,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; @@ -3691,7 +3692,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -3786,7 +3787,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } - sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3); + sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3); } *s = sumf; @@ -3817,7 +3818,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -3905,7 +3906,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } - sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); + sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); } *s = sumf; @@ -3952,7 +3953,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qh += 2; } - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); } *s = sumf; @@ -4003,13 +4004,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); sumf += - GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) + - GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2); + GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) + + GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2); } #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -4071,7 +4072,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); + sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); } *s = sumf; @@ -4079,7 +4080,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 39a0dd301db..2f8bc9e2517 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include @@ -51,7 +52,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < 8; j++) { @@ -102,7 +103,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -145,7 +146,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < 4; j++) { @@ -221,7 +222,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -311,7 +312,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -399,7 +400,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -514,7 +515,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -608,7 +609,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -1117,7 +1118,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -1570,7 +1571,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -2039,7 +2040,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -2147,7 +2148,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index f2ea965724a..9e33fb32286 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -474,7 +475,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); @@ -548,7 +549,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; const __m256 mul = __lasx_xvreplfr2vr_s( id ); @@ -576,7 +577,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Compute the sum of the quants and set y[i].s const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); // Convert int32 to int16 ni0 = lsx_packs_w( ni0, ni1 ); @@ -667,7 +668,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); __m256i qx = bytes_from_nibbles_32(x[ib].qs); @@ -699,7 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi for (; ib + 1 < nb; ib += 2) { // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0); @@ -717,7 +718,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); + const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0); @@ -766,7 +767,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -797,10 +798,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); @@ -834,7 +835,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -865,7 +866,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME + const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -902,7 +903,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -934,16 +935,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d)); + const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); qx = __lasx_xvor_v(qx, bxhi); - const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d)); + const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); const __m256 q = mul_sum_us8_pairs_float(qx, qy); @@ -973,7 +974,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -1003,7 +1004,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { // Compute combined scale for the block - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0); __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); @@ -1023,7 +1024,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -1047,8 +1048,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1116,8 +1117,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1170,7 +1171,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales @@ -1294,7 +1295,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1330,8 +1331,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -1438,9 +1439,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1477,8 +1478,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -1593,9 +1594,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1624,7 +1625,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -1713,7 +1714,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1780,7 +1781,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); @@ -1820,7 +1821,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -1895,7 +1896,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1980,7 +1981,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2049,7 +2050,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); @@ -2108,7 +2109,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -2168,7 +2169,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2213,7 +2214,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2279,7 +2280,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; @@ -2340,7 +2341,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -2451,7 +2452,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); accum1 += d * sumi1; } @@ -2484,7 +2485,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -2530,9 +2531,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = lasx_madd_h(p16_1, mone); const __m256i p_2 = lasx_madd_h(p16_2, mone); - accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), __lasx_xvffint_s_w(p_1), accum1); - accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), __lasx_xvffint_s_w(p_2), accum2); } @@ -2540,7 +2541,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -2595,7 +2596,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v sumi1 = __lasx_xvadd_w(p_1, sumi1); sumi2 = __lasx_xvadd_w(p_2, sumi2); } - accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); } @@ -2604,7 +2605,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c index ce4e47a8639..053d5cbdc7b 100644 --- a/ggml/src/ggml-cpu/arch/powerpc/quants.c +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -67,7 +68,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float id = d ? 1.0f/d : 0.0f; const vector float vid = vec_splats(id); - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const vector float v = vec_round(vec_mul(srcv[j], vid)); @@ -112,7 +113,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float id = d ? 1.0f/d : 0.0f; const vector float vid = vec_splats(id); - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); vector int accv = vec_splats(0); @@ -127,7 +128,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i accv = vec_add(accv, vec_sld(accv, accv, 4)); accv = vec_add(accv, vec_sld(accv, accv, 8)); - y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0)); } #else @@ -170,8 +171,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); @@ -214,7 +215,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -249,12 +250,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; vsumf0 = vec_madd(vxmin, vys, vsumf0); vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); @@ -291,7 +292,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -326,8 +327,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])}; @@ -379,7 +380,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -415,12 +416,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; vsumf0 = vec_madd(vxmin, vys, vsumf0); vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])}; @@ -470,7 +471,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -502,8 +503,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed char q8x0 = vec_xl( 0, x[ib].qs); @@ -542,7 +543,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -574,11 +575,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); @@ -708,8 +709,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -770,7 +771,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -962,7 +963,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1005,11 +1006,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); @@ -1177,9 +1178,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1222,11 +1223,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); UNUSED(kmask1); @@ -1394,9 +1395,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1432,7 +1433,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -1591,7 +1592,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1659,7 +1660,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -1742,7 +1743,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -1790,7 +1791,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -1871,7 +1872,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1939,7 +1940,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2033,7 +2034,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -2096,7 +2097,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2176,7 +2177,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2236,7 +2237,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2329,7 +2330,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -2394,7 +2395,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2505,7 +2506,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -2546,8 +2547,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); @@ -2582,7 +2583,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -2620,7 +2621,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v for (int ibl = 0; ibl < nb; ++ibl) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d)); vector float vyd = vec_splats(y[ibl].d); vector float vd = vec_mul(vxd, vyd); @@ -2697,7 +2698,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index 6f3aa94fbbe..8b64d8adc48 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -45,7 +46,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); @@ -85,7 +86,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); @@ -102,7 +103,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // set y[i].s int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = GGML_FP32_TO_FP16(sum*d); + y[i].s = GGML_CPU_FP32_TO_FP16(sum*d); } #else @@ -160,7 +161,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } #endif @@ -177,7 +178,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -225,7 +226,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } #endif @@ -242,7 +243,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -293,7 +294,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } #endif @@ -316,7 +317,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -366,7 +367,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } #endif @@ -389,7 +390,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -427,7 +428,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } #endif @@ -438,7 +439,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -465,8 +466,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * q2 = x[i].qs; const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); uint8_t *patmp = atmp; int vsums; int tmp; @@ -569,8 +570,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); size_t vl = 16; @@ -644,8 +645,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * q2 = x[i].qs; const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); uint8_t *patmp = atmp; int vsums; int tmp; @@ -750,8 +751,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -916,7 +917,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi q3 += 32; q8 += 128; scale += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d * isum; } @@ -1017,7 +1018,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d*sum_t; @@ -1134,7 +1135,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi q3 += 32; q8 += 128; scale += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d * isum; } break; @@ -1202,7 +1203,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1239,8 +1240,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int tmp, tmp2, sumi; __asm__ __volatile__( @@ -1361,8 +1362,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi size_t vl = 8; - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -1422,8 +1423,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi break; case 128: for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int tmp, tmp2, sumi; __asm__ __volatile__( @@ -1580,9 +1581,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1627,8 +1628,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * GGML_RESTRICT hm = x[i].qh; const int8_t * GGML_RESTRICT q8 = y[i].qs; - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl); vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl); @@ -1749,9 +1750,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1778,7 +1779,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -1862,7 +1863,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi case 256: for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q6 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -1943,7 +1944,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi case 128: for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -2058,7 +2059,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp index 0882b410243..45c91a69482 100644 --- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp +++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include @@ -90,16 +91,16 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); // vector version needs Zvfhmin extension - const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d); + const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d); const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) }; const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); @@ -129,7 +130,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -181,20 +182,20 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo // vector version needs Zvfhmin extension const float a_scales[4] = { - GGML_FP16_TO_FP32(a_ptr[l].d[0]), - GGML_FP16_TO_FP32(a_ptr[l].d[1]), - GGML_FP16_TO_FP32(a_ptr[l].d[2]), - GGML_FP16_TO_FP32(a_ptr[l].d[3]) + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3]) }; const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) }; const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); @@ -382,7 +383,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 26bd9087571..a840219a4fc 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -49,7 +50,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f / d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const __vector float v = vec_mul(srcv[j], vec_splats(id)); @@ -94,7 +95,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f / d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); __vector int32_t acc = vec_splats(0); @@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i acc = vec_add(acc, vi); } - y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); + y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); } #else GGML_UNUSED(nb); @@ -164,7 +165,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); const __vector float v_xy = vec_float(vec_unpackh(v_xy_)); - const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); acc = vec_madd(v_xy, v_d, acc); } @@ -185,7 +186,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -219,7 +220,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const uint8x16_t v_x = vec_xl(0, x[ib].qs); const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); @@ -231,7 +232,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); const float32x4_t v_xy = vec_float(v_xy_); - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); acc = vec_madd(v_xy, v_d, acc); } @@ -252,7 +253,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); const float32x4_t v_xy = vec_float(v_xy_); - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); acc = vec_madd(v_xy, v_d, acc); } @@ -305,7 +306,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -348,7 +349,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * restrict x0l = x[i].qs; const uint8_t * restrict x0h = x[i].hmask; @@ -497,7 +498,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -537,8 +538,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); @@ -647,9 +648,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -698,8 +699,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); @@ -819,9 +820,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -859,7 +860,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_y[4]; for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT x0l = x[i].ql; const uint8_t * GGML_RESTRICT x0h = x[i].qh; @@ -1004,7 +1005,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1071,7 +1072,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi // float sumf = 0; // for (int i = 0; i < nb; ++i) { -// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // const uint16_t * GGML_RESTRICT q2 = x[i].qs; // const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1121,7 +1122,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi // float sumf = 0.f; // for (int i = 0; i < nb; ++i) { -// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // const uint16_t * GGML_RESTRICT q2 = x[i].qs; // const int8_t * GGML_RESTRICT q8 = y[i].qs; // int32_t bsum = 0; @@ -1182,12 +1183,12 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); + sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); } #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -1257,7 +1258,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2; } - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); + sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); } *s = sumf; @@ -1265,7 +1266,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 4ec97f533f1..b0904d8a3ab 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -65,7 +66,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); @@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); v128_t accv = wasm_i32x4_splat(0); @@ -126,7 +127,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i accv = wasm_i32x4_add(accv, vi); } - y[i].s = GGML_FP32_TO_FP16( + y[i].s = GGML_CPU_FP32_TO_FP16( d * (wasm_i32x4_extract_lane(accv, 0) + wasm_i32x4_extract_lane(accv, 1) + wasm_i32x4_extract_lane(accv, 2) + @@ -324,8 +325,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi ); // Accumulate results with scaling - float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); - float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d); + float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); + float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d); sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); @@ -348,7 +349,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -428,7 +429,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); } sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -454,7 +455,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -491,7 +492,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -538,7 +539,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); } sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -564,7 +565,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -620,7 +621,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1)); // Convert to float and accumulate - const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); + const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); } @@ -635,7 +636,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -746,8 +747,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi isum += wasm_i32x4_extract_lane(isum_vec, 0); } - const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf += dall * isum - dmin * summs; } @@ -768,8 +769,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -880,7 +881,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } // Accumulate results - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const v128_t v_d = wasm_f32x4_splat(d); v128_t v_sum = wasm_f32x4_add( wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), @@ -957,7 +958,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -991,8 +992,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign const uint8_t * GGML_RESTRICT q4 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1136,9 +1137,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1170,8 +1171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign const uint8_t * GGML_RESTRICT q5 = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -1331,9 +1332,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1420,7 +1421,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi wasm_v128_store(&aux32[0], acc0); wasm_v128_store(&aux32[4], acc1); - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) { sums[l] += d * aux32[l]; } @@ -1470,7 +1471,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index e3f722b52c9..e7527c00a8f 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -256,9 +257,9 @@ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_ // quad fp16 delta calculation static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { - // GGML_FP16_TO_FP32 is faster than Intel F16C - return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)), - _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0))); + // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C + return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)), + _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0))); } #endif #elif defined(__SSSE3__) @@ -305,7 +306,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = maxScalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -401,7 +402,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -425,7 +426,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -455,7 +456,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); @@ -552,7 +553,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); __m256i qx = bytes_from_nibbles_32(x[ib].qs); @@ -613,7 +614,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); @@ -631,7 +632,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); @@ -680,7 +681,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -711,10 +712,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); @@ -752,7 +753,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -783,7 +784,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -807,7 +808,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -851,7 +852,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -883,16 +884,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); qx = _mm256_or_si256(qx, bxhi); - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_us8_pairs_float(qx, qy); @@ -910,9 +911,9 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -926,7 +927,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi bxh = _mm_or_si128(bxh, bxhih); bx_0 = MM256_SET_M128I(bxh, bxl); - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); @@ -956,7 +957,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -986,7 +987,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); @@ -1025,7 +1026,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -1144,7 +1145,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_sub_epi16(sumi0, ysum); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); @@ -1190,7 +1191,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); } *s = sumf; @@ -1244,7 +1245,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_add_epi16(sumi0, sumi1); sumi0 = _mm256_sub_epi16(sumi0, ysum); @@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); sumf += (float) sumi * d; } @@ -1299,8 +1300,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1366,8 +1367,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1477,8 +1478,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1533,7 +1534,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1638,7 +1639,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1824,7 +1825,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1862,8 +1863,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -1928,8 +1929,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q4 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2049,9 +2050,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2092,8 +2093,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -2170,8 +2171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2311,9 +2312,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2344,7 +2345,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -2422,7 +2423,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -2555,7 +2556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2622,7 +2623,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); @@ -2663,7 +2664,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); @@ -2717,7 +2718,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -2792,7 +2793,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2913,7 +2914,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3035,7 +3036,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3104,7 +3105,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); @@ -3177,7 +3178,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); @@ -3253,7 +3254,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -3313,7 +3314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3358,7 +3359,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3414,7 +3415,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3480,7 +3481,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; @@ -3565,7 +3566,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; @@ -3648,7 +3649,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -3753,7 +3754,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); accum1 += d * sumi1; @@ -3801,7 +3802,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); accum1 += d * sumi1; @@ -3835,7 +3836,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -3947,7 +3948,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 8; qh += 4; } - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); @@ -4033,7 +4034,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 8; qh += 4; } - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); @@ -4083,7 +4084,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qh += 2; } - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); } *s = sumf; @@ -4129,9 +4130,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); - accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), _mm256_cvtepi32_ps(p_1), accum1); - accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), _mm256_cvtepi32_ps(p_2), accum2); } @@ -4164,7 +4165,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -4219,7 +4220,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v sumi1 = _mm256_add_epi32(p_1, sumi1); sumi2 = _mm256_add_epi32(p_2, sumi2); } - accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); } @@ -4267,7 +4268,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); - accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); } @@ -4276,7 +4277,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index e7635a294a7..c00c1e541cb 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include @@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) { float tmp[16]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } for (int i = 0; i < 8; i++) { - tmp[i + 8] = GGML_FP16_TO_FP32(y[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]); } return _mm512_loadu_ps(tmp); @@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) { _mm_storeu_si128((__m128i*)tmphalf, x); for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm512_loadu_ps(tmp); @@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask)); for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm256_loadu_ps(tmp); @@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f; // Store the scale for the individual block - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); // Store the values in blocks of eight values - Aim is to use these later for block interleaving srcv[row_iter][0] = v0; @@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask); // Load and convert to FP32 scale from block_q8_0 - const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d)); + const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d)); // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs)); @@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; } } for (int sb = 0; sb < 8; sb++) { uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; } } } @@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; } } } @@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo for(int m = 0; m < 4; m++) { const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); for(int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; } } } diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h index 5624176cce9..353563dc35c 100644 --- a/ggml/src/ggml-cpu/common.h +++ b/ggml/src/ggml-cpu/common.h @@ -4,6 +4,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "simd-mappings.h" #ifdef __cplusplus @@ -12,11 +13,11 @@ // convenience functions/macros for use in template calls // note: these won't be required after the 'traits' lookup table is used. static inline ggml_fp16_t f32_to_f16(float x) { - return GGML_FP32_TO_FP16(x); + return GGML_CPU_FP32_TO_FP16(x); } static inline float f16_to_f32(ggml_fp16_t x) { - return GGML_FP16_TO_FP32(x); + return GGML_CPU_FP16_TO_FP32(x); } static inline ggml_bf16_t f32_to_bf16(float x) { diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 73a8f93987a..d839cf5c55e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -62,11 +62,17 @@ struct ggml_compute_params { #if defined(__s390x__) && defined(__VEC__) #ifndef __VXE__ #define __VXE__ -#endif +#endif // __VXE__ #ifndef __VXE2__ #define __VXE2__ -#endif -#endif +#endif // __VXE2__ +#endif // __s390x__ && __VEC__ + +#if defined(__s390x__) && defined(GGML_NNPA) +#ifndef __NNPA__ +#define __NNPA__ +#endif // __NNPA__ +#endif // __s390x__ && GGML_NNPA #if defined(__ARM_FEATURE_SVE) #include diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1d3cd009aff..7cae96f4b48 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -72,6 +72,9 @@ #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) +// precomputed f32 table for f16 (256 KB) (simd-mappings.h) +float ggml_table_f32_f16[1 << 16]; + #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { int sve_cnt; @@ -736,7 +739,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -795,7 +798,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -846,7 +849,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -891,7 +894,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -920,7 +923,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -947,7 +950,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -985,7 +988,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } case GGML_TYPE_F16: { - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -1024,7 +1027,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1051,7 +1054,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -1078,7 +1081,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -3141,9 +3144,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); _mm_storel_epi64((__m128i *)(y + i), y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); + float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); + uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } + for (; i + 3 < n; i += 4) { + float32x4_t v_x = vec_xl(0, (const float *)(x + i)); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } #endif for (; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(x[i]); } } @@ -3167,9 +3185,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { __m128 y_vec = _mm_cvtph_ps(x_vec); _mm_storeu_ps(y + i, y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i + 0)); + vec_xst(v_yl, 0, (float *)(y + i + 4)); + } + for (; i + 3 < n; i += 4) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i)); + } #endif + for (; i < n; ++i) { - y[i] = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP16_TO_FP32(x[i]); } } @@ -3369,6 +3403,14 @@ int ggml_cpu_has_vxe(void) { #endif } +int ggml_cpu_has_nnpa(void) { +#if defined(GGML_NNPA) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_neon(void) { #if defined(__ARM_ARCH) && defined(__ARM_NEON) return 1; @@ -3418,7 +3460,7 @@ int ggml_cpu_has_sme(void) { } void ggml_cpu_init(void) { - // needed to initialize f16 tables + // needed to initialize ggml_time { struct ggml_init_params params = { 0, NULL, false }; struct ggml_context * ctx = ggml_init(params); @@ -3439,9 +3481,10 @@ void ggml_cpu_init(void) { uint16_t u16; ggml_fp16_t fp16; } u = {i}; - float f = GGML_FP16_TO_FP32(u.fp16); - ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + ggml_table_f32_f16[i] = f; + ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 735ef3f015c..a98866a2d80 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_vxe()) { features.push_back({ "VXE", "1" }); } + if (ggml_cpu_has_nnpa()) { + features.push_back({ "NNPA", "1" }); + } if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 7ed3874afb8..ed61869a550 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -52,6 +52,7 @@ #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-quants.h" +#include "simd-mappings.h" #include #include @@ -73,7 +74,7 @@ namespace { inline float unhalf(ggml_fp16_t d) { - return GGML_FP16_TO_FP32(d); + return GGML_CPU_FP16_TO_FP32(d); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) { float tmp[4]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(p[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]); } return vec_xl(0, (const float *)(tmp)); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f6ebb4ea763..525fd178760 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16( for (int i01 = ir0; i01 < ir1; i01++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); + dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]); id++; } } @@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16( const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]); } quantize_row_q(src0_f32, dst_ptr + id, ne00); @@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16( for (int i00 = 0; i00 < ne00; i00++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr); id++; } } @@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16( for (int i01 = ir0; i01 < ir1; i01++) { const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); id++; } } @@ -473,7 +473,7 @@ static void ggml_compute_forward_dup_bf16( for (int i00 = 0; i00 < ne00; i00++) { const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); id++; } } @@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); if (++i10 == ne0) { i10 = 0; @@ -765,7 +765,7 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; } } @@ -878,7 +878,7 @@ static void ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -1419,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f32( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1435,7 +1435,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); const int ith = params->ith; const int nth = params->nth; @@ -1467,7 +1467,7 @@ static void ggml_compute_forward_add1_f16_f16( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1889,7 +1889,7 @@ static void ggml_compute_forward_sum_f16( } } } - ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); + ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum); } static void ggml_compute_forward_sum_bf16( @@ -2660,7 +2660,7 @@ static void ggml_compute_forward_gelu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2763,7 +2763,7 @@ static void ggml_compute_forward_gelu_erf_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2866,7 +2866,7 @@ static void ggml_compute_forward_gelu_quick_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2969,7 +2969,7 @@ static void ggml_compute_forward_silu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -3163,7 +3163,7 @@ static void ggml_compute_forward_silu_back_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -4500,7 +4500,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16( for (int j = 0; j < nc; ++j) { ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v); } } } @@ -4792,7 +4792,7 @@ static void ggml_compute_forward_soft_max_f32( if (mp_f32) { if (use_f16) { for (int i = 0; i < nc; ++i) { - wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); + wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]); } } else { for (int i = 0; i < nc; ++i) { @@ -5018,8 +5018,8 @@ static void ggml_compute_forward_clamp_f16( ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); for (int i = 0; i < nc; i++) { - float v = GGML_FP16_TO_FP32(src0_ptr[i]); - dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min)); + float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min)); } } } @@ -5476,11 +5476,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { @@ -5492,11 +5492,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } else { @@ -5507,11 +5507,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[1]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -5525,11 +5525,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { @@ -5640,7 +5640,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -5933,7 +5933,7 @@ static void ggml_compute_forward_im2col_f16( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -6109,7 +6109,7 @@ void ggml_compute_forward_conv_transpose_2d( const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -6358,7 +6358,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { case GGML_OP_POOL_AVG: drow[i] += srow_j; break; case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; @@ -6450,7 +6450,7 @@ void ggml_compute_forward_pool_2d( for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { case GGML_OP_POOL_AVG: *out += srow_j; break; case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; @@ -6538,7 +6538,7 @@ void ggml_compute_forward_pool_2d_back( } const float val = dst->type == GGML_TYPE_F32 ? - ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); + ((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); if (val <= maxval) { continue; } @@ -6558,7 +6558,7 @@ void ggml_compute_forward_pool_2d_back( if (dst->type == GGML_TYPE_F32) { ((float *) drow)[j] += grad0; } else { - ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); + ((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); } } else if (op == GGML_OP_POOL_AVG) { const float grad = grad0 / ka; @@ -6577,7 +6577,7 @@ void ggml_compute_forward_pool_2d_back( if (dst->type == GGML_TYPE_F32) { ((float *) drow)[j] += grad; } else { - ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad); + ((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad); } } } @@ -7147,7 +7147,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf for (int64_t ic = 0; ic < nek1; ++ic) { - const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f; + const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f; if (mv == -INFINITY) { continue; } @@ -7215,7 +7215,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( if (v->type == GGML_TYPE_F16) { for (int64_t d = 0; d < DV; ++d) { - VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]); + VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]); } } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index d2e705f287a..ee35ab42fda 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -2,6 +2,7 @@ #include "ggml-common.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "ggml-quants.h" #include "quants.h" @@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); } *s = sumf; @@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); sumf += (float) sumi * d; } @@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, qh += 2; } - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); } *s = sumf; @@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0; for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 692c53e01c0..72ee93a5abc 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include "arch-fallback.h" @@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; } } for (int sb = 0; sb < 8; sb++) { uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; } } } @@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; } } } @@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for(int m = 0; m < 4; m++) { const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); for(int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; } } } @@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index e42364c59aa..b68ac0dd68b 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -2,10 +2,167 @@ #include "ggml-cpu-impl.h" +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + +#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include +#endif + +#if defined(__F16C__) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + // // simd mappings // +// FP16 to FP32 conversion + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +// +// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 +// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 +// +#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + + static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; + } + + static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; + } +#elif defined(__F16C__) + #ifdef _MSC_VER + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) + #else + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) + #endif +#elif defined(__POWER9_VECTOR__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) + /* the inline asm below is about 12% faster than the lookup method */ + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; + } + + static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { + double d; + ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; + } +#elif defined(__riscv) && defined(__riscv_zfhmin) + static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + __asm__( + "fmv.h.x %[f], %[h]\n\t" + "fcvt.s.h %[f], %[f]" + : [f] "=&f" (f) + : [h] "r" (h) + ); + return f; + } + + static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __asm__( + "fcvt.h.s %[f], %[f]\n\t" + "fmv.x.h %[h], %[f]" + : [h] "=&r" (res) + : [f] "f" (f) + ); + return res; + } + + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) +#elif defined(__NNPA__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) { + uint16x8_t v_h = vec_splats(h); + uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(v_hd, 0)[0]; + } + + static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) { + float32x4_t v_f = vec_splats(f); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0); + uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0); + return vec_extract(v_h, 0); + } +#endif + +// precomputed f32 table for f16 (256 KB) +// defined in ggml-cpu.c, initialized in ggml_cpu_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(GGML_CPU_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_CPU_FP32_TO_FP16) +#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { _mm256_storeu_ps(arr, y); for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) @@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) { inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]); return wasm_v128_load(tmp); } @@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { wasm_v128_store(tmp, x); - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); + p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]); + p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]); + p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]); + p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]); } #define GGML_F16x4 v128_t @@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return _mm_loadu_ps(tmp); } @@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { _mm_storeu_ps(arr, y); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F32x4_ZERO __lsx_vldi(0) #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) -#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) +#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0) #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) #define GGML_F32x4_ADD __lsx_vfadd_s #define GGML_F32x4_MUL __lsx_vfmul_s @@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return __lsx_vld(tmp, 0); } @@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { __lsx_vst(y, arr, 0); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F32_STEP 32 #define GGML_F32_EPR 4 -#define GGML_F32x4 __vector float +#define GGML_F32x4 float32x4_t #define GGML_F32x4_ZERO vec_splats(0.0f) #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) @@ -962,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_STEP GGML_F32_STEP #define GGML_F16_EPR GGML_F32_EPR -static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { +static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { +#if defined(__NNPA__) + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); + uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0); + return vec_extend_to_fp32_hi(v_xd, 0); +#else float tmp[4]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 return vec_xl(0, (const float *)(tmp)); +#endif } -static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { +static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { +#if defined(__NNPA__) + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0); + uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0); + + x[0] = vec_extract(v_x, 0); + x[1] = vec_extract(v_x, 1); + x[2] = vec_extract(v_x, 2); + x[3] = vec_extract(v_x, 3); +#else float arr[4]; // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 - vec_xst(y, 0, (float *)(arr)); + vec_xst(v_y, 0, (float *)(arr)); for (int i = 0; i < 4; i++) { - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } +#endif } #define GGML_F16_VEC GGML_F32x4 @@ -1004,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index f7614568ea3..5e34d79a169 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G // leftovers for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #else for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #endif diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 09dbade2179..84f6c0e6d26 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } @@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } @@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i])); + y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i])); } } inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i])); } } @@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG // leftovers for (int i = np; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #endif @@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #endif } @@ -430,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #endif } @@ -444,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*v); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*v); } } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); } } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); + y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); } } inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f); } } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); } } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i])))); + y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i])))); } } // TODO: optimize performance inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); } } inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); + y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); } } inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i]))); } } @@ -562,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float xi = GGML_FP16_TO_FP32(x[i]); + float xi = GGML_CPU_FP16_TO_FP32(x[i]); float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); - y[i] = GGML_FP32_TO_FP16(res); + y[i] = GGML_CPU_FP32_TO_FP16(res); } } @@ -577,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } else if (x[i] >= 10.0f) { y[i] = x[i]; } else { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]); } } } @@ -613,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) { inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); } } #else @@ -628,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); } } @@ -638,8 +638,8 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { - float v = GGML_FP16_TO_FP32(x); - return GGML_FP32_TO_FP16(v/(1.0f + expf(-v))); + float v = GGML_CPU_FP16_TO_FP32(x); + return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v))); } #if __FINITE_MATH_ONLY__ @@ -888,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) { } inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) { - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); const float s = 1.0f/(1.0f + expf(-v)); - return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); + return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); } inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { @@ -928,7 +928,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { float sum = 0.0f; for (int i = 0; i < n; ++i) { - sum += GGML_FP16_TO_FP32(x[i]); + sum += GGML_CPU_FP16_TO_FP32(x[i]); } *s = sum; } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 6dc5ce0d92f..57761644f43 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -317,203 +317,81 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); GGML_API void * ggml_aligned_malloc(size_t size); GGML_API void ggml_aligned_free(void * ptr, size_t size); -// FP16 to FP32 conversion +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -// -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 -// -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - - #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __fp16 tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; - } - -#elif defined(__F16C__) - - #ifdef _MSC_VER - #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) - #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) - #else - #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) - #endif - -#elif defined(__POWER9_VECTOR__) - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - /* the inline asm below is about 12% faster than the lookup method */ - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - double d; - ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; - } - -#elif defined(__riscv) && defined(__riscv_zfhmin) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - __asm__( - "fmv.h.x %[f], %[h]\n\t" - "fcvt.s.h %[f], %[f]" - : [f] "=&f" (f) - : [h] "r" (h) - ); - return f; - } +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __asm__( - "fcvt.h.s %[f], %[f]\n\t" - "fmv.x.h %[h], %[f]" - : [h] "=&r" (res) - : [f] "f" (f) - ); - return res; - } +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; #else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - // FP16 <-> FP32 - // ref: https://github.com/Maratyszcza/FP16 - - static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; - } - - static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; - } - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float exp_scale = 0x1.0p-112f; - #else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); - #endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; - #else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); - #endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); } - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -GGML_API float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif /** * Converts brain16 to float32. diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 354a805179c..c963b338a18 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -61,9 +61,6 @@ #define m512i(p) (__m512i)(p) #endif -// precomputed f32 table for f16 (256 KB) (ggml-impl.h) -float ggml_table_f32_f16[1 << 16]; - #if defined(__linux__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH) @@ -1422,14 +1419,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { // initialize time system (required on Windows) ggml_time_init(); - for (int i = 0; i < (1 << 16); ++i) { - union { - uint16_t u16; - ggml_fp16_t fp16; - } u = {i}; - ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); - } - is_first_call = false; } From a5e6a3c953d6b1ce27515f480a42bc2b7f99968d Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Thu, 26 Jun 2025 12:11:59 +0800 Subject: [PATCH 0024/1348] musa: enable fp16 mma (all) and cublas on qy2 (llama/13842) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * musa: enable fp16 mma (all) and cublas on qy2 Signed-off-by: Xiaodong Ye * Update ggml/src/ggml-cuda/ggml-cuda.cu Co-authored-by: Johannes Gäßler * Address review comments Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye * musa: disable MUL_MAT_ID (q2_k × f32) due to precision issues Signed-off-by: Xiaodong Ye --------- Signed-off-by: Xiaodong Ye Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 25 +++++++++++++------------ ggml/src/ggml-cuda/fattn-wmma-f16.cu | 4 ++++ ggml/src/ggml-cuda/ggml-cuda.cu | 25 +++++++++++++++---------- ggml/src/ggml-musa/mudnn.cuh | 4 ++-- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index f6127aeee42..ea203550238 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -76,11 +76,9 @@ #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) // Moore Threads -#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) - -#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) @@ -203,9 +201,9 @@ typedef float2 dfloat2; #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #define FP16_MMA_AVAILABLE @@ -219,9 +217,9 @@ typedef float2 dfloat2; #define CP_ASYNC_AVAILABLE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE -#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) #define FLASH_ATTN_AVAILABLE -#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) static bool fp16_available(const int cc) { return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL; @@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fast_fp16_hardware_available(const int cc) { - return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } // Any FP16 tensor core instructions are available for ggml code. @@ -242,7 +241,8 @@ static bool fp16_mma_available(const int cc) { return false; #else if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) { + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || + GGML_CUDA_CC_IS_MTHREADS(cc)) { return true; } else if (GGML_CUDA_CC_IS_RDNA4(cc)) { #if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) @@ -259,7 +259,8 @@ static bool fp16_mma_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fp16_mma_hardware_available(const int cc) { return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } static bool bf16_mma_hardware_available(const int cc) { diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index c5668adb152..f3b794c3644 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -9,7 +9,11 @@ #ifdef FP16_MMA_AVAILABLE #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) #include +#ifdef GGML_USE_MUSA +namespace wmma = mtmusa::wmma; +#else // GGML_USE_MUSA namespace wmma = nvcuda::wmma; +#endif // GGML_USE_MUSA #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE) #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers #include diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b3e6833c396..b30c13c62f2 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1227,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas( const int cc = ggml_cuda_info().devices[id].cc; + const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); + const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; - if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { ggml_cuda_pool_alloc src1_as_bf16(ctx.pool(id)); if (src1->type != GGML_TYPE_BF16) { const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type); @@ -1257,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); - } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { + } else if (fast_fp16_hardware_available(cc) && use_fp16) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { @@ -3061,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } #ifdef GGML_USE_MUSA - if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 && - !ggml_is_transposed(a) && !ggml_is_transposed(b)) { - return false; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) { + if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT && + a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) { + return false; + } + if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID && + a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) { + return false; + } } #endif // GGML_USE_MUSA switch (a->type) { @@ -3090,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_BF16: -#ifdef GGML_USE_MUSA - if (a->type == GGML_TYPE_Q3_K) { - return false; - } -#endif // GGML_USE_MUSA return true; default: return false; diff --git a/ggml/src/ggml-musa/mudnn.cuh b/ggml/src/ggml-musa/mudnn.cuh index a63be5755c7..c30128561e8 100644 --- a/ggml/src/ggml-musa/mudnn.cuh +++ b/ggml/src/ggml-musa/mudnn.cuh @@ -1,7 +1,7 @@ #pragma once -#include "../include/ggml.h" -#include "../ggml-cuda/common.cuh" +#include "ggml-cuda/common.cuh" +#include "ggml.h" // Asynchronously copies data from src tensor to dst tensor using the provided context. // Returns a musaError_t indicating success or failure. From c848b9fbef6b40b4e6864081984788d3b6e5e199 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 26 Jun 2025 15:50:15 +0300 Subject: [PATCH 0025/1348] metal : batch rows copy in a single threadgroup (llama/14384) * metal : batch rows copy in a single threadgroup ggml-ci * metal : handle some edge cases when threadgroup size is not a power of 2 ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 43 ++++++++++++++++++++++++---- ggml/src/ggml-metal/ggml-metal.metal | 11 +++++-- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 19f4d59e597..248fa378ef0 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -2450,6 +2450,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00); ggml_metal_kargs_sum_rows args = { @@ -3780,6 +3781,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_rms_norm args = { @@ -3816,6 +3818,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_l2_norm args = { @@ -3888,6 +3891,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_norm args = { @@ -4974,8 +4978,39 @@ static bool ggml_metal_encode_node( default: GGML_ABORT("not implemented"); } + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + // TODO: support + //const int32_t nk00 = ne00/ggml_blck_size(dst->type); + const int32_t nk00 = ne00; + + int nth = 32; // SIMD width + + while (nth < nk00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); + + // when rows are small, we can batch them together in a single threadgroup + int nrptg = 1; + + // TODO: relax this constraint in the future + if (ggml_blck_size(src0->type) == 1 && ggml_blck_size(dst->type) == 1) { + if (nth > nk00) { + nrptg = (nth + nk00 - 1)/nk00; + nth = nk00; + + if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) { + nrptg--; + } + } + } + + nth = MIN(nth, nk00); + ggml_metal_kargs_cpy args = { - /*.ne00 =*/ ne00, + /*.ne00 =*/ nk00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, /*.ne03 =*/ ne03, @@ -4998,11 +5033,7 @@ static bool ggml_metal_encode_node( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); - int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; } break; case GGML_OP_SET: { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3da19879b4b..f028276068e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4306,11 +4306,16 @@ kernel void kernel_cpy( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { + ushort3 tptg[[threads_per_threadgroup]]) { const int i03 = tgpig[2]; const int i02 = tgpig[1]; - const int i01 = tgpig[0]; + const int i01 = tgpig[0]*tptg.y + tiitg/tptg.x; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; @@ -4321,7 +4326,7 @@ kernel void kernel_cpy( device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { + for (int64_t i00 = tiitg%tptg.x; i00 < args.ne00; i00 += tptg.x) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; } From c300f1e32dda70f7c8c47338244355bd26dae870 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 26 Jun 2025 15:51:19 +0300 Subject: [PATCH 0026/1348] metal : add special-case mat-vec mul for ne00 == 4 (llama/14385) ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 25 +++++++++-- ggml/src/ggml-metal/ggml-metal.metal | 64 ++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 248fa378ef0..d8d30cc0b41 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -211,11 +211,14 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, @@ -1175,11 +1178,14 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, mul_mv_f32_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, mul_mv_bf16_f32_c4, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, mul_mv_f16_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction); @@ -3111,14 +3117,23 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; nr1 = 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + if (ne00 == 4) { + nr0 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + } } break; case GGML_TYPE_F16: { nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; @@ -3137,7 +3152,11 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f028276068e..5f004a856bd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2532,6 +2532,70 @@ template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv< template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv; #endif +template +void kernel_mul_mv_c4_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig, + ushort tiisg) { + const int r0 = tgpig.x*32 + tiisg; + const int rb = tgpig.y*N_MV_T_T; + const int im = tgpig.z; + + if (r0 >= args.ne01) { + return; + } + + const uint i12 = im%args.ne12; + const uint i13 = im/args.ne12; + + const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + + device const T04 * x = (device const T04 *) (src0 + offset0); + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1; + + for (int row = 0; row < N_MV_T_T; ++row) { + int r1 = rb + row; + if (r1 >= args.ne11) { + break; + } + + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const T14 * y = (device const T14 *) (src1 + offset1); + + dst_f32[(uint64_t)r1*args.ne0 + r0] = dot((float4) x[0], (float4) y[0]); + } +} + +template +kernel void kernel_mul_mv_c4( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]]) { + kernel_mul_mv_c4_impl( + args, + src0, + src1, + dst, + tgpig, + tiisg); +} + +typedef decltype(kernel_mul_mv_c4) mul_mv_c4_t; + +template [[host_name("kernel_mul_mv_f32_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4; +template [[host_name("kernel_mul_mv_f16_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4; +#if defined(GGML_METAL_USE_BF16) +template [[host_name("kernel_mul_mv_bf16_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4; +#endif + template kernel void kernel_mul_mv_1row( constant ggml_metal_kargs_mul_mv & args, From 566462a5c0387d8a4be87ed4aa2d4d81575dadcd Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Thu, 26 Jun 2025 13:46:53 -0300 Subject: [PATCH 0027/1348] cmake: regen vulkan shaders when shaders-gen sources change (llama/14398) * Add shaders-gen sources as target deps --- ggml/src/ggml-vulkan/CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 39f022f33d8..0bf4cb14f88 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -143,7 +143,8 @@ if (Vulkan_FOUND) -DCMAKE_BUILD_TYPE=$ ${VULKAN_SHADER_GEN_CMAKE_ARGS} - BUILD_COMMAND ${CMAKE_COMMAND} --build . --config $ + BUILD_COMMAND ${CMAKE_COMMAND} --build . --config $ + BUILD_ALWAYS TRUE # NOTE: When DESTDIR is set using Makefile generators and # "make install" triggers the build step, vulkan-shaders-gen @@ -164,6 +165,14 @@ if (Vulkan_FOUND) file(GLOB _ggml_vk_shader_files CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.comp") + # Because external projects do not provide source-level tracking, + # the vulkan-shaders-gen sources need to be explicitly added to + # ensure that changes will cascade into shader re-generation. + + file(GLOB _ggml_vk_shaders_gen_sources + CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.cpp" + "${_ggml_vk_input_dir}/*.h") + add_custom_command( OUTPUT ${_ggml_vk_header} ${_ggml_vk_source} @@ -177,6 +186,7 @@ if (Vulkan_FOUND) --no-clean DEPENDS ${_ggml_vk_shader_files} + ${_ggml_vk_shaders_gen_sources} vulkan-shaders-gen COMMENT "Generate vulkan shaders" From 48fb51f3142559b6dbb42756cf843c185f8d19ce Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Fri, 27 Jun 2025 16:41:40 +0300 Subject: [PATCH 0028/1348] ggml : add ggml_set_rows (llama/14274) * ggml : add ggml_set_rows Add ggml_set_rows(a, b, c) which copies rows from 'b' into 'a' using indices from 'c'. ref: #8366 * use I64 for indices * ggml : add repeat impl for i64 * ggml : add ggml_is_contiguous_rows * ggml : ggml_set_rows support broadcast * ggml : ggml_set_rows support quantized dst ggml-ci * ggml : support GGML_TYPE_F32 ".from_float" trait * ggml : ggml_set_rows update comment + better index name * tests : add ggml_set_rows * metal : add ggml_set_rows implementation ggml-ci * ggml : simplify forward_dup_f32 * ggml : fix supports_op * tests : add comment to set_rows * ggml : leave the repeat_i64 for a separate PR ggml-ci * ggml : set_rows use std::min instead of MIN * ggml : better error message for set_rows unsupported type * metal : perform op->type check only once * tests : more consistent implementation + more tests ggml-ci --------- Co-authored-by: Georgi Gerganov --- ggml/include/ggml-cpu.h | 1 + ggml/include/ggml.h | 21 ++ ggml/src/ggml-cpu/ggml-cpu.c | 10 + ggml/src/ggml-cpu/ggml-cpu.cpp | 1 + ggml/src/ggml-cpu/ops.cpp | 96 ++++-- ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml-metal/ggml-metal-impl.h | 16 + ggml/src/ggml-metal/ggml-metal.m | 112 +++++- ggml/src/ggml-metal/ggml-metal.metal | 469 ++++++++++++++++---------- ggml/src/ggml.c | 41 ++- 10 files changed, 564 insertions(+), 204 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e3b79d09bb6..be40b100979 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -134,6 +134,7 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 4231a668eb0..042a13e055b 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -470,6 +470,7 @@ extern "C" { GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_GET_ROWS_BACK, + GGML_OP_SET_ROWS, GGML_OP_DIAG, GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_ZERO, @@ -687,6 +688,9 @@ extern "C" { // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); + // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements + GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -1375,6 +1379,23 @@ extern "C" { struct ggml_tensor * b, // row indices struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape + // a TD [n_embd, ne1, ne2, ne3] + // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3 + // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1) + // + // undefined behavior if destination rows overlap + // + // broadcast: + // ne2 % ne11 == 0 + // ne3 % ne12 == 0 + // + // return view(a) + GGML_API struct ggml_tensor * ggml_set_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, // destination + struct ggml_tensor * b, // source + struct ggml_tensor * c); // row indices + GGML_API struct ggml_tensor * ggml_diag( struct ggml_context * ctx, struct ggml_tensor * a); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7cae96f4b48..2042ee71f1f 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -195,6 +195,7 @@ typedef pthread_t ggml_thread_t; static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, .nrows = 1, @@ -1817,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_get_rows_back(params, tensor); } break; + case GGML_OP_SET_ROWS: + { + ggml_compute_forward_set_rows(params, tensor); + } break; case GGML_OP_DIAG: { ggml_compute_forward_diag(params, tensor); @@ -2170,6 +2175,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = n_threads; } break; case GGML_OP_GET_ROWS: + case GGML_OP_SET_ROWS: { // FIXME: get_rows can use additional threads, but the cost of launching additional threads // decreases performance with GPU offloading @@ -3124,6 +3130,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) { + memcpy(y, x, n * sizeof(float)); +} + void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { int64_t i = 0; #if defined(__F16C__) diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index a98866a2d80..c9daa4c39e8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st switch (op->op) { case GGML_OP_CPY: + case GGML_OP_SET_ROWS: return op->type != GGML_TYPE_IQ3_XXS && op->type != GGML_TYPE_IQ3_S && diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 525fd178760..7ead13f42a3 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -696,24 +696,8 @@ static void ggml_compute_forward_dup_f32( if (ggml_is_contiguous(dst)) { // TODO: simplify if (nb00 == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (ggml_get_type_traits_cpu(dst->type)->from_float) { - ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float; + if (ggml_get_type_traits_cpu(dst->type)->from_float) { + ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float; size_t id = 0; size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); @@ -724,7 +708,7 @@ static void ggml_compute_forward_dup_f32( id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - quantize_row_q(src0_ptr, dst_ptr + id, ne00); + from_float(src0_ptr, dst_ptr + id, ne00); id += rs; } id += rs * (ne01 - ir1); @@ -2300,6 +2284,12 @@ void ggml_compute_forward_repeat( { ggml_compute_forward_repeat_f32(params, dst); } break; + // TODO: templateify the implemenation and support for I64 + // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225 + //case GGML_TYPE_I64: + // { + // ggml_compute_forward_repeat_i64(params, dst); + // } break; default: { GGML_ABORT("fatal error"); @@ -4470,6 +4460,74 @@ void ggml_compute_forward_get_rows( //} } +static void ggml_compute_forward_set_rows_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ne01; + + assert(ne0 == nc); + assert(ne2 == ne02); + assert(ne3 == ne03); + assert(src0->type == GGML_TYPE_F32); + assert(ne02 % ne11 == 0); + assert(ne03 % ne12 == 0); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = std::min(ir0 + dr, nr); + + ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float; + + for (int64_t i03 = 0; i03 < ne03; ++i03) { + for (int64_t i02 = 0; i02 < ne02; ++i02) { + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i03%ne12; + const int64_t i11 = i02%ne11; + const int64_t i10 = i; + + const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + GGML_ASSERT(i1 >= 0 && i1 < ne1); + + from_float( + (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03), + ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc); + } + } + } +} + +void ggml_compute_forward_set_rows( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_set_rows_f32(params, dst); + } break; + default: + { + GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type)); + } + } +} + // ggml_compute_forward_get_rows_back static void ggml_compute_forward_get_rows_back_f32_f16( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 2d8544d7d3d..3a395fdcd9f 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -53,6 +53,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 17eab976f3a..260440aedde 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -521,6 +521,22 @@ typedef struct { uint64_t nb2; } ggml_metal_kargs_get_rows; +typedef struct { + int32_t nk0; + int32_t ne01; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + int32_t ne11; + int32_t ne12; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; +} ggml_metal_kargs_set_rows; + typedef struct { int64_t ne00; int64_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index d8d30cc0b41..349f0ff998e 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -202,6 +202,15 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, + GGML_METAL_KERNEL_TYPE_SET_ROWS_F32, + GGML_METAL_KERNEL_TYPE_SET_ROWS_F16, + GGML_METAL_KERNEL_TYPE_SET_ROWS_BF16, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q8_0, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_0, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_1, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, + GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_L2_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -1169,6 +1178,15 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_F32, set_rows_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_F16, set_rows_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_BF16, set_rows_bf16, use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q8_0, set_rows_q8_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_0, set_rows_q4_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_1, set_rows_q4_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0, set_rows_q5_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, set_rows_q5_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction); @@ -1635,6 +1653,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex const bool use_bfloat = ctx_dev->use_bfloat; if (!use_bfloat) { + if (op->type == GGML_TYPE_BF16) { + return false; + } + for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; @@ -1804,6 +1826,27 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex { return op->ne[3] == 1; } + case GGML_OP_SET_ROWS: + { + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_IQ4_NL: + return true; + default: + return false; + }; + } default: return false; } @@ -3777,13 +3820,74 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; + case GGML_OP_SET_ROWS: + { + id pipeline = nil; + + switch (dst->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_BF16 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q8_0 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_0 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_1 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL].pipeline; break; + default: GGML_ABORT("not implemented"); + } + + const int32_t nk0 = ne0/ggml_blck_size(dst->type); + + int nth = 32; // SIMD width + + while (nth < nk0 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + + int nrptg = 1; + if (nth > nk0) { + nrptg = (nth + nk0 - 1)/nk0; + nth = nk0; + + if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) { + nrptg--; + } + } + + nth = MIN(nth, nk0); + + ggml_metal_kargs_set_rows args = { + /*.nk0 =*/ nk0, + /*.ne01 =*/ ne01, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; + } break; case GGML_OP_RMS_NORM: { GGML_ASSERT(ne00 % 4 == 0); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 5f004a856bd..984a0ab503e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -35,6 +35,17 @@ constexpr constant static float kvalues_iq4nl_f[16] = { -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f }; +static inline int best_index_int8(int n, constant float * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + // NOTE: this is not dequantizing - we are simply fitting the template template void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { @@ -97,6 +108,173 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r } } +void quantize_q4_0(device const float * src, device block_q4_0 & dst) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst.qs[j] = xi0; + dst.qs[j] |= xi1 << 4; + } +} + +void quantize_q4_1(device const float * src, device block_q4_1 & dst) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + dst.m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst.qs[j] = xi0; + dst.qs[j] |= xi1 << 4; + } +} + +void quantize_q5_0(device const float * src, device block_q5_0 & dst) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK5_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK5_0/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + + thread const uint8_t * qh8 = (thread const uint8_t *)&qh; + + for (int j = 0; j < 4; ++j) { + dst.qh[j] = qh8[j]; + } +} + +void quantize_q5_1(device const float * src, device block_q5_1 & dst) { + float max = src[0]; + float min = src[0]; + + for (int j = 1; j < QK5_1; j++) { + const float v = src[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + dst.m = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK5_1/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); + } + + thread const uint8_t * qh8 = (thread const uint8_t *)&qh; + + for (int j = 0; j < 4; ++j) { + dst.qh[j] = qh8[j]; + } +} + +void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_NL; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / kvalues_iq4nl_f[0]; + const float id = d ? 1.0f/d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_NL/2 + j]*id; + + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1); + + dst.qs[j] = xi0 | (xi1 << 4); + + const float v0 = kvalues_iq4nl_f[xi0]; + const float v1 = kvalues_iq4nl_f[xi1]; + const float w0 = src[0 + j]*src[0 + j]; + const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j]; + sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j]; + sumq2 += w0*v0*v0 + w1*v1*v1; + + } + + dst.d = sumq2 > 0 ? sumqx/sumq2 : d; +} + template void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) { device const uint16_t * qs = ((device const uint16_t *)xb + 2); @@ -279,6 +457,26 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re } } +void quantize_q8_0(device const float * src, device block_q8_0 & dst) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst.qs[j] = round(x0); + } +} + template void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { const float d = xb->d; @@ -4410,6 +4608,7 @@ template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy; #endif +// TODO: templetify these kernels kernel void kernel_cpy_f32_q8_0( constant ggml_metal_kargs_cpy & args, device const char * src0, @@ -4433,23 +4632,7 @@ kernel void kernel_cpy_f32_q8_0( for (int64_t i00 = tpitg.x*QK8_0; i00 < args.ne00; i00 += ntg.x*QK8_0) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = src[j]; - amax = MAX(amax, fabs(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK8_0].d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = src[j]*id; - - dst_data[i00/QK8_0].qs[j] = round(x0); - } + quantize_q8_0(src, dst_data[i00/QK8_0]); } } @@ -4476,32 +4659,7 @@ kernel void kernel_cpy_f32_q4_0( for (int64_t i00 = tpitg.x*QK4_0; i00 < args.ne00; i00 += ntg.x*QK4_0) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK4_0; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK4_0].d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK4_0/2 + j]*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - dst_data[i00/QK4_0].qs[j] = xi0; - dst_data[i00/QK4_0].qs[j] |= xi1 << 4; - } + quantize_q4_0(src, dst_data[i00/QK4_0]); } } @@ -4528,31 +4686,7 @@ kernel void kernel_cpy_f32_q4_1( for (int64_t i00 = tpitg.x*QK4_1; i00 < args.ne00; i00 += ntg.x*QK4_1) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < QK4_1; j++) { - const float v = src[j]; - if (min > v) min = v; - if (max < v) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK4_1].d = d; - dst_data[i00/QK4_1].m = min; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (src[0 + j] - min)*id; - const float x1 = (src[QK4_1/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - dst_data[i00/QK4_1].qs[j] = xi0; - dst_data[i00/QK4_1].qs[j] |= xi1 << 4; - } + quantize_q4_1(src, dst_data[i00/QK4_1]); } } @@ -4579,38 +4713,7 @@ kernel void kernel_cpy_f32_q5_0( for (int64_t i00 = tpitg.x*QK5_0; i00 < args.ne00; i00 += ntg.x*QK5_0) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK5_0; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / -16; - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK5_0].d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK5_0/2 + j]*id; - - const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - - dst_data[i00/QK5_0].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); - } - thread const uint8_t * qh8 = (thread const uint8_t *)&qh; - for (int j = 0; j < 4; ++j) { - dst_data[i00/QK5_0].qh[j] = qh8[j]; - } + quantize_q5_0(src, dst_data[i00/QK5_0]); } } @@ -4637,49 +4740,8 @@ kernel void kernel_cpy_f32_q5_1( for (int64_t i00 = tpitg.x*QK5_1; i00 < args.ne00; i00 += ntg.x*QK5_1) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float max = src[0]; - float min = src[0]; - - for (int j = 1; j < QK5_1; j++) { - const float v = src[j]; - min = v < min ? v : min; - max = v > max ? v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK5_1].d = d; - dst_data[i00/QK5_1].m = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1/2; ++j) { - const float x0 = (src[0 + j] - min)*id; - const float x1 = (src[QK5_1/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - dst_data[i00/QK5_1].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); - } - thread const uint8_t * qh8 = (thread const uint8_t *)&qh; - for (int j = 0; j < 4; ++j) { - dst_data[i00/QK5_1].qh[j] = qh8[j]; - } - } -} - -static inline int best_index_int8(int n, constant float * val, float x) { - if (x <= val[0]) return 0; - if (x >= val[n-1]) return n-1; - int ml = 0, mu = n-1; - while (mu-ml > 1) { - int mav = (ml+mu)/2; - if (x < val[mav]) mu = mav; else ml = mav; + quantize_q5_1(src, dst_data[i00/QK5_1]); } - return x - val[mu-1] < val[mu] - x ? mu-1 : mu; } kernel void kernel_cpy_f32_iq4_nl( @@ -4705,40 +4767,7 @@ kernel void kernel_cpy_f32_iq4_nl( for (int64_t i00 = tpitg.x*QK4_NL; i00 < args.ne00; i00 += ntg.x*QK4_NL) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK4_NL; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / kvalues_iq4nl_f[0]; - const float id = d ? 1.0f/d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK4_NL/2 + j]*id; - - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1); - - dst_data[i00/QK4_NL].qs[j] = xi0 | (xi1 << 4); - - const float v0 = kvalues_iq4nl_f[xi0]; - const float v1 = kvalues_iq4nl_f[xi1]; - const float w0 = src[0 + j]*src[0 + j]; - const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j]; - sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j]; - sumq2 += w0*v0*v0 + w1*v1*v1; - - } - - dst_data[i00/QK4_NL].d = sumq2 > 0 ? sumqx/sumq2 : d; + quantize_iq4_nl(src, dst_data[i00/QK4_NL]); } } @@ -6419,10 +6448,10 @@ kernel void kernel_mul_mv_iq4_xs_f32( template kernel void kernel_get_rows_q( + constant ggml_metal_kargs_get_rows & args, device const void * src0, device const void * src1, device float * dst, - constant ggml_metal_kargs_get_rows & args, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint3 tptg [[threads_per_threadgroup]]) { @@ -6442,10 +6471,10 @@ kernel void kernel_get_rows_q( template kernel void kernel_get_rows_f( + constant ggml_metal_kargs_get_rows & args, device const void * src0, device const void * src1, device float * dst, - constant ggml_metal_kargs_get_rows & args, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint3 tptg [[threads_per_threadgroup]]) { @@ -6463,10 +6492,10 @@ kernel void kernel_get_rows_f( } kernel void kernel_get_rows_i32( + constant ggml_metal_kargs_get_rows & args, device const void * src0, device const void * src1, device int32_t * dst, - constant ggml_metal_kargs_get_rows & args, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint3 tptg [[threads_per_threadgroup]]) { @@ -6483,6 +6512,67 @@ kernel void kernel_get_rows_i32( } } +template +kernel void kernel_set_rows_q32( + constant ggml_metal_kargs_set_rows & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + + const int32_t i12 = i03%args.ne12; + const int32_t i11 = i02%args.ne11; + + const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; + if (i01 >= args.ne01) { + return; + } + + const int32_t i10 = i01; + const int64_t i1 = ((const device int64_t *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; + + device block_q * dst_row = ( device block_q *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); + const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + + for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { + quantize_func(src_row + 32*ind, dst_row[ind]); + } +} + +template +kernel void kernel_set_rows_f( + constant ggml_metal_kargs_set_rows & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + + const int32_t i12 = i03%args.ne12; + const int32_t i11 = i02%args.ne11; + + const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; + if (i01 >= args.ne01) { + return; + } + + const int32_t i10 = i01; + const int64_t i1 = ((const device int64_t *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; + + device T * dst_row = ( device T *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); + const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + + for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { + dst_row[ind] = (T) src_row[ind]; + } +} #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B @@ -6906,6 +6996,27 @@ template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_q_t kernel_get template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get_rows_q; +// +// set rows +// + +typedef decltype(kernel_set_rows_f) set_rows_f_t; + +template [[host_name("kernel_set_rows_f32")]] kernel set_rows_f_t kernel_set_rows_f; +template [[host_name("kernel_set_rows_f16")]] kernel set_rows_f_t kernel_set_rows_f; +#if defined(GGML_METAL_USE_BF16) +template [[host_name("kernel_set_rows_bf16")]] kernel set_rows_f_t kernel_set_rows_f; +#endif + +typedef decltype(kernel_set_rows_q32) set_rows_q32_t; + +template [[host_name("kernel_set_rows_q8_0")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_0")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_1")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_0")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_1")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_iq4_nl")]] kernel set_rows_q32_t kernel_set_rows_q32; + // // matrix-matrix multiplication // diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c963b338a18..be8a9c4488b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -933,6 +933,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "TRANSPOSE", "GET_ROWS", "GET_ROWS_BACK", + "SET_ROWS", "DIAG", "DIAG_MASK_INF", "DIAG_MASK_ZERO", @@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1029,6 +1030,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "transpose(x)", "get_rows(x)", "get_rows_back(x)", + "set_rows(x)", "diag(x)", "diag_mask_inf(x)", "diag_mask_zero(x)", @@ -1079,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1348,6 +1350,12 @@ bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) { tensor->nb[2] == ggml_type_size(tensor->type); } +bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) { + return + tensor->ne[0] == ggml_blck_size(tensor->type) || + tensor->nb[0] == ggml_type_size(tensor->type); +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -3384,6 +3392,35 @@ struct ggml_tensor * ggml_get_rows_back( return result; } +// ggml_set_rows + +struct ggml_tensor * ggml_set_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(a->ne[0] == b->ne[0]); + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->ne[3] == b->ne[3]); + GGML_ASSERT(b->ne[1] == c->ne[0]); + GGML_ASSERT(b->ne[2] % c->ne[1] == 0); + GGML_ASSERT(b->ne[3] % c->ne[2] == 0); + GGML_ASSERT(c->ne[3] == 1); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(c->type == GGML_TYPE_I64); + + GGML_ASSERT(ggml_is_contiguous_rows(a)); + GGML_ASSERT(ggml_is_contiguous_rows(b)); + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + result->op = GGML_OP_SET_ROWS; + result->src[0] = b; + result->src[1] = c; + + return result; +} + // ggml_diag struct ggml_tensor * ggml_diag( From 0b09f5bbad8c6f8467bf8fb6a94ee413ff0001b2 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 27 Jun 2025 22:35:30 -0500 Subject: [PATCH 0029/1348] vulkan: Fix GGML_VULKAN_SHADER_DEBUG_INFO (llama/14427) This setting needs to be passed through to vulkan-shaders-gen --- ggml/src/ggml-vulkan/CMakeLists.txt | 1 + ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 0bf4cb14f88..b97e7bf9955 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -99,6 +99,7 @@ if (Vulkan_FOUND) if (GGML_VULKAN_SHADER_DEBUG_INFO) add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO) + list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DGGML_VULKAN_SHADER_DEBUG_INFO=ON) endif() if (GGML_VULKAN_VALIDATE) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 14e9daaa01a..e1f613fb4f6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -19,6 +19,10 @@ if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) message(STATUS "Enabling bfloat16 glslc support") endif() +if (GGML_VULKAN_SHADER_DEBUG_INFO) + add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO) + message(STATUS "Enabling shader debug info") +endif() set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) From f31ed384f4be6a6114b5d6158375b2ecbfdda3ed Mon Sep 17 00:00:00 2001 From: Xinpeng Dou <15529241576@163.com> Date: Sat, 28 Jun 2025 17:35:41 +0800 Subject: [PATCH 0030/1348] fix async_mode bug (llama/14432) --- ggml/src/ggml-cann/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index ba2cef0c25f..8dfe3b061c1 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -359,7 +359,7 @@ struct ggml_backend_cann_context { ggml_cann_set_device(device); description = aclrtGetSocName(); - bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")); + async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")); GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, device, async_mode ? "ON" : "OFF"); } From 9e48afba2f0d0069966e8d6049fae84450741eec Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 28 Jun 2025 10:17:09 -0500 Subject: [PATCH 0031/1348] vulkan: lock accesses of pinned_memory vector (llama/14333) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 99be5e45b2a..e42f115d040 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -305,7 +305,7 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& } struct vk_device_struct { - std::mutex mutex; + std::recursive_mutex mutex; vk::PhysicalDevice physical_device; vk::PhysicalDeviceProperties properties; @@ -1197,7 +1197,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin } { - std::lock_guard guard(device->mutex); + std::lock_guard guard(device->mutex); device->pipelines.insert({ pipeline->name, pipeline }); } @@ -1411,7 +1411,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector guard(device->mutex); + std::lock_guard guard(device->mutex); q.queue_family_index = queue_family_index; q.transfer_only = transfer_only; @@ -4124,6 +4124,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) { return nullptr; } + std::lock_guard guard(device->mutex); device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); return buf->ptr; @@ -4134,6 +4135,8 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { return; } VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")"); + std::lock_guard guard(device->mutex); + vk_buffer buf; size_t index; for (size_t i = 0; i < device->pinned_memory.size(); i++) { @@ -4156,6 +4159,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { } static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { + std::lock_guard guard(device->mutex); buf = nullptr; buf_offset = 0; for (size_t i = 0; i < device->pinned_memory.size(); i++) { @@ -4457,7 +4461,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - std::lock_guard guard(dst->device->mutex); + std::lock_guard guard(dst->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); @@ -4548,7 +4552,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - std::lock_guard guard(src->device->mutex); + std::lock_guard guard(src->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); @@ -4578,7 +4582,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { if (src->device == dst->device) { - std::lock_guard guard(src->device->mutex); + std::lock_guard guard(src->device->mutex); VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); @@ -4613,7 +4617,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - std::lock_guard guard(dst->device->mutex); + std::lock_guard guard(dst->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); From f641a4c410832a29f4389043cf932811de8e377a Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 28 Jun 2025 10:36:40 -0500 Subject: [PATCH 0032/1348] vulkan: handle noncontig in the final case of ggml_vk_get_cpy_pipeline (llama/14378) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e42f115d040..996ccbf66b0 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4844,9 +4844,17 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const // type size must be exactly 2 or 4. GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); if ((ggml_type_size(src->type) % 4) == 0) { - return ctx->device->pipeline_contig_cpy_f32_f32; + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_cpy_f32_f32; + } } else { - return ctx->device->pipeline_contig_cpy_f16_f16; + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f16; + } else { + return ctx->device->pipeline_cpy_f16_f16; + } } } @@ -4907,7 +4915,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT const uint64_t ne00 = src0->ne[0]; @@ -5135,7 +5143,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT const uint64_t ne00 = src0->ne[0]; @@ -5736,7 +5744,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); From b900ee424c67a14fd3f650553e6445b83186e39b Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 29 Jun 2025 01:30:53 +0800 Subject: [PATCH 0033/1348] CUDA: add bf16 and f32 support to cublas_mul_mat_batched (llama/14361) * CUDA: add bf16 and f32 support to cublas_mul_mat_batched * Review: add type traits and make function more generic * Review: make check more explicit, add back comments, and fix formatting * Review: fix formatting, remove useless type conversion, fix naming for bools --- ggml/src/ggml-cuda/convert.cu | 22 ++++ ggml/src/ggml-cuda/convert.cuh | 5 + ggml/src/ggml-cuda/ggml-cuda.cu | 207 ++++++++++++++++++++------------ 3 files changed, 158 insertions(+), 76 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index c6dec4276b3..eeaa14bf579 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -728,3 +728,25 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { return nullptr; } } + +to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return convert_unary_cuda; + case GGML_TYPE_F16: + return convert_unary_cuda; + default: + return nullptr; + } +} + +to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_F16: + return convert_unary_cuda; + case GGML_TYPE_BF16: + return convert_unary_cuda; + default: + return nullptr; + } +} diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index b65b98e08e7..f04214be175 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -22,5 +22,10 @@ using to_t_nc_cuda_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream); +typedef to_t_nc_cuda_t to_fp32_nc_cuda_t; typedef to_t_nc_cuda_t to_fp16_nc_cuda_t; +typedef to_t_nc_cuda_t to_bf16_nc_cuda_t; + +to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type); to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type); +to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b30c13c62f2..811422f3850 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1749,7 +1749,7 @@ static void ggml_cuda_op_mul_mat( } static __global__ void k_compute_batched_ptrs( - const half * src0_as_f16, const half * src1_as_f16, char * dst, + const void * src0_as_f16, const void * src1_as_f16, char * dst, const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23, @@ -1772,83 +1772,131 @@ static __global__ void k_compute_batched_ptrs( ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +// Type traits for mapping ggml types to CUDA/cuBLAS types +template +struct batched_mul_mat_traits; + +template<> +struct batched_mul_mat_traits { + using cuda_type = float; + static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + static inline const cudaDataType_t data_type = CUDA_R_32F; + static inline const ggml_type ggml_type_val = GGML_TYPE_F32; + static inline const float alpha = 1.0f; + static inline const float beta = 0.0f; + static inline const void* get_alpha() { static const float val = alpha; return &val; } + static inline const void* get_beta() { static const float val = beta; return &val; } + static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); } +}; + +template<> +struct batched_mul_mat_traits { + using cuda_type = nv_bfloat16; + static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + static inline const cudaDataType_t data_type = CUDA_R_16BF; + static inline const ggml_type ggml_type_val = GGML_TYPE_BF16; + static inline const float alpha = 1.0f; + static inline const float beta = 0.0f; + static inline const void* get_alpha() { static const float val = alpha; return &val; } + static inline const void* get_beta() { static const float val = beta; return &val; } + static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); } +}; + +template<> +struct batched_mul_mat_traits { + using cuda_type = half; + static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + static inline const cudaDataType_t data_type = CUDA_R_16F; + static inline const ggml_type ggml_type_val = GGML_TYPE_F16; + static inline const half alpha = 1.0; + static inline const half beta = 0.0; + static inline const void* get_alpha() { static const half val = alpha; return &val; } + static inline const void* get_beta() { static const half val = beta; return &val; } + static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); } +}; + +template +static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + using traits = batched_mul_mat_traits; + using cuda_t = typename traits::cuda_type; + GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft)); - GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == src0_type); + GGML_ASSERT(ggml_is_contiguous(dst)); // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. // As long as dst is contiguous this does not matter though. - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_TENSOR_BINARY_OP_LOCALS const int64_t ne_dst = ggml_nelements(dst); - cudaStream_t main_stream = ctx.stream(); - CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - const half * src0_f16 = (const half *) src0->data; float * dst_ddf = (float *) dst->data; - - const half * src1_f16 = (const half *) src1->data; const size_t ts_src1 = ggml_type_size(src1->type); GGML_ASSERT(nb10 == ts_src1); int64_t s11 = nb11 / ts_src1; int64_t s12 = nb12 / ts_src1; int64_t s13 = nb13 / ts_src1; - ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); - // convert src1 to fp16 - if (src1->type != GGML_TYPE_F16) { - const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type); - const int64_t ne_src1 = ggml_nelements(src1); - src1_f16_alloc.alloc(ne_src1); - GGML_ASSERT(to_fp16_cuda != nullptr); + const cuda_t * src0_ptr = nullptr; + const cuda_t * src1_ptr = nullptr; - to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + ggml_cuda_pool_alloc src0_alloc(ctx.pool()); + ggml_cuda_pool_alloc src1_alloc(ctx.pool()); + + // Handle src0 + src0_ptr = (const cuda_t *) src0->data; + + // Handle src1 - convert if necessary + if (src1->type == src0_type) { + src1_ptr = (const cuda_t *) src1->data; + } else { + // Convert src1 to target type using traits conversion functions + const int64_t ne_src1 = ggml_nelements(src1); + src1_alloc.alloc(ne_src1); - src1_f16 = src1_f16_alloc.get(); + const auto convert_func = traits::get_nc_converter(src1->type); + GGML_ASSERT(convert_func != nullptr); + convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + src1_ptr = src1_alloc.get(); s11 = ne10; s12 = ne11*s11; s13 = ne12*s12; } - ggml_cuda_pool_alloc dst_f16(ctx.pool()); + // Setup destination buffer + ggml_cuda_pool_alloc dst_temp(ctx.pool()); char * dst_t; - - cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; - cudaDataType_t cu_data_type = CUDA_R_16F; - - // dst strides size_t nbd2 = dst->nb[2]; size_t nbd3 = dst->nb[3]; - const half alpha_f16 = 1.0f; - const half beta_f16 = 0.0f; - + cublasComputeType_t cu_compute_type = traits::compute_type; + cudaDataType_t cu_data_type = traits::data_type; + cudaDataType_t cu_data_type_a = traits::data_type; + cudaDataType_t cu_data_type_b = traits::data_type; + const void * alpha = traits::get_alpha(); + const void * beta = traits::get_beta(); const float alpha_f32 = 1.0f; - const float beta_f32 = 0.0f; - - const void * alpha = &alpha_f16; - const void * beta = &beta_f16; + const float beta_f32 = 0.0f; if (dst->op_params[0] == GGML_PREC_DEFAULT) { - dst_t = (char *) dst_f16.alloc(ne_dst); - - nbd2 /= sizeof(float) / sizeof(half); - nbd3 /= sizeof(float) / sizeof(half); + if constexpr (src0_type == GGML_TYPE_F32) { + dst_t = (char *) dst_ddf; // Direct F32 output + } else { + dst_t = (char *) dst_temp.alloc(ne_dst); + nbd2 /= sizeof(float) / sizeof(cuda_t); + nbd3 /= sizeof(float) / sizeof(cuda_t); + } } else { dst_t = (char *) dst_ddf; - cu_compute_type = CUBLAS_COMPUTE_32F; - cu_data_type = CUDA_R_32F; - + cu_data_type = CUDA_R_32F; alpha = &alpha_f32; - beta = &beta_f32; + beta = &beta_f32; } int id = ggml_cuda_get_device(); @@ -1856,7 +1904,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { cu_compute_type = CUBLAS_COMPUTE_32F; alpha = &alpha_f32; - beta = &beta_f32; + beta = &beta_f32; } GGML_ASSERT(ne12 % ne02 == 0); @@ -1866,35 +1914,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; -#if 0 - // use cublasGemmEx - { - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - int i03 = i13 / r3; - int i02 = i12 / r2; - - CUBLAS_CHECK( - cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half), - src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11, - beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0, - cu_compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } - } -#else if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 // use cublasGemmStridedBatchedEx CUBLAS_CHECK( cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA - src1_f16, CUDA_R_16F, s11, s12, // strideB - beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC + alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA + src1_ptr, cu_data_type_b, s11, s12, // strideB + beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC ne12*ne13, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1905,34 +1933,55 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ggml_cuda_pool_alloc ptrs_src(ctx.pool(), 2*ne23); ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); + size_t src1_stride_size = sizeof(cuda_t); + dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( - src0_f16, src1_f16, dst_t, + src0_ptr, src1_ptr, dst_t, ptrs_src.get(), ptrs_dst.get(), ne12, ne13, ne23, nb02, nb03, - src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half), - src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half), + (src1->type == src0_type) ? nb12 : s12*src1_stride_size, + (src1->type == src0_type) ? nb13 : s13*src1_stride_size, nbd2, nbd3, r2, r3); + CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, - (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11, - beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, + alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00, + (const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11, + beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, ne23, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } -#endif - if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) { - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream); + // Convert output back to F32 if needed + if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val); + to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream); + } +} + +static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32); + + switch (src0->type) { + case GGML_TYPE_F32: + ggml_cuda_mul_mat_batched_cublas_impl(ctx, src0, src1, dst); + break; + case GGML_TYPE_BF16: + ggml_cuda_mul_mat_batched_cublas_impl(ctx, src0, src1, dst); + break; + case GGML_TYPE_F16: + ggml_cuda_mul_mat_batched_cublas_impl(ctx, src0, src1, dst); + break; + default: + GGML_ABORT("Unsupported type"); } } @@ -1984,6 +2033,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + //TODO update for generic tensor parallelism + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16); + bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc); + bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32; + if (!split && use_mul_mat_vec) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) @@ -1992,8 +2047,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); } else if (!split && use_mul_mat_q) { ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && - !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32) + && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec) { From 00b36237ba7b7569995bff1ced92f0f796ed8b5c Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 29 Jun 2025 02:43:36 -0500 Subject: [PATCH 0034/1348] vulkan: Add fusion support for RMS_NORM+MUL (llama/14366) * vulkan: Add fusion support for RMS_NORM+MUL - Add a use_count to ggml_tensor, so we can detect if an output is used more than once. - Change the ggml-vulkan rms_norm shader to optionally multiply by another tensor. - Add detection logic and basic fusion logic in ggml-vulkan. - Add some testing support for fusion. Rather than computing one node at a time, allow for computing the whole graph and just testing one node's results. Add rms_norm_mul tests and enable a llama test. * extract some common fusion logic * fix -Winconsistent-missing-override * move ggml_can_fuse to a common function * build fix * C and C++ versions of can_fuse * move use count to the graph to avoid data races and double increments when used in multiple threads * use hash table lookup to find node index * change use_counts to be indexed by hash table slot * minimize hash lookups style fixes * last node doesn't need single use. fix type. handle mul operands being swapped. * remove redundant parameter --------- Co-authored-by: slaren --- ggml/include/ggml-backend.h | 2 +- ggml/src/ggml-backend.cpp | 54 ++++++++++------ ggml/src/ggml-impl.h | 64 +++++++++++++++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 58 ++++++++++++----- .../ggml-vulkan/vulkan-shaders/rms_norm.comp | 15 ++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 +- ggml/src/ggml.c | 46 +++++++++---- 7 files changed, 190 insertions(+), 51 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 778927f6821..a2977ea2e56 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -339,7 +339,7 @@ extern "C" { typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends - GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node); // Tensor initialization GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b1050ad59c2..788861a365f 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -817,8 +817,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } if (sched->debug > 1) { ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name, + fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node), + graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1826,7 +1827,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); if (copy.buffer == NULL) { return false; @@ -1837,28 +1838,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t assert(g1->n_nodes == g2->n_nodes); - for (int i = 0; i < g1->n_nodes; i++) { - struct ggml_tensor * t1 = g1->nodes[i]; - struct ggml_tensor * t2 = g2->nodes[i]; + if (test_node != nullptr) { + // Compute the whole graph and only test the output for a specific tensor + ggml_backend_graph_compute(backend1, g1); + ggml_backend_graph_compute(backend2, g2); - assert(t1->op == t2->op && ggml_are_same_layout(t1, t2)); + int test_node_idx = -1; + for (int i = 0; i < g1->n_nodes; i++) { + struct ggml_tensor * t1 = g1->nodes[i]; + if (t1 == test_node) { + test_node_idx = i; + break; + } + } + GGML_ASSERT(test_node_idx != -1); - struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); - struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); + callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data); + } else { + for (int i = 0; i < g1->n_nodes; i++) { + struct ggml_tensor * t1 = g1->nodes[i]; + struct ggml_tensor * t2 = g2->nodes[i]; - ggml_backend_graph_compute(backend1, &g1v); - ggml_backend_graph_compute(backend2, &g2v); + assert(t1->op == t2->op && ggml_are_same_layout(t1, t2)); - if (ggml_is_view_op(t1->op)) { - continue; - } + struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); + struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - // compare results, calculate rms etc - if (!callback(i, t1, t2, user_data)) { - break; + ggml_backend_graph_compute(backend1, &g1v); + ggml_backend_graph_compute(backend2, &g2v); + + if (ggml_is_view_op(t1->op)) { + continue; + } + + // compare results, calculate rms etc + if (!callback(i, t1, t2, user_data)) { + break; + } } } - ggml_backend_graph_copy_free(copy); return true; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 57761644f43..4972558c98b 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -301,6 +301,7 @@ struct ggml_cgraph { struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes struct ggml_tensor ** grad_accs; // accumulators for node gradients struct ggml_tensor ** leafs; // tensors with constant data + int32_t * use_counts;// number of uses of each tensor, indexed by hash table slot struct ggml_hash_set visited_hash_set; @@ -467,13 +468,76 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) +// return true if the node's results are only used by N other nodes +// and can be fused into their calculations. +static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) { + const struct ggml_tensor * node = cgraph->nodes[node_idx]; + + // check the use count against how many we're replacing + size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node); + if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) { + return false; + } + + // if node is a view, some other node might be using the intermediate result + // via the view source. + if (node->view_src) { + return false; + } + + // If the user requested output for the node, can't fuse + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { + return false; + } + + return true; +} + +// Returns true if nodes [i, i+ops.size()) are the sequence of ggml_ops in ops[] +// and are fusable. Nodes are considered fusable according to this function if: +// - all nodes except the last have only one use and are not views/outputs (see ggml_node_has_N_uses). +// - all nodes except the last are a src of the following node. +// - all nodes are the same shape. +// TODO: Consider allowing GGML_OP_NONE nodes in between +static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, const enum ggml_op * ops, int num_ops) { + if (node_idx + num_ops > cgraph->n_nodes) { + return false; + } + + for (int i = 0; i < num_ops; ++i) { + struct ggml_tensor * node = cgraph->nodes[node_idx + i]; + if (node->op != ops[i]) { + return false; + } + if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idx + i, 1)) { + return false; + } + if (i > 0) { + struct ggml_tensor * prev = cgraph->nodes[node_idx + i - 1]; + if (node->src[0] != prev && node->src[1] != prev) { + return false; + } + if (!ggml_are_same_shape(node, prev)) { + return false; + } + } + } + return true; +} + #ifdef __cplusplus } #endif #ifdef __cplusplus +#include #include +// nicer C++ syntax for ggml_can_fuse +inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size()); +} + // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 996ccbf66b0..aebcc03915f 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -425,6 +425,7 @@ struct vk_device_struct { vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_rms_norm_mul_f32; vk_pipeline pipeline_rms_norm_back_f32; vk_pipeline pipeline_l2_norm_f32; @@ -978,6 +979,10 @@ struct ggml_backend_vk_context { vk_command_pool compute_cmd_pool; vk_command_pool transfer_cmd_pool; + + // number of additional consecutive nodes that are being fused with the + // node currently being processed + uint32_t num_additional_fused_ops {}; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -2655,7 +2660,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -6430,7 +6436,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rms_norm_f32; + return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; } return nullptr; case GGML_OP_RMS_NORM_BACK: @@ -7530,18 +7536,19 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - op_params[0], 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + op_params[0], 0.0f, 0, }, dryrun); } @@ -8736,7 +8743,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. -static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){ +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){ + ggml_tensor * node = cgraph->nodes[node_idx]; if (ggml_is_empty(node) || !node->buffer) { return false; } @@ -8974,8 +8982,14 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun); - + if (ctx->num_additional_fused_ops > 0) { + // fused rms_norm + mul + ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0]; + ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, dryrun); + } else { + ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, dryrun); + } break; case GGML_OP_RMS_NORM_BACK: ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun); @@ -9710,10 +9724,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false); + if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } + ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } + i += ctx->num_additional_fused_ops; + ctx->num_additional_fused_ops = 0; } if (ctx->device->need_compiles) { ggml_vk_load_shaders(ctx->device); @@ -9775,14 +9794,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } + if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } + // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; bool submit = (submitted_nodes >= nodes_per_submit) || (mul_mat_bytes >= mul_mat_bytes_per_submit) || - (i == last_node) || + (i + ctx->num_additional_fused_ops == last_node) || (almost_ready && !ctx->almost_ready_fence_pending); - bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit); + bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops == last_node, almost_ready, submit); if (vk_perf_logger_enabled) { if (ctx->compute_ctx.expired()) { @@ -9792,7 +9815,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } else { compute_ctx = ctx->compute_ctx.lock(); } - compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1); + // If there are fused ops, just write out timestamps for all nodes to keep the accounting simple + for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) { + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1); + } } if (enqueued) { @@ -9814,6 +9840,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } submit_count++; } + i += ctx->num_additional_fused_ops; + ctx->num_additional_fused_ops = 0; } if (vk_perf_logger_enabled) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp index deb8ee9960f..6428ca7ba33 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp @@ -1,11 +1,13 @@ #version 450 -#include "generic_unary_head.comp" +#include "generic_binary_head.comp" #include "types.comp" #extension GL_EXT_control_flow_attributes : enable #define BLOCK_SIZE 512 +layout (constant_id = 1) const bool do_multiply = false; + layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; shared FLOAT_TYPE sum[BLOCK_SIZE]; @@ -25,6 +27,7 @@ void main() { const uint stride_sample = p.nb03; uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); + uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp @@ -46,7 +49,13 @@ void main() { const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols); const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { - data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); + if (do_multiply) { + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); + } + } else { + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); + } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index c63345ec8b4..a207b98c60e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -497,7 +497,7 @@ void process_shaders() { // Norms string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); - string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index be8a9c4488b..e70d9d57b3b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5850,19 +5850,32 @@ static void ggml_compute_backward( GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2])); } -static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { +static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { // check if already visited - if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) { - return; + size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL); + if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) { + // This is the first time we see this node in the current graph. + cgraph->visited_hash_set.keys[node_hash_pos] = node; + ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos); + cgraph->use_counts[node_hash_pos] = 0; + } else { + // already visited + return node_hash_pos; } for (int i = 0; i < GGML_MAX_SRC; ++i) { const int k = (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) : - /* unknown order, just fall back to using i*/ i; - if (node->src[k]) { - ggml_visit_parents(cgraph, node->src[k]); + /* unknown order, just fall back to using i */ i; + + struct ggml_tensor * src = node->src[k]; + if (src) { + size_t src_hash_pos = ggml_visit_parents(cgraph, src); + + // Update the use count for this operand. + cgraph->use_counts[src_hash_pos]++; } } @@ -5886,6 +5899,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * cgraph->nodes[cgraph->n_nodes] = node; cgraph->n_nodes++; } + + return node_hash_pos; } static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { @@ -6023,6 +6038,7 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) { incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1); incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs + incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); // use_counts incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys if (grads) { incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads @@ -6052,11 +6068,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz void * p = cgraph + 1; - struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); - struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); - struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); - struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; - struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + int32_t * use_counts_ptr = incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); + struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); @@ -6071,6 +6088,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.grads =*/ grads_ptr, /*.grad_accs =*/ grad_accs_ptr, /*.leafs =*/ leafs_ptr, + /*.use_counts =*/ use_counts_ptr, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, }; @@ -6097,7 +6115,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.grads =*/ NULL, // gradients would need visited_hash_set /*.grad_accs =*/ NULL, /*.leafs =*/ NULL, - /*.visited_hash_set =*/ { 0, NULL, NULL }, + /*.use_counts =*/ cgraph0->use_counts, + /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, }; @@ -6124,7 +6143,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { for (size_t i = 0; i < src->visited_hash_set.size; ++i) { // copy all hashset keys (tensors) that are in use if (ggml_bitset_get(src->visited_hash_set.used, i)) { - ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); + size_t new_hash_pos = ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); + dst->use_counts[new_hash_pos] = src->use_counts[i]; } } From 8bc638ee5644411d84058edff1cda4b0abe0b2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 29 Jun 2025 11:04:10 +0200 Subject: [PATCH 0035/1348] ggml : implement REGLU/GEGLU/SWIGLU ops (llama/14158) * implement unary REGLU/GEGLU/SWIGLU cpu ops * relax constraints * duplicate shape of source * fix ggml_vec_geglu_f16 * special case gated ops * implement unary REGLU/GEGLU/SWIGLU cuda ops * tighten constraints again * refactor into GGML_GLU_OP * metal : add glu kernels ggml-ci * add CUDA_GLU_BLOCK_SIZE [no ci] * more constraints and use 64bit ints ggml-ci * 64bit multiplication [no ci] * implement swapped variants (cpu/cuda) * update comment [no ci] ggml-ci * Vulkan: Add GLU ops and shaders * SYCL: Implement fused kernel GEGLU, SWIGLU and REGLU for single up+gate * ggml : implement GLU for split up/gate (llama/14181) * implement GLU for split up/gate * add tests for ggml_glu_split * Vulkan: Implement glu_split logic and shader support * add split to logging [no ci] * SYCL: refactor element_size ops and add split up and gate support to gated kernels * SYCL: switch GEGLU to use tanh approximation --------- Co-authored-by: 0cc4m Co-authored-by: Akarshan * GGML: increase OP count in assertion * Refactor: Optimize SYCL element-wise operations with unary function inlining This commit refactors the SYCL element-wise operations to improve performance by: - Inlining unary operations (sgn, abs, elu, gelu, silu, etc.) to reduce kernel launch overhead. - Introducing helper functions `op_xxx` for each unary operation to encapsulate the logic. - Replacing direct kernel calls with calls to these inlined functions. - Using `__dpct_inline__` to encourage compiler inlining. - Minor code cleanup and consistency improvements. The changes aim to reduce kernel launch overhead and improve the overall efficiency of element-wise operations on SYCL devices. * vulkan: Increase workgroup size for GLU, for performance (llama/14345) * vulkan: Increase workgroup size for GLU, for performance * vulkan: change GLU shaders to do one element per invocation rather than one row per workgroup * merge fix * metal : add support for split and swap ggml-ci --------- Co-authored-by: Georgi Gerganov Co-authored-by: 0cc4m Co-authored-by: Akarshan Co-authored-by: Jeff Bolz --- ggml/include/ggml.h | 69 + ggml/src/ggml-cpu/ggml-cpu.c | 16 + ggml/src/ggml-cpu/ops.cpp | 457 +++++ ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml-cpu/vec.cpp | 24 + ggml/src/ggml-cpu/vec.h | 54 + ggml/src/ggml-cuda/ggml-cuda.cu | 25 + ggml/src/ggml-cuda/unary.cu | 89 + ggml/src/ggml-cuda/unary.cuh | 7 + ggml/src/ggml-metal/ggml-metal-impl.h | 11 + ggml/src/ggml-metal/ggml-metal.m | 71 + ggml/src/ggml-metal/ggml-metal.metal | 64 + ggml/src/ggml-sycl/element_wise.cpp | 1661 +++++++---------- ggml/src/ggml-sycl/element_wise.hpp | 25 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 25 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 117 +- .../src/ggml-vulkan/vulkan-shaders/geglu.comp | 13 + .../ggml-vulkan/vulkan-shaders/glu_head.comp | 15 + .../ggml-vulkan/vulkan-shaders/glu_main.comp | 29 + .../src/ggml-vulkan/vulkan-shaders/reglu.comp | 9 + .../ggml-vulkan/vulkan-shaders/swiglu.comp | 9 + .../vulkan-shaders/vulkan-shaders-gen.cpp | 7 + ggml/src/ggml.c | 138 +- 23 files changed, 1896 insertions(+), 1040 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 042a13e055b..3a04ad58a49 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -520,6 +520,8 @@ extern "C" { GGML_OP_CROSS_ENTROPY_LOSS_BACK, GGML_OP_OPT_STEP_ADAMW, + GGML_OP_GLU, + GGML_OP_COUNT, }; @@ -543,6 +545,14 @@ extern "C" { GGML_UNARY_OP_COUNT, }; + enum ggml_glu_op { + GGML_GLU_OP_REGLU, + GGML_GLU_OP_GEGLU, + GGML_GLU_OP_SWIGLU, + + GGML_GLU_OP_COUNT, + }; + enum ggml_object_type { GGML_OBJECT_TYPE_TENSOR, GGML_OBJECT_TYPE_GRAPH, @@ -658,6 +668,7 @@ extern "C" { GGML_API const char * ggml_op_symbol(enum ggml_op op); GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op); GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); @@ -762,6 +773,7 @@ extern "C" { GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); @@ -1090,6 +1102,63 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // gated linear unit ops + // A: n columns, r rows, + // result is n / 2 columns, r rows, + // expects gate in second half of row, unless swapped is true + GGML_API struct ggml_tensor * ggml_glu( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_glu_op op, + bool swapped); + + GGML_API struct ggml_tensor * ggml_reglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_reglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_swiglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_swiglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // A: n columns, r rows, + // B: n columns, r rows, + GGML_API struct ggml_tensor * ggml_glu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_glu_op op); + + GGML_API struct ggml_tensor * ggml_reglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_geglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_swiglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2042ee71f1f..1d68cde71a6 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1949,6 +1949,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_unary(params, tensor); } break; + case GGML_OP_GLU: + { + ggml_compute_forward_glu(params, tensor); + } break; case GGML_OP_GET_REL_POS: { ggml_compute_forward_get_rel_pos(params, tensor); @@ -2159,6 +2163,18 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { GGML_ABORT("fatal error"); } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + { + n_tasks = n_threads; + } break; + default: + GGML_ABORT("fatal error"); + } + break; case GGML_OP_SILU_BACK: case GGML_OP_MUL: case GGML_OP_DIV: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 7ead13f42a3..3b07e249741 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -3184,6 +3184,435 @@ void ggml_compute_forward_silu_back( } } +// ggml_compute_forward_reglu + +static void ggml_compute_forward_reglu_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_reglu_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_reglu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_reglu_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_reglu_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_geglu + +static void ggml_compute_forward_geglu_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_geglu_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_geglu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_geglu_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_geglu_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_swiglu + +static void ggml_compute_forward_swiglu_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_swiglu_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_swiglu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_swiglu_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_swiglu_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -8057,6 +8486,34 @@ void ggml_compute_forward_unary( } } +//ggml_compute_forward_glu + +void ggml_compute_forward_glu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_glu_op op = ggml_get_glu_op(dst); + + switch (op) { + case GGML_GLU_OP_REGLU: + { + ggml_compute_forward_reglu(params, dst); + } break; + case GGML_GLU_OP_GEGLU: + { + ggml_compute_forward_geglu(params, dst); + } break; + case GGML_GLU_OP_SWIGLU: + { + ggml_compute_forward_swiglu(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_get_rel_pos static void ggml_compute_forward_get_rel_pos_f16( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 3a395fdcd9f..5b384e4ba5f 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -94,6 +94,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 5e34d79a169..ed5d7aefc35 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) { } } +void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) { + int i = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i))); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i))); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i))); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + for (; i + 3 < n; i += 4) { + vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i))); + } +#endif + for (; i < n; ++i) { + y[i] = ggml_silu_f32(x[i]) * g[i]; + } +} + ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { int i = 0; ggml_float sum = 0; diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 84f6c0e6d26..ebd4b756134 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -905,6 +905,60 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con } } +inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) { + for (int i = 0; i < n; ++i) { + y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f; + } +} + +inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + for (int i = 0; i < n; ++i) { + float v = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v * GGML_FP16_TO_FP32(g[i]) : 0.f); + } +} + +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) { + uint16_t t; + for (int i = 0; i < n; ++i) { + if (x[i] <= -10.0f) { + y[i] = 0.0f; + } else if (x[i] >= 10.0f) { + y[i] = x[i] * g[i]; + } else { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i]; + } + } +} +#else +inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_f32(x[i]) * g[i]; + } +} +#endif + +inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + float v = GGML_FP16_TO_FP32(g[i]); + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v); + } +} + +void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g); + +inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + for (int i = 0; i < n; ++i) { + float v = GGML_FP16_TO_FP32(x[i]); + float w = GGML_FP16_TO_FP32(g[i]); + y[i] = GGML_FP32_TO_FP16((v/(1.0f + expf(-v))) * w); + } +} + inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE ggml_float sum = 0.0; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 811422f3850..086f9a56c4a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2303,6 +2303,21 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_REGLU: + ggml_cuda_op_reglu(ctx, dst); + break; + case GGML_GLU_OP_GEGLU: + ggml_cuda_op_geglu(ctx, dst); + break; + case GGML_GLU_OP_SWIGLU: + ggml_cuda_op_swiglu(ctx, dst); + break; + default: + return false; + } + break; case GGML_OP_NORM: ggml_cuda_op_norm(ctx, dst); break; @@ -3096,6 +3111,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]); + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 2c0375fbe3c..ba3c0f13762 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -196,6 +196,95 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary(ctx, dst); } +/* gated ops */ + +template +static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) { + const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + // perform base op and multiply with gate (either offset in same tensor or a separate one) + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + + dst[i] = (T)(op((float)x[j0]) * (float)g[j1]); +} + +template +static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) { + const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE; + unary_gated_op_kernel<<>>(x, g, dst, k, n, o0, o1); +} + +template +void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + void * src0_d = src0->data; + void * src1_d = src1 ? src1->data : src0->data; + const int64_t src0_o = src0->nb[1]; + const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + void * dst_d = dst->data; + const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(src0->nb[0] == ggml_element_size(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + GGML_ASSERT(src1->ne[0] == nc); + GGML_ASSERT(src0->type == src1->type); + } + + const int32_t swapped = ((const int32_t *) dst->op_params)[1]; + + if (src0->type == GGML_TYPE_F16) { + half * src0_p = (half *) src0_d; + half * src1_p = (half *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + unary_gated_cuda(src0_p, src1_p, (half *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(half), src1_o / sizeof(half), stream); + } else { + float * src0_p = (float *) src0_d; + float * src1_p = (float *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + unary_gated_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), stream); + } +} + +void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + +void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + +void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + /* silu_back */ static __device__ __forceinline__ float op_silu_back(float grad, float x) { diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 6686fc17e91..9094f1d0bad 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -15,6 +15,7 @@ #define CUDA_SQRT_BLOCK_SIZE 256 #define CUDA_SIN_BLOCK_SIZE 256 #define CUDA_COS_BLOCK_SIZE 256 +#define CUDA_GLU_BLOCK_SIZE 256 void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst); @@ -57,3 +58,9 @@ void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 260440aedde..7a9aab31684 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -422,6 +422,17 @@ typedef struct { int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources } ggml_metal_kargs_im2col; +typedef struct{ + int32_t ne00; + uint64_t nb01; + int32_t ne10; + uint64_t nb11; + int32_t ne0; + uint64_t nb1; + int32_t i00; + int32_t i10; +} ggml_metal_kargs_glu; + typedef struct { int64_t ne00; int64_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 349f0ff998e..12a36695789 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -526,6 +526,9 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_SIN, GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, + GGML_METAL_KERNEL_TYPE_REGLU, + GGML_METAL_KERNEL_TYPE_GEGLU, + GGML_METAL_KERNEL_TYPE_SWIGLU, GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_MEAN, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, @@ -1502,6 +1505,9 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REGLU, reglu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU, geglu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU, swiglu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); @@ -1680,6 +1686,15 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + default: + return false; + } case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -2419,6 +2434,62 @@ static bool ggml_metal_encode_node( GGML_ABORT("fatal error"); } } break; + case GGML_OP_GLU: + { + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + if (src1) { + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + } + + id pipeline = nil; + + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_REGLU: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REGLU].pipeline; + break; + case GGML_GLU_OP_GEGLU: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU].pipeline; + break; + case GGML_GLU_OP_SWIGLU: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SWIGLU].pipeline; + break; + default: + GGML_ABORT("fatal error"); + } + + const int32_t swp = ((const int32_t *) dst->op_params)[1]; + + const int32_t i00 = swp ? ne0 : 0; + const int32_t i10 = swp ? 0 : ne0; + + ggml_metal_kargs_glu args = { + /*.ne00 =*/ ne00, + /*.nb01 =*/ nb01, + /*.ne10 =*/ src1 ? ne10 : ne00, + /*.nb11 =*/ src1 ? nb11 : nb01, + /*.ne0 =*/ ne0, + /*.nb1 =*/ nb1, + /*.i00 =*/ src1 ? 0 : i00, + /*.i10 =*/ src1 ? 0 : i10, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + if (src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; + + const int64_t nrows = ggml_nrows(src0); + + const int32_t nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00/2); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_SQR: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 984a0ab503e..fc3cfe35a34 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1191,6 +1191,70 @@ kernel void kernel_neg( dst[tpig] = -src0[tpig]; } +kernel void kernel_reglu( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + dst_row[i0] = x0*x1*(x0 > 0.0f); + } +} + +kernel void kernel_geglu( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = gelu*x1; + } +} + +kernel void kernel_swiglu( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = silu*x1; + } +} + template kernel void kernel_sum_rows( constant ggml_metal_kargs_sum_rows & args, diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index c56924ce832..c7788bdb6bf 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,12 +1,19 @@ #include "common.hpp" +#include "ggml-sycl/presets.hpp" #include "ggml.h" #include "element_wise.hpp" +#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \ + for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0)) + +#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \ + (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX)) + + static void acc_f32(const float * x, const float * y, float * dst, const int ne, const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); + const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) { + const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0); if (i >= ne) { return; } @@ -21,535 +28,375 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne, } } +/* Unary OP funcs */ template -static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - dst[i] = x[i] > static_cast(0.f) ? static_cast(1.f) : ((x[i] < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); - } +static __dpct_inline__ T op_sgn(T x) { + return x > static_cast(0.f) ? static_cast(1.f) : ((x < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); } template -static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - dst[i] = sycl::fabs(x[i]); - } +static __dpct_inline__ T op_abs(T x) { + return sycl::fabs(x); } template -static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - dst[i] = (x[i] > static_cast(0.f)) ? x[i] : sycl::expm1(x[i]); - } +static __dpct_inline__ T op_elu(T x) { + return (x > static_cast(0.f)) ? x : sycl::expm1(x); } template -static void gelu(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { +static __dpct_inline__ T op_gelu(T x) { const T GELU_COEF_A = static_cast(0.044715f); const T SQRT_2_OVER_PI = static_cast(0.79788456080286535587989211986876f); - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - float xi = x[i]; - dst[i] = static_cast(0.5f) * xi * - (static_cast(1.0f) + - sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast(1.0f) + GELU_COEF_A * xi * xi))); + return static_cast(0.5f) * x * + (static_cast(1.0f) + + sycl::tanh(SQRT_2_OVER_PI * x * (static_cast(1.0f) + GELU_COEF_A * x * x))); } template -static void silu(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] / (static_cast(1.0f) + sycl::native::exp(-x[i])); +static __dpct_inline__ T op_silu(T x) { + return x / (static_cast(1.0f) + sycl::native::exp(-x)); } template -static void gelu_quick(const T *x, T *dst, int k, - const sycl::nd_item<3> &item_ct1) { - const float GELU_QUICK_COEF = -1.702f; - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = x[i] * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i]))); +static __dpct_inline__ T op_gelu_quick(T x) { + const T GELU_QUICK_COEF_LOCAL = static_cast(-1.702f); + return x * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x))); } template -static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { +static __dpct_inline__ T op_gelu_erf(T x) { const T SQRT_2_INV = static_cast(0.70710678118654752440084436210484f); - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - auto x_i = x[i]; - dst[i] = static_cast(0.5f) * x_i * (static_cast(1.0f) + sycl::erf(x_i * SQRT_2_INV)); - } + return static_cast(0.5f) * x * (static_cast(1.0f) + sycl::erf(x * SQRT_2_INV)); } template -static void tanh(const T *x, T *dst, int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = sycl::tanh((x[i])); +static __dpct_inline__ T op_tanh(T x) { + return sycl::tanh(x); } template -static void relu(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::fmax((x[i]), static_cast(0)); +static __dpct_inline__ T op_relu(T x) { + return sycl::fmax(x, static_cast(0)); } template -static void sigmoid(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = 1.0f / (static_cast(1.0f) + sycl::native::exp(-x[i])); +static __dpct_inline__ T op_sigmoid(T x) { + return static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(-x)); } template -static void sqrt(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::sqrt(x[i]); +static __dpct_inline__ T op_sqrt(T x) { + return sycl::sqrt(x); } template -static void sin(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::sin(x[i]); +static __dpct_inline__ T op_sin(T x) { + return sycl::sin(x); } template -static void cos(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::cos(x[i]); +static __dpct_inline__ T op_cos(T x) { + return sycl::cos(x); } template -static void hardsigmoid(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x[i] + static_cast(3.0f)) / static_cast(6.0f))); +static __dpct_inline__ T op_hardsigmoid(T x) { + return sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); } template -static void hardswish(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x[i] + static_cast(3.0f)) / static_cast(6.0f))); +static __dpct_inline__ T op_hardswish(T x) { + return x * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); } template -static void exp(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); +static __dpct_inline__ T op_exp(T x) { + return sycl::exp(x); +} - if (i >= k) { - return; +template +static __dpct_inline__ T op_log(T x) { + if (x <= static_cast(0)) { + return neg_infinity(); } - dst[i] = sycl::exp(x[i]); + return sycl::log(x); } template -static void log(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); +static __dpct_inline__ T op_neg(T x) { + return -x; +} - if (i >= k) { - return; - } - T xi = x[i]; - if (xi <= 0) { - dst[i] = neg_infinity(); - } else { - dst[i] = sycl::log(xi); - } +template +static __dpct_inline__ T op_step(T x) { + return (x > static_cast(0.0f)) ? static_cast(1.0f) : static_cast(0.0f); } template -static void neg(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); +static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) { + T neg_slope_T = static_cast(negative_slope); + return sycl::fmax(x, static_cast(0)) + + sycl::fmin(x, static_cast(0.0f)) * neg_slope_T; +} - if (i >= k) { - return; - } - dst[i] = -x[i]; +template +static __dpct_inline__ T op_sqr(T x) { + return x * x; } template -static void step(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); +static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) { + return x < static_cast(min_val) ? static_cast(min_val) : (x > static_cast(max_val) ? static_cast(max_val) : x); +} - if (i >= k) { - return; +template +static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sgn(x[i]); } - dst[i] = x[i] > static_cast(0.0f); } template -static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; +static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_abs(x[i]); } - dst[i] = sycl::fmax((x[i]), static_cast(0)) + - sycl::fmin((x[i]), static_cast(0.0f)) * negative_slope; } template -static void sqr(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; +static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_elu(x[i]); } - dst[i] = x[i] * x[i]; } -template -static void upscale(const T *x, T *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { - int index = item_ct1.get_local_id(0) + - item_ct1.get_group(0) * item_ct1.get_local_range(0); - if (index >= ne10 * ne11 * ne12 * ne13) { - return; +template +static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu(x[i]); } - // operation - int i10 = index % ne10; - int i11 = (index / ne10) % ne11; - int i12 = (index / (ne10 * ne11)) % ne12; - int i13 = (index / (ne10 * ne11 * ne12)) % ne13; - - int i00 = i10 / sf0; - int i01 = i11 / sf1; - int i02 = i12 / sf2; - int i03 = i13 / sf3; - - dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); } -template -static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02, - const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (nidx >= ne0) { - return; +template +static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_silu(x[i]); } +} - // operation - int offset_dst = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) { - int offset_src = nidx + item_ct1.get_group(1) * ne00 + - item_ct1.get_group(0) * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } else { - dst[offset_dst] = static_cast(0.0f); +template +static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu_quick(x[i]); } } - template -static void clamp(const T * x, T * dst, const float min, const float max, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; +static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu_erf(x[i]); } - - dst[i] = x[i] < static_cast(min) ? static_cast(min) : (x[i] > static_cast(max) ? static_cast(max) : x[i]); } -static void acc_f32_sycl(const float *x, const float *y, float *dst, - const int n_elements, const int ne10, const int ne11, - const int ne12, const int nb1, const int nb2, - const int offset, queue_ptr stream) { - int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1); - }); +template +static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_tanh(x[i]); + } } template -static void gelu_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); }); +static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_relu(x[i]); + } } template -static void silu_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); }); +static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sigmoid(x[i]); + } } template -static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) { - // hard code for now - const int num_blocks = ceil_div(k, 256); - sycl_parallel_for( - stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), - [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); }); +static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sqrt(x[i]); + } } template -static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) { - // hard code for now - const int num_blocks = ceil_div(k, 256); - sycl_parallel_for( - stream, - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), - [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); }); +static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sin(x[i]); + } } - template -static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) { - // hard code for now - const int num_blocks = ceil_div(k, 256); - sycl_parallel_for( - stream, - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), - [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); }); +static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_cos(x[i]); + } } template -static void gelu_quick_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); }); +static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_hardsigmoid(x[i]); + } } - template -static void gelu_erf_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); }); +static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_hardswish(x[i]); + } } template -static void tanh_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); }); +static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_exp(x[i]); + } } template -static void relu_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); }); +static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_log(x[i]); + } } template -static void hardsigmoid_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; - sycl_parallel_for( - stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); }); +static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_neg(x[i]); + } } template -static void hardswish_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; - sycl_parallel_for( - stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); }); +static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_step(x[i]); + } } template -static void exp_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); }); +static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_leaky_relu(x[i], negative_slope); + } } template -static void log_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { log(x, dst, k, item_ct1); }); +static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sqr(x[i]); + } } template -static void neg_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); }); +static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_clamp(x[i], min_val, max_val); + } } -template -static void step_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); }); +template +static void upscale(const T *x, T *dst, const int nb00, const int nb01, + const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int ne13, const float sf0, const float sf1, + const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { + int index = item_ct1.get_local_id(0) + + item_ct1.get_group(0) * item_ct1.get_local_range(0); + if (index >= ne10 * ne11 * ne12 * ne13) { + return; + } + // operation + int i10 = index % ne10; + int i11 = (index / ne10) % ne11; + int i12 = (index / (ne10 * ne11)) % ne12; + int i13 = (index / (ne10 * ne11 * ne12)) % ne13; + + int i00 = static_cast(i10 / sf0); + int i01 = static_cast(i11 / sf1); + int i02 = static_cast(i12 / sf2); + int i03 = static_cast(i13 / sf3); + + dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); } -template -static void sigmoid_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE; - sycl_parallel_for( - stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); }); +template +static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = SYCL_LOCAL_ID_CALC(item_ct1, 2); + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) { + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = static_cast(0.0f); + } } template -static void sqrt_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); }); +static void clamp(const T * x, T * dst, const float min, const float max, const int k, + const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = x[i] < static_cast(min) ? static_cast(min) : (x[i] > static_cast(max) ? static_cast(max) : x[i]); + } } template -static void sin_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); }); +static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu(x[j0]) * g[j1]; + } } template -static void cos_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); }); +static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_relu(x[j0]) * g[j1]; + } } template -static void leaky_relu_sycl(const T *x, T *dst, const int k, - const float negative_slope, - queue_ptr stream) { - const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); }); +static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_silu(x[j0]) * g[j1]; + } } -template -static void sqr_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; +namespace ggml_sycl_detail { +static void acc_f32_sycl(const float *x, const float *y, float *dst, + const int n_elements, const int ne10, const int ne11, + const int ne12, const int nb1, const int nb2, + const int offset, queue_ptr stream) { + int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE); sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); }); + sycl::nd_range<1>(sycl::range<1>(num_blocks) * + sycl::range<1>(SYCL_ACC_BLOCK_SIZE), + sycl::range<1>(SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, + item_ct1); + }); } template @@ -558,7 +405,7 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, const int ne12, const int ne13, const float sf0, const float sf1, const float sf2, const float sf3, queue_ptr stream) { int dst_size = ne10 * ne11 * ne12 * ne13; - int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; + int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE); sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); sycl_parallel_for<1>( stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { @@ -570,7 +417,7 @@ template static void pad_sycl(const T *x, T *dst, const int ne00, const int ne01, const int ne02, const int ne0, const int ne1, const int ne2, queue_ptr stream) { - int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; + int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE); sycl::range<3> gridDim(ne2, ne1, num_blocks); sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), @@ -578,22 +425,11 @@ static void pad_sycl(const T *x, T *dst, const int ne00, [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); }); } -template -static void clamp_sycl(const T *x, T *dst, const float min, - const float max, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); }); -} - -inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - #else GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -606,14 +442,14 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) case GGML_TYPE_F16: { auto data_pts = cast_data(dst); - sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); break; } #endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); - sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); break; } default: @@ -621,11 +457,11 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } } -inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - #else GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -633,52 +469,66 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->src[0]->type == dst->type); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;; + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_is_contiguous_1(dst->src[0])); + GGML_ASSERT(ggml_is_contiguous(dst)); + const int32_t swapped = ((const int32_t *) dst->op_params)[1]; + void * src0_d = src0->data; + void * src1_d = src1 ? src1->data : src0->data; + const int64_t src0_o = src0->nb[1]; + const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + void * dst_d = dst->data; + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + GGML_ASSERT(src1->ne[0] == nc); + GGML_ASSERT(src0->type == src1->type); } -} - - -inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); switch (dst->type) { #if defined (GGML_SYCL_F16) case GGML_TYPE_F16: { - auto data_pts = cast_data(dst); - elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + sycl::half * src0_p = (sycl::half *) src0_d; + sycl::half * src1_p = (sycl::half *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + kernel_invoker(src0_p, + src1_p, + (sycl::half *) dst_d, + ggml_nelements(dst), + nc, + src0_o / sizeof(sycl::half), + src1_o / sizeof(sycl::half), + main_stream, + std::forward(args)...); break; } #endif case GGML_TYPE_F32: { - auto data_pts = cast_data(dst); - elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + float * src0_p = (float *) src0_d; + float * src1_p = (float *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + kernel_invoker(src0_p, + src1_p, + (float *) dst_d, + ggml_nelements(dst), + nc, + src0_o / sizeof(float), + src1_o / sizeof(float), + main_stream, + std::forward(args)...); break; } default: @@ -686,7 +536,8 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } } -inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -695,52 +546,31 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->type == GGML_TYPE_F32); #endif GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} -inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0]; + const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1]; + const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2]; + const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3]; switch (dst->type) { #if defined (GGML_SYCL_F16) case GGML_TYPE_F16: { auto data_pts = cast_data(dst); - gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], + (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, + main_stream, std::forward(args)...); break; } #endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); - gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], + (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, + main_stream, std::forward(args)...); break; } default: @@ -748,7 +578,8 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst } } -inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +template +static inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -757,6 +588,7 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(dst->type == GGML_TYPE_F32); #endif GGML_ASSERT(dst->src[0]->type == dst->type); + GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); switch (dst->type) { @@ -764,14 +596,16 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor case GGML_TYPE_F16: { auto data_pts = cast_data(dst); - gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->ne[0], (int)dst->src[0]->ne[1], (int)dst->src[0]->ne[2], (int)dst->ne[0], + (int)dst->ne[1], (int)dst->ne[2], main_stream, std::forward(args)...); break; } #endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); - gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->ne[0], (int)dst->src[0]->ne[1], (int)dst->src[0]->ne[2], (int)dst->ne[0], + (int)dst->ne[1], (int)dst->ne[2], main_stream, std::forward(args)...); break; } default: @@ -779,593 +613,320 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor } } -inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - - -inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} +} // namespace ggml_sycl_detail -inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE), + sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } - -inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } + +static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE), + sycl::range<1>(SYCL_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif +static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE), + sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} - GGML_ASSERT(dst->src[0]->type == dst->type); - float negative_slope; - memcpy(&negative_slope, dst->op_params, sizeof(float)); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), + sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - #if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), + sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); +static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), + sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); +static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), + sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} - const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0]; - const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1]; - const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2]; - const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3]; - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], - dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, - main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], - dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, - main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE), + sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], - dst->ne[1], dst->ne[2], main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], - dst->ne[1], dst->ne[2], main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE), + sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined(GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else +static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), + sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - float min; - float max; - memcpy(&min, dst->op_params, sizeof(float)); - memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); +static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), + sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} - switch (dst->type) { -#if defined(GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } +static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) { + const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1); + }); + }, negative_slope); +} + +static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE), + sycl::range<1>(SYCL_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst, + [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03, + int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3, + queue_ptr stream) { + ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream); + }); } -inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_pad(ctx, dst, + [](const auto* src, auto* dst_ptr, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, + queue_ptr stream) { + ggml_sycl_detail::pad_sycl(src, dst_ptr, ne00, ne01, ne02, ne0, ne1, ne2, stream); + }); +} +static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + float min_val; + float max_val; + memcpy(&min_val, dst->op_params, sizeof(float)); + memcpy(&max_val, (float *) dst->op_params + 1, sizeof(float)); + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) { + const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE), + sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1); + }); + }, min_val, max_val); +} + +static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -1381,7 +942,40 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused int offset = dst->op_params[3] / 4; // offset in bytes - acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream); + ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream); +} + +static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); } @@ -1509,3 +1103,18 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_elu(ctx, dst); } + +void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_geglu(ctx, dst); +} + +void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_reglu(ctx, dst); +} + +void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_swiglu(ctx, dst); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index bd40113f097..86068b10129 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -3,27 +3,30 @@ #include "common.hpp" #include "ggml.h" -#include +#include // For std::numeric_limits template T neg_infinity() { return -std::numeric_limits::infinity(); } -template +template struct typed_data { - const T * src; - T * dst; + const T_Src * src; + T_Dst * dst; }; -template -typed_data cast_data(ggml_tensor * dst) { +template +typed_data cast_data(ggml_tensor * dst) { return { - /* .src = */ static_cast(dst->src[0]->data), - /* .dst = */ static_cast(dst->data) + /* .src = */ static_cast(dst->src[0]->data), + /* .dst = */ static_cast(dst->data) }; } +const float GELU_QUICK_COEF = -1.702f; + + void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); @@ -73,5 +76,9 @@ void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -#endif // GGML_SYCL_ELEMENTWISE_HPP +void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 9cb36ae99e7..ae5e062572e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3676,6 +3676,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_REGLU: + ggml_sycl_reglu(ctx, dst); + break; + case GGML_GLU_OP_GEGLU: + ggml_sycl_geglu(ctx, dst); + break; + case GGML_GLU_OP_SWIGLU: + ggml_sycl_swiglu(ctx, dst); + break; + default: + return false; + } + break; case GGML_OP_NORM: ggml_sycl_norm(ctx, dst); break; @@ -4212,6 +4227,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]); + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index aebcc03915f..4696f1fe46e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -437,6 +437,10 @@ struct vk_device_struct { vk_pipeline pipeline_tanh[2]; vk_pipeline pipeline_sigmoid[2]; + vk_pipeline pipeline_geglu[2]; + vk_pipeline pipeline_reglu[2]; + vk_pipeline pipeline_swiglu[2]; + vk_pipeline pipeline_leaky_relu_f32; vk_pipeline pipeline_silu_back_f32; vk_pipeline pipeline_diag_mask_inf_f32; @@ -661,6 +665,13 @@ struct vk_op_push_constants { float param2; }; +struct vk_op_glu_push_constants { + uint32_t N; + uint32_t ne00; + uint32_t ne20; + uint32_t mode; // 0: default, 1: swapped, 2: split +}; + struct vk_op_unary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; @@ -2757,6 +2768,15 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_UNARY(sigmoid) #undef CREATE_UNARY +#define CREATE_GLU(name) \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); + + CREATE_GLU(geglu) + CREATE_GLU(reglu) + CREATE_GLU(swiglu) +#undef CREATE_GLU + ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -6473,6 +6493,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const break; } return nullptr; + case GGML_OP_GLU: + if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || + (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || + (src0->type != dst->type)) { + return nullptr; + } + + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_GEGLU: + return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_REGLU: + return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_SWIGLU: + return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16]; + default: + break; + } + return nullptr; case GGML_OP_DIAG_MASK_INF: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_diag_mask_inf_f32; @@ -6933,6 +6971,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_CONCAT: case GGML_OP_UPSCALE: case GGML_OP_UNARY: + case GGML_OP_GLU: case GGML_OP_CONV_2D_DW: { uint32_t ne = ggml_nelements(dst); @@ -6973,7 +7012,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } } - if (op == GGML_OP_SOFT_MAX) { + if (op == GGML_OP_SOFT_MAX || op == GGML_OP_GLU) { // Empty src1 is possible in soft_max, but the shader needs a buffer vk_subbuffer subbuf_y; if (use_src1) { @@ -7566,6 +7605,25 @@ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, con ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); } +static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + const bool swapped = (bool)dst->op_params[1]; + const bool split = src1 != nullptr; + + GGML_ASSERT(ggml_is_contiguous(src0)); + + if (!split) { + GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]); + } else { + GGML_ASSERT(src0->ne[0] == src1->ne[0]); + GGML_ASSERT(src0->ne[0] == dst->ne[0]); + GGML_ASSERT(src0->type == src1->type); + } + + const uint32_t mode = split ? 2 : (swapped ? 1 : 0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GLU, { (uint32_t)ggml_nelements(dst), (uint32_t)src0->ne[0], (uint32_t)dst->ne[0], mode }, dryrun); +} + static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { int32_t * op_params = (int32_t *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun); @@ -8778,6 +8836,16 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + break; + default: + return false; + } + break; case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_GET_ROWS: @@ -8870,6 +8938,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_RMS_NORM_BACK: case GGML_OP_L2_NORM: case GGML_OP_UNARY: + case GGML_OP_GLU: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: @@ -9013,6 +9082,17 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun); + break; + default: + return false; + } + break; case GGML_OP_DIAG_MASK_INF: ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun); @@ -9138,8 +9218,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr if (!ok) { if (node->op == GGML_OP_UNARY) { std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; - } - else { + } else if (node->op == GGML_OP_GLU) { + std::cerr << __func__ << ": error: op not supported GLU " << node->name << " (" << ggml_glu_op_name(static_cast(node->op_params[0])) << ")" << std::endl; + } else { std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; } } @@ -9218,6 +9299,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(tensor)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + buf = tensor->buffer; + break; + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: case GGML_OP_FLASH_ATTN_EXT: @@ -10016,6 +10108,19 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous(op->src[0]) && + (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && + (op->src[0]->type == op->type); + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { @@ -10746,6 +10851,12 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ABORT("fatal error"); } + } else if (tensor->op == GGML_OP_GLU) { + if (src_clone[1] == nullptr) { + tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]); + } else { + tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); + } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { tensor_clone = ggml_dup(ggml_ctx, src_clone[0]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp new file mode 100644 index 00000000000..f4268ed24f4 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp @@ -0,0 +1,13 @@ +#version 450 + +#include "glu_head.comp" + +const float GELU_COEF_A = 0.044715f; +const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +float op(float a, float b) { + const float val = SQRT_2_OVER_PI*a*(1.0f + GELU_COEF_A*a*a); + return 0.5f*a*(2.0f - 2.0f / (exp(2 * val) + 1)) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp new file mode 100644 index 00000000000..41a29889075 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp @@ -0,0 +1,15 @@ +#extension GL_EXT_shader_16bit_storage : require + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {A_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; + +layout (push_constant) uniform parameter +{ + uint N; + uint ne00; + uint ne20; + uint mode; +} p; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp new file mode 100644 index 00000000000..85cf65a9eca --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp @@ -0,0 +1,29 @@ +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.N) { + return; + } + + const uint row = i / p.ne20; + const uint col = i - row * p.ne20; + + if (p.mode == 0) { + // Default + const uint offset = p.ne00 / 2; + const uint idx = row * p.ne00 + col; + + data_d[row * offset + col] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset]))); + } else if (p.mode == 1) { + // Swapped + const uint offset = p.ne00 / 2; + const uint idx = row * p.ne00 + col; + + data_d[row * offset + col] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx]))); + } else { + // Split + const uint idx = row * p.ne00 + col; + + data_d[idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx]))); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp new file mode 100644 index 00000000000..0073d8f7666 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp @@ -0,0 +1,9 @@ +#version 450 + +#include "glu_head.comp" + +float op(float a, float b) { + return max(a, 0.0f) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp new file mode 100644 index 00000000000..a28e7c6cc86 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp @@ -0,0 +1,9 @@ +#version 450 + +#include "glu_head.comp" + +float op(float a, float b) { + return a / (1.0f + exp(-a)) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index a207b98c60e..23fc50bf295 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -585,6 +585,13 @@ void process_shaders() { string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("reglu_f16", "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e70d9d57b3b..fa57b60ef7d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -982,9 +982,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS", "CROSS_ENTROPY_LOSS_BACK", "OPT_STEP_ADAMW", + + "GLU", }; -static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); +static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1079,9 +1081,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss(x,y)", "cross_entropy_loss_back(x,y)", "adamw(x)", + + "glu(x)", }; -static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); +static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1107,6 +1111,15 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15"); +static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = { + "REGLU", + "GEGLU", + "SWIGLU", +}; + +static_assert(GGML_GLU_OP_COUNT == 3, "GGML_GLU_OP_COUNT != 3"); + + static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -1209,11 +1222,19 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) { return GGML_UNARY_OP_NAME[op]; } +const char * ggml_glu_op_name(enum ggml_glu_op op) { + return GGML_GLU_OP_NAME[op]; +} + const char * ggml_op_desc(const struct ggml_tensor * t) { if (t->op == GGML_OP_UNARY) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); } + if (t->op == GGML_OP_GLU) { + enum ggml_glu_op gop = ggml_get_glu_op(t); + return ggml_glu_op_name(gop); + } return ggml_op_name(t->op); } @@ -1730,6 +1751,11 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } +enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->op == GGML_OP_GLU); + return (enum ggml_glu_op) ggml_get_op_params_i32(tensor, 0); +} + const char * ggml_get_name(const struct ggml_tensor * tensor) { return tensor->name; } @@ -2609,6 +2635,114 @@ struct ggml_tensor * ggml_exp_inplace( return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP); } +// ggml_glu + +static struct ggml_tensor * ggml_glu_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_glu_op op, + bool swapped) { + GGML_ASSERT(ggml_is_contiguous_1(a)); + + if (b) { + GGML_ASSERT(ggml_is_contiguous_1(b)); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(a->type == b->type); + } + + int64_t ne[GGML_MAX_DIMS] = { a->ne[0] / 2 }; for (int i = 1; i < GGML_MAX_DIMS; i++) ne[i] = a->ne[i]; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b ? a->ne : ne, NULL, 0); + + ggml_set_op_params_i32(result, 0, (int32_t) op); + ggml_set_op_params_i32(result, 1, (int32_t) swapped); + + result->op = GGML_OP_GLU; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_glu( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_glu_op op, + bool swapped) { + return ggml_glu_impl(ctx, a, NULL, op, swapped); +} + +struct ggml_tensor * ggml_glu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_glu_op op) { + return ggml_glu_impl(ctx, a, b, op, false); +} + +// ggml_reglu + +struct ggml_tensor * ggml_reglu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, false); +} + +struct ggml_tensor * ggml_reglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, true); +} + +struct ggml_tensor * ggml_reglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_REGLU, false); +} + +// ggml_geglu + +struct ggml_tensor * ggml_geglu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, false); +} + +struct ggml_tensor * ggml_geglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, true); +} + +struct ggml_tensor * ggml_geglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU, false); +} + +// ggml_swiglu + +struct ggml_tensor * ggml_swiglu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, false); +} + +struct ggml_tensor * ggml_swiglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, true); +} + +struct ggml_tensor * ggml_swiglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false); +} + // ggml_norm static struct ggml_tensor * ggml_norm_impl( From 130044f2285555c666a5e0975a50d8b51fc74fa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 29 Jun 2025 14:38:10 +0200 Subject: [PATCH 0036/1348] ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring (llama/14443) --- ggml/src/ggml-cpu/vec.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index ebd4b756134..d5507d75646 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -913,8 +913,8 @@ inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v * GGML_FP16_TO_FP32(g[i]) : 0.f); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f); } } @@ -927,9 +927,9 @@ inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, c } else if (x[i] >= 10.0f) { y[i] = x[i] * g[i]; } else { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i]; + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i]; } } } @@ -944,8 +944,8 @@ inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, c inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { const uint16_t * i16 = (const uint16_t *) x; for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(g[i]); - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v); + float v = GGML_CPU_FP16_TO_FP32(g[i]); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v); } } @@ -953,9 +953,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - float w = GGML_FP16_TO_FP32(g[i]); - y[i] = GGML_FP32_TO_FP16((v/(1.0f + expf(-v))) * w); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + float w = GGML_CPU_FP16_TO_FP32(g[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w); } } From e81be92931bd83782ae228932c2f192e9fbc1003 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 29 Jun 2025 21:07:58 +0530 Subject: [PATCH 0037/1348] SYCL: disable faulty fp16 exp kernel (llama/14395) * SYCL: disable faulty fp16 CPU exponent for now * Revert "SYCL: disable faulty fp16 CPU exponent for now" This reverts commit ed0aab1ec31b4eb4b0f275dd7acd41d96a375202. * SYCL: disable faulty fp16 CPU exponent for now * Fix logic of disabling exponent kernel --- ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ae5e062572e..4ecca4165be 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4215,7 +4215,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SGN: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_ELU: @@ -4224,6 +4223,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g #else return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); #endif + case GGML_UNARY_OP_EXP: + // Disable FP16 until we find out the root cause of failing fp16 sycl::exp + return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type) && op->src[0]->type == GGML_TYPE_F32; default: return false; } From 3239359bd1bdf065cce4f965b3d85b39f800abbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vedran=20Mileti=C4=87?= Date: Mon, 30 Jun 2025 10:17:18 +0200 Subject: [PATCH 0038/1348] scripts : make the shell scripts cross-platform (llama/14341) --- scripts/gen-authors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gen-authors.sh b/scripts/gen-authors.sh index 3ef8391cc9c..73e7b386f97 100755 --- a/scripts/gen-authors.sh +++ b/scripts/gen-authors.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash printf "# date: $(date)\n" > AUTHORS printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS From 6bb1234a56991f0106d67691876218b43a7a1f73 Mon Sep 17 00:00:00 2001 From: xiaobing318 <71554036+xiaobing318@users.noreply.github.com> Date: Mon, 30 Jun 2025 17:48:24 +0800 Subject: [PATCH 0039/1348] cmake : Remove redundant include path in CMakeLists.txt (llama/14452) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update docker.yml 修改docker.yml文件中的内容使其停止周期性的运行该workflow,如果想要运行该workflow可以手动启动 * Remove redundant include path in CMakeLists.txt The parent directory '..' was removed from the include directories for the ggml-cpu-feats target, to avoid unnecessary include paths. * Enable scheduled Docker image builds Uncomments the workflow schedule to trigger daily Docker image rebuilds at 04:12 UTC, improving automation and keeping images up to date. --- ggml/src/ggml-cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 671fad4d228..46f678b8525 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch) # build, using set_source_files_properties() to set the arch flags is not possible set(GGML_CPU_FEATS_NAME ${cpu_name}-feats) add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp) - target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include) target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN}) target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) From e29e36aee7b7acb5bad00ecba42db2d3929405a6 Mon Sep 17 00:00:00 2001 From: Romain Biessy Date: Mon, 30 Jun 2025 14:52:02 +0200 Subject: [PATCH 0040/1348] ggml-cpu: sycl: Re-enable exp f16 (llama/14462) --- ggml/src/ggml-cpu/CMakeLists.txt | 5 +++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 46f678b8525..66a5ad8d2ed 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -589,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (EMSCRIPTEN) set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") + # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math" + target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math") + endif() endfunction() diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 4ecca4165be..ae5e062572e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4215,6 +4215,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SGN: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_ELU: @@ -4223,9 +4224,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g #else return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); #endif - case GGML_UNARY_OP_EXP: - // Disable FP16 until we find out the root cause of failing fp16 sycl::exp - return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type) && op->src[0]->type == GGML_TYPE_F32; default: return false; } From 679f31a9d188dbc582676054b30473a160ceb79a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Jun 2025 17:04:05 +0300 Subject: [PATCH 0041/1348] metal : disable fast-math for some cpy kernels (llama/14460) * metal : disable fast-math for some cpy kernels ggml-ci * cont : disable for q4_1 ggml-ci * cont : disable for iq4_nl ggml-ci --- ggml/src/ggml-metal/ggml-metal.metal | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index fc3cfe35a34..dac45c7a99b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -138,6 +138,7 @@ void quantize_q4_0(device const float * src, device block_q4_0 & dst) { } void quantize_q4_1(device const float * src, device block_q4_1 & dst) { +#pragma METAL fp math_mode(safe) float min = FLT_MAX; float max = -FLT_MAX; @@ -203,6 +204,7 @@ void quantize_q5_0(device const float * src, device block_q5_0 & dst) { } void quantize_q5_1(device const float * src, device block_q5_1 & dst) { +#pragma METAL fp math_mode(safe) float max = src[0]; float min = src[0]; @@ -239,6 +241,7 @@ void quantize_q5_1(device const float * src, device block_q5_1 & dst) { } void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) { +#pragma METAL fp math_mode(safe) float amax = 0.0f; // absolute max float max = 0.0f; From b5b237d49a8915f22fd395a24cd96feac31496fc Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 30 Jun 2025 23:57:04 +0800 Subject: [PATCH 0042/1348] Add Conv2d for CPU (llama/14388) * Conv2D: Add CPU version * Half decent * Tiled approach for F32 * remove file * Fix tests * Support F16 operations * add assert about size * Review: further formatting fixes, add assert and use CPU version of fp32->fp16 --- ggml/include/ggml.h | 12 +++ ggml/src/ggml-cpu/ggml-cpu.c | 11 ++- ggml/src/ggml-cpu/ops.cpp | 181 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 5 + ggml/src/ggml.c | 44 ++++++++- 5 files changed, 250 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3a04ad58a49..f8238f3159b 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -482,6 +482,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D, GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, @@ -1813,6 +1814,17 @@ extern "C" { struct ggml_tensor * b, int stride); + GGML_API struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + enum ggml_op_pool { GGML_OP_POOL_MAX, GGML_OP_POOL_AVG, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1d68cde71a6..11ff228f07a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1193,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( } } -static void ggml_compute_forward_mul_mat( +void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D: + { + ggml_compute_forward_conv_2d(params, tensor); + } break; case GGML_OP_CONV_2D_DW: { ggml_compute_forward_conv_2d_dw(params, tensor); @@ -2228,6 +2232,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: @@ -2746,6 +2751,10 @@ struct ggml_cplan ggml_graph_plan( GGML_ABORT("fatal error"); } } break; + case GGML_OP_CONV_2D: + { + cur = GGML_IM2COL_WORK_SIZE; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { const int64_t ne00 = node->src[0]->ne[0]; // W diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3b07e249741..dd83efde714 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -3,6 +3,7 @@ #include "ggml-cpu.h" #include "ggml-impl.h" #include "binary-ops.h" +#include "ggml.h" #include "unary-ops.h" #include "vec.h" @@ -6545,6 +6546,186 @@ void ggml_compute_forward_im2col_back_f32( } } +static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k, + void * a, void * b, float * c) { + const ggml_type_traits * traits = ggml_get_type_traits(type); + struct ggml_tensor src1 = {}; + src1.type = type; + src1.ne[0] = k; + src1.ne[1] = m; + src1.ne[2] = 1; + src1.ne[3] = 1; + src1.nb[0] = traits->type_size; + src1.nb[1] = k * traits->type_size; + src1.nb[2] = src1.nb[1]; + src1.nb[3] = src1.nb[2]; + src1.data = a; + + struct ggml_tensor src0 = {}; + src0.type = type; + src0.ne[0] = k; + src0.ne[1] = n; + src0.ne[2] = 1; + src0.ne[3] = 1; + src0.nb[0] = traits->type_size; + src0.nb[1] = k * traits->type_size; + src0.nb[2] = src0.nb[1]; + src0.nb[3] = src0.nb[2]; + src0.data = b; + + struct ggml_tensor dst = {}; + dst.ne[0] = n; + dst.ne[1] = m; + dst.ne[2] = 1; + dst.ne[3] = 1; + dst.nb[0] = sizeof(float); + dst.nb[1] = n * sizeof(float); + dst.nb[2] = dst.nb[1]; + dst.nb[3] = dst.nb[2]; + dst.data = c; + dst.src[0] = &src0; + dst.src[1] = &src1; + + ggml_compute_forward_mul_mat(params, &dst); +} + +// ggml_compute_forward_conv_2d + +static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params, + const ggml_tensor * kernel, // [KW, KH, IC, OC] + const ggml_tensor * src, // [W, H, C, N] + ggml_tensor * dst, // [OW, OH, OC, N] + ggml_type kernel_type) { + + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32); + GGML_ASSERT(kernel->type == kernel_type); + + const ggml_type_traits * traits = ggml_get_type_traits(kernel_type); + + const int32_t stride_x = dst->op_params[0]; + const int32_t stride_y = dst->op_params[1]; + const int32_t pad_x = dst->op_params[2]; + const int32_t pad_y = dst->op_params[3]; + const int32_t dilation_x = dst->op_params[4]; + const int32_t dilation_y = dst->op_params[5]; + + const int64_t c_in = src->ne[2]; + const int64_t c_out = kernel->ne[3]; + GGML_ASSERT(c_in == kernel->ne[2]); + + const int64_t src_w = src->ne[0]; + const int64_t src_h = src->ne[1]; + const int64_t knl_w = kernel->ne[0]; + const int64_t knl_h = kernel->ne[1]; + const int64_t dst_w = dst->ne[0]; + const int64_t dst_h = dst->ne[1]; + + const float * src_data = (float *) src->data; + void * knl_data = kernel->data; + float * dst_data = (float *) dst->data; + + const int64_t knl_n = knl_w * knl_h * c_in; + const int64_t patch_total = dst->ne[3] * dst_w * dst_h; + + const int64_t space_per_patch = knl_n * traits->type_size + c_out * sizeof(float); + const int64_t batch_size = params->wsize / space_per_patch; + const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size; + const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; + + GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1); + + void * tmp = params->wdata; + + for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) { + + const int64_t patch_start_batch = batch_i * patches_per_batch; + const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch, + patch_total); + const int64_t patch_n = patch_end_batch - patch_start_batch; + + const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth; + const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread; + const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch); + + //im2col for a patch + for (int64_t p = patch_start; p < patch_end; ++p) { + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; + + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; + const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; + + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; + + float src_val; + if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { + src_val = 0.0f; + } else { + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; + } + + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } + } + } + } + } // patches handled by this thread + + ggml_barrier(params->threadpool); + + float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size); + + GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize); + + // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out] + ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output); + + ggml_barrier(params->threadpool); + + + //permute back [OC, N, OH, OW] to [N, OC, OH, OW] + const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth; + const int64_t permute_start = params->ith * permute_per_thread; + const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n); + + for (int64_t i = permute_start; i < permute_end; ++i) { + const int64_t p = patch_start_batch + i; + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t dst_y = (p / dst_w) % dst_h; + const int64_t dst_x = p % dst_w; + + for (int64_t oc = 0; oc < c_out; ++oc) { + const float value = gemm_output[i * c_out + oc]; + float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]); + *dst_ptr = value; + } + } + } +} + +void ggml_compute_forward_conv_2d( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type); +} + // ggml_compute_forward_conv_transpose_2d void ggml_compute_forward_conv_transpose_2d( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 5b384e4ba5f..3a32ec20dba 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -20,6 +20,9 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); +// Work buffer size for im2col operations in CONV2D +#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024) + #ifdef __cplusplus extern "C" { #endif @@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); @@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); #ifdef __cplusplus } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index fa57b60ef7d..b499f378565 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -945,6 +945,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D", "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", @@ -986,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); +static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1044,6 +1045,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d(x)", "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", @@ -1085,7 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); +static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4291,6 +4293,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( return result; } +// ggml_conv_2d_direct + +struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1) {// dilation dimension 1 + + GGML_ASSERT(a->ne[2] == b->ne[2]); + //GGML_ASSERT(a->type == b->type); + + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + ne[2] = a->ne[3]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + ggml_set_op_params_i32(result, 0, s0); + ggml_set_op_params_i32(result, 1, s1); + ggml_set_op_params_i32(result, 2, p0); + ggml_set_op_params_i32(result, 3, p1); + ggml_set_op_params_i32(result, 4, d0); + ggml_set_op_params_i32(result, 5, d1); + + result->op = GGML_OP_CONV_2D; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { From 1e930ab1b8a12a62c0d5f78af2bc7e25fb36e0b9 Mon Sep 17 00:00:00 2001 From: lhez Date: Tue, 1 Jul 2025 00:19:16 -0700 Subject: [PATCH 0043/1348] opencl : add GEGLU, REGLU, SWIGLU (llama/14456) --- ggml/src/ggml-opencl/CMakeLists.txt | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 124 +++++++++++++++++ ggml/src/ggml-opencl/kernels/glu.cl | 201 +++++++++++++++++++++++++++ 3 files changed, 326 insertions(+) create mode 100644 ggml/src/ggml-opencl/kernels/glu.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 0e2a419649c..45a48833480 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -65,6 +65,7 @@ set(GGML_OPENCL_KERNELS gemv_noshuffle_general gemv_noshuffle get_rows + glu group_norm im2col_f32 im2col_f16 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 96e8a8588dc..496e47575e9 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -351,6 +351,7 @@ struct ggml_backend_opencl_context { cl_program program_gemv_noshuffle_general; cl_program program_gemv_noshuffle; cl_program program_get_rows; + cl_program program_glu; cl_program program_im2col_f16; cl_program program_im2col_f32; cl_program program_mul_mat_Ab_Bi_8x4; @@ -401,6 +402,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_relu; cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16; cl_kernel kernel_clamp; + cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, + kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16; cl_kernel kernel_norm; cl_kernel kernel_rms_norm; cl_kernel kernel_group_norm; @@ -738,6 +741,27 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // glu + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "glu.cl.h" + }; +#else + const std::string kernel_src = read_file("glu.cl"); +#endif + backend_ctx->program_glu = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err)); + CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err)); + CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err)); + GGML_LOG_CONT("."); + } + // get_rows { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2242,6 +2266,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); + default: + return false; + } case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_SOFT_MAX: @@ -6143,6 +6176,91 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + if (src1) { + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + } + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + cl_kernel kernel; + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_GEGLU: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_geglu; + } else { + kernel = backend_ctx->kernel_geglu_f16; + } + break; + case GGML_GLU_OP_REGLU: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_reglu; + } else { + kernel = backend_ctx->kernel_reglu_f16; + } + break; + case GGML_GLU_OP_SWIGLU: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_swiglu; + } else { + kernel = backend_ctx->kernel_swiglu_f16; + } + break; + default: + GGML_ABORT("Unsupported glu op"); + } + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; + + const int ne0 = dst->ne[0]; + + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb11 = src1 ? src1->nb[1] : nb01; + + const cl_ulong nb1 = dst->nb[1]; + + const int swp = ((const int32_t *) dst->op_params)[1]; + const int ne00_off = src1 ? 0 : (swp ? ne0 : 0); + const int ne10_off = src1 ? 0 : (swp ? 0 : ne0); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off)); + + const size_t nrows = ggml_nrows(src0); + size_t nth = 512; + size_t global_work_size[] = {nrows*nth, 1, 1}; + size_t local_work_size[] = {nth, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + //------------------------------------------------------------------------------ // Op offloading //------------------------------------------------------------------------------ @@ -6244,6 +6362,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor default: return false; } break; + case GGML_OP_GLU: + if (!any_on_device) { + return false; + } + func = ggml_cl_glu; + break; case GGML_OP_CLAMP: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/glu.cl b/ggml/src/ggml-opencl/kernels/glu.cl new file mode 100644 index 00000000000..ba861d8b18f --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/glu.cl @@ -0,0 +1,201 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define GELU_COEF_A 0.044715f +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f + +//------------------------------------------------------------------------------ +// geglu +//------------------------------------------------------------------------------ +kernel void kernel_geglu( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu = 0.5f*x0*(1.0f + tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = gelu*x1; + } +} + +kernel void kernel_geglu_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + const half gelu = 0.5f*x0*(1.0f + tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = gelu*x1; + } +} + +//------------------------------------------------------------------------------ +// reglu +//------------------------------------------------------------------------------ +kernel void kernel_reglu( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + dst_row[i0] = x0*x1*(x0 > 0.0f); + } +} + +kernel void kernel_reglu_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + dst_row[i0] = x0*x1*(x0 > 0.0f); + } +} + +//------------------------------------------------------------------------------ +// swiglu +//------------------------------------------------------------------------------ +kernel void kernel_swiglu( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = silu*x1; + } +} + +kernel void kernel_swiglu_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + const half silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = silu*x1; + } +} From c4ea72be9a8daa0a09fcea7cad378d2a147ac7d9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 11:05:48 +0300 Subject: [PATCH 0044/1348] ggml : remove trailing whitespace (llama/0) --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b499f378565..c51cb57cc68 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4458,7 +4458,7 @@ static struct ggml_tensor * ggml_interpolate_impl( int64_t ne3, uint32_t mode) { GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); - + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); ggml_set_op_params_i32(result, 0, (int32_t)mode); From 1f816de7da0fd74adf8ce6cc24b116b757a1d966 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 12:21:09 +0300 Subject: [PATCH 0045/1348] talk-llama : sync llama.cpp --- examples/talk-llama/llama-arch.cpp | 72 ++ examples/talk-llama/llama-arch.h | 18 + examples/talk-llama/llama-batch.cpp | 27 +- examples/talk-llama/llama-chat.cpp | 17 +- examples/talk-llama/llama-context.cpp | 79 +- examples/talk-llama/llama-context.h | 24 +- examples/talk-llama/llama-graph.cpp | 189 ++--- examples/talk-llama/llama-graph.h | 71 +- examples/talk-llama/llama-hparams.h | 6 + .../llama-kv-cache-unified-iswa.cpp | 72 +- .../talk-llama/llama-kv-cache-unified-iswa.h | 36 +- .../talk-llama/llama-kv-cache-unified.cpp | 82 +- examples/talk-llama/llama-kv-cache-unified.h | 32 +- examples/talk-llama/llama-kv-cells.h | 42 +- examples/talk-llama/llama-memory-hybrid.cpp | 72 +- examples/talk-llama/llama-memory-hybrid.h | 28 +- .../talk-llama/llama-memory-recurrent.cpp | 89 ++- examples/talk-llama/llama-memory-recurrent.h | 28 +- examples/talk-llama/llama-memory.cpp | 17 + examples/talk-llama/llama-memory.h | 36 +- examples/talk-llama/llama-model.cpp | 725 +++++++++++++++++- examples/talk-llama/llama-model.h | 23 + examples/talk-llama/llama-quant.cpp | 92 ++- examples/talk-llama/llama.h | 9 +- 24 files changed, 1456 insertions(+), 430 deletions(-) diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index 8dadef204f9..aa21108a4bd 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -42,6 +42,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_GEMMA3, "gemma3" }, + { LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, @@ -75,6 +76,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_DOTS1, "dots1" }, { LLM_ARCH_ARCEE, "arcee" }, + { LLM_ARCH_ERNIE4_5, "ernie4_5" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -932,6 +934,42 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, }, }, + { + LLM_ARCH_GEMMA3N, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" }, + { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" }, + { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" }, + { LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" }, + { LLM_TENSOR_ALTUP_PROJ, "altup_proj" }, + { LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" }, + { LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" }, + { LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" }, + { LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" }, + { LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" }, + { LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" }, + { LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" }, + { LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" }, + { LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" }, + { LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" }, + { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" }, + }, + }, { LLM_ARCH_STARCODER2, { @@ -1621,6 +1659,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, } }, + { + LLM_ARCH_ERNIE4_5, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1749,6 +1804,23 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + // altup / laurel (gemma 3n) + {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_PER_LAYER_MODEL_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_PROJ_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_ALTUP_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_UNEMBD_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_INP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ALTUP_CORRECT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_CORRECT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ALTUP_PREDICT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_ROUTER, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_LAUREL_L, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_LAUREL_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index 5b0230c1506..0771ec3ebad 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -46,6 +46,7 @@ enum llm_arch { LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, LLM_ARCH_GEMMA3, + LLM_ARCH_GEMMA3N, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, @@ -79,6 +80,7 @@ enum llm_arch { LLM_ARCH_BAILINGMOE, LLM_ARCH_DOTS1, LLM_ARCH_ARCEE, + LLM_ARCH_ERNIE4_5, LLM_ARCH_UNKNOWN, }; @@ -269,6 +271,22 @@ enum llm_tensor { LLM_TENSOR_LAYER_OUT_NORM, LLM_TENSOR_POST_ATTN_NORM, LLM_TENSOR_POST_MLP_NORM, + LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n + LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n + LLM_TENSOR_PER_LAYER_INP_GATE, // gemma3n + LLM_TENSOR_PER_LAYER_PROJ, // gemma3n + LLM_TENSOR_PER_LAYER_PROJ_NORM, // gemma3n + LLM_TENSOR_PER_LAYER_POST_NORM, // gemma3n + LLM_TENSOR_ALTUP_PROJ, // gemma3n + LLM_TENSOR_ALTUP_UNEMBD_PROJ, // gemma3n + LLM_TENSOR_ALTUP_CORRECT_COEF, // gemma3n + LLM_TENSOR_ALTUP_CORRECT_SCALE, // gemma3n + LLM_TENSOR_ALTUP_PREDICT_COEF, // gemma3n + LLM_TENSOR_ALTUP_ROUTER, // gemma3n + LLM_TENSOR_ALTUP_ROUTER_NORM, // gemma3n + LLM_TENSOR_LAUREL_L, // gemma3n + LLM_TENSOR_LAUREL_R, // gemma3n + LLM_TENSOR_LAUREL_POST_NORM, // gemma3n LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X, diff --git a/examples/talk-llama/llama-batch.cpp b/examples/talk-llama/llama-batch.cpp index b3c996e18ab..91b1d6078a2 100644 --- a/examples/talk-llama/llama-batch.cpp +++ b/examples/talk-llama/llama-batch.cpp @@ -244,22 +244,35 @@ bool llama_batch_allocr::init( continue; } - if (memory) { + const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1; + + if (p0 >= 0) { + bool ok = true; + if (batch.token) { - if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) { - LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s); - return false; + if (seq_pos_min(s) != p0 + 1) { + ok = false; } } else { assert(batch.embd); // for embeddings (typically used as vision input), we allow them to have repeating positions // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762 - if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) { - LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s); - return false; + if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) { + ok = false; } } + + if (!ok) { + LLAMA_LOG_ERROR( + "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n" + " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n" + " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n" + " it is required that the sequence positions remain consecutive: Y = X + 1\n", + __func__, s, s, p0, s, seq_pos_min(s)); + + return false; + } } if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) { diff --git a/examples/talk-llama/llama-chat.cpp b/examples/talk-llama/llama-chat.cpp index 0839cad3ee6..5d317f4ee62 100644 --- a/examples/talk-llama/llama-chat.cpp +++ b/examples/talk-llama/llama-chat.cpp @@ -528,12 +528,17 @@ int32_t llm_chat_apply_template( } } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { // this template requires the model to have "\n\n" as EOT token - for (auto message : chat) { - std::string role(message->role); - if (role == "user") { - ss << "User: " << message->content << "\n\nAssistant:"; - } else { - ss << message->content << "\n\n"; + for (size_t i = 0; i < chat.size(); i++) { + std::string role(chat[i]->role); + if (role == "system") { + ss << "System: " << trim(chat[i]->content) << "\n\n"; + } else if (role == "user") { + ss << "User: " << trim(chat[i]->content) << "\n\n"; + if (i == chat.size() - 1) { + ss << "Assistant:"; + } + } else if (role == "assistant") { + ss << "Assistant: " << trim(chat[i]->content) << "\n\n"; } } } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) { diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index 5a18a4fb393..06e93b19cbf 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -280,8 +280,8 @@ llama_context::llama_context( // simulate full KV cache - const auto mstate = memory->init_full(); - if (!mstate) { + const auto mctx = memory->init_full(); + if (!mctx) { throw std::runtime_error("failed to initialize KV cache"); } @@ -289,7 +289,7 @@ llama_context::llama_context( // reserve pp graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -300,7 +300,7 @@ llama_context::llama_context( // reserve with tg graph to get the number of splits and nodes { - auto * gf = graph_reserve(1, 1, 1, mstate.get()); + auto * gf = graph_reserve(1, 1, 1, mctx.get()); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -311,7 +311,7 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -444,8 +444,8 @@ bool llama_context::kv_self_update(bool optimize) { optimize |= memory_force_optimize; memory_force_optimize = false; - const auto mstate = memory->init_update(this, optimize); - switch (mstate->get_status()) { + const auto mctx = memory->init_update(this, optimize); + switch (mctx->get_status()) { case LLAMA_MEMORY_STATUS_SUCCESS: { // noop @@ -463,22 +463,22 @@ bool llama_context::kv_self_update(bool optimize) { } } - if (!mstate->apply()) { + if (!mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__); } } // if the memory module did any computation, we have to reserve a new worst-case graph { - const auto mstate = memory->init_full(); - if (!mstate) { - throw std::runtime_error("failed to initialize memory state"); + const auto mctx = memory->init_full(); + if (!mctx) { + throw std::runtime_error("failed to initialize memory context"); } const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); if (!gf) { LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__); } @@ -678,9 +678,9 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_state_i * mstate, ggml_status & ret) { - if (mstate && !mstate->apply()) { - LLAMA_LOG_ERROR("%s: failed to apply memory state\n", __func__); +llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { + if (mctx && !mctx->apply()) { + LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); ret = GGML_STATUS_FAILED; return nullptr; } @@ -692,7 +692,7 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, return nullptr; } - auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mstate); + auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx); if (!res) { LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); ret = GGML_STATUS_FAILED; @@ -933,21 +933,21 @@ int llama_context::decode(const llama_batch & batch_inp) { // handle any pending defrags/shifts kv_self_update(false); - llama_memory_state_ptr mstate; + llama_memory_context_ptr mctx; while (true) { - mstate = memory->init_batch(*balloc, cparams.n_ubatch, output_all); - if (!mstate) { + mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all); + if (!mctx) { return -2; } - switch (mstate->get_status()) { + switch (mctx->get_status()) { case LLAMA_MEMORY_STATUS_SUCCESS: { } break; case LLAMA_MEMORY_STATUS_NO_UPDATE: { - LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, mstate->get_status()); + LLAMA_LOG_ERROR("%s: unexpected memory context status: %d\n", __func__, mctx->get_status()); return -2; } @@ -987,7 +987,7 @@ int llama_context::decode(const llama_batch & batch_inp) { int64_t n_outputs_prev = 0; do { - const auto & ubatch = mstate->get_ubatch(); + const auto & ubatch = mctx->get_ubatch(); // count the outputs in this ubatch { @@ -1009,7 +1009,7 @@ int llama_context::decode(const llama_batch & batch_inp) { ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mstate.get(), status); + const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache @@ -1018,7 +1018,6 @@ int llama_context::decode(const llama_batch & batch_inp) { pos_min[s] = std::numeric_limits::max(); } - // TODO: fix sequence indexing for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { const auto & seq_id = ubatch.seq_id[i][0]; @@ -1126,7 +1125,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } n_outputs_prev += n_outputs; - } while (mstate->next()); + } while (mctx->next()); // set to total number of outputs in the batch, for use in llama_get_logits_ith n_outputs = n_outputs_all; @@ -1292,7 +1291,7 @@ ggml_cgraph * llama_context::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } -ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) { +ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); if (n_tokens % n_seqs != 0) { @@ -1312,7 +1311,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx); this->n_outputs = save_n_outputs; @@ -1333,11 +1332,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u } llm_graph_result_ptr llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_state_i * mstate) { + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + llm_graph_type gtype, + const llama_memory_context_i * mctx) { return model.build_graph( { /*.ctx =*/ ctx, @@ -1349,7 +1348,7 @@ llm_graph_result_ptr llama_context::graph_build( /*.backend_cpu =*/ backend_cpu, /*.cvec =*/ &cvec, /*.loras =*/ &loras, - /*.mstate =*/ mstate, + /*.mctx =*/ mctx, /*.cross =*/ &cross, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), @@ -2042,8 +2041,8 @@ void llama_context::opt_epoch_iter( uint32_t n_outputs_all = n_tokens_all; - auto mstate = memory->init_batch(*balloc, cparams.n_ubatch, true); - if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { + auto mctx = memory->init_batch(*balloc, cparams.n_ubatch, true); + if (!mctx || mctx->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__); break; } @@ -2056,17 +2055,17 @@ void llama_context::opt_epoch_iter( uint32_t pos_batch = 0; do { - const auto & ubatch = mstate->get_ubatch(); + const auto & ubatch = mctx->get_ubatch(); n_outputs = ubatch.n_tokens; - if (!mstate->apply()) { - LLAMA_LOG_ERROR("%s: failed to update the memory state\n", __func__); + if (!mctx->apply()) { + LLAMA_LOG_ERROR("%s: failed to update the memory context\n", __func__); break; } auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate.get()); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get()); struct ggml_context * ctx_compute_opt; { @@ -2101,7 +2100,7 @@ void llama_context::opt_epoch_iter( ggml_free(ctx_compute_opt); pos_batch += ubatch.n_tokens; - } while (mstate->next()); + } while (mctx->next()); } } diff --git a/examples/talk-llama/llama-context.h b/examples/talk-llama/llama-context.h index 7d300c14572..9ce05715a8c 100644 --- a/examples/talk-llama/llama-context.h +++ b/examples/talk-llama/llama-context.h @@ -18,7 +18,7 @@ class llama_io_read_i; class llama_io_write_i; struct llama_memory_i; -struct llama_memory_state_i; +struct llama_memory_context_i; struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs @@ -93,14 +93,14 @@ struct llama_context { int32_t il_end); // process a single ubatch with a specific graph type - // if memory_state is provided, it will be applied first to the context's memory + // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation // returns nullptr only if ret != GGML_STATUS_SUCCESS llm_graph_result_ptr process_ubatch( - const llama_ubatch & ubatch, - llm_graph_type gtype, - llama_memory_state_i * mstate, - ggml_status & ret); + const llama_ubatch & ubatch, + llm_graph_type gtype, + llama_memory_context_i * mctx, + ggml_status & ret); int encode(const llama_batch & batch_inp); int decode(const llama_batch & batch_inp); @@ -197,15 +197,15 @@ struct llama_context { ggml_status graph_compute(ggml_cgraph * gf, bool batched); // reserve a graph with a dummy ubatch of the specified size - ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate); + ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); private: llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_state_i * mstate); + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + llm_graph_type gtype, + const llama_memory_context_i * mctx); llm_graph_cb graph_get_cb() const; diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index 7e162c55522..010300df609 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -87,7 +87,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { if (pos_bucket) { - kv_state->set_input_pos_bucket(pos_bucket, ubatch); + mctx->set_input_pos_bucket(pos_bucket, ubatch); } } @@ -221,7 +221,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - const int64_t n_rs = mem_state->get_n_rs(); + const int64_t n_rs = mctx->get_n_rs(); if (s_copy) { GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); @@ -229,7 +229,7 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) { // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_rs; ++i) { - data[i] = mem_state->s_copy(i); + data[i] = mctx->s_copy(i); } } } @@ -282,17 +282,17 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { if (self_kq_mask) { - kv_state->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } } void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { if (self_kq_mask) { - kv_state->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } if (self_kq_mask_swa) { - kv_state->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } } @@ -334,10 +334,10 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { if (self_kq_mask) { - mem_state->get_state_attn()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + mctx->get_attn()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } - const int64_t n_rs = mem_state->get_state_recr()->get_n_rs(); + const int64_t n_rs = mctx->get_recr()->get_n_rs(); if (s_copy) { GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); @@ -345,11 +345,17 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_rs; ++i) { - data[i] = mem_state->get_state_recr()->s_copy(i); + data[i] = mctx->get_recr()->s_copy(i); } } } +void llm_graph_input_one::set_input(const llama_ubatch *) { + GGML_ASSERT(one && ggml_nelements(one) == 1); + float f_one = 1.0f; + ggml_backend_tensor_set(one, &f_one, 0, sizeof(float)); +} + // // llm_graph_context // @@ -389,7 +395,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : backend_cpu (params.backend_cpu), cvec (params.cvec), loras (params.loras), - mstate (params.mstate), + mctx (params.mctx), cross (params.cross), cb_func (params.cb), res (std::make_unique()) { @@ -554,12 +560,20 @@ ggml_tensor * llm_graph_context::build_ffn( switch (type_op) { case LLM_FFN_SILU: - { + if (gate && type_gate == LLM_FFN_PAR) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + type_gate = LLM_FFN_SEQ; + } else { cur = ggml_silu(ctx0, cur); cb(cur, "ffn_silu", il); } break; case LLM_FFN_GELU: - { + if (gate && type_gate == LLM_FFN_PAR) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + type_gate = LLM_FFN_SEQ; + } else { cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_gelu", il); if (act_scales != NULL) { @@ -568,7 +582,11 @@ ggml_tensor * llm_graph_context::build_ffn( } } break; case LLM_FFN_RELU: - { + if (gate && type_gate == LLM_FFN_PAR) { + cur = ggml_reglu_split(ctx0, cur, tmp); + cb(cur, "ffn_reglu", il); + type_gate = LLM_FFN_SEQ; + } else { cur = ggml_relu(ctx0, cur); cb(cur, "ffn_relu", il); } break; @@ -582,32 +600,19 @@ ggml_tensor * llm_graph_context::build_ffn( } break; case LLM_FFN_SWIGLU: { - // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217 - ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx0, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx0, x0, x1); - cb(cur, "ffn_mul", il); + cur = ggml_swiglu(ctx0, cur); + cb(cur, "ffn_swiglu", il); } break; case LLM_FFN_GEGLU: { - // Split into two equal parts - int64_t split_point = cur->ne[0] / 2; - // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217 - ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_gelu(ctx0, x0); - cb(x0, "ffn_gelu", il); - - cur = ggml_mul(ctx0, x0, x1); + cur = ggml_geglu(ctx0, cur); cb(cur, "ffn_geglu", il); } break; + case LLM_FFN_REGLU: + { + cur = ggml_reglu(ctx0, cur); + cb(cur, "ffn_reglu", il); + } break; } if (gate && type_gate == LLM_FFN_PAR) { @@ -737,12 +742,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn( switch (type_op) { case LLM_FFN_SILU: - { + if (gate_exps) { + cur = ggml_swiglu_split(ctx0, cur, up); + cb(cur, "ffn_moe_swiglu", il); + } else { cur = ggml_silu(ctx0, cur); cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: - { + if (gate_exps) { + cur = ggml_geglu_split(ctx0, cur, up); + cb(cur, "ffn_moe_geglu", il); + } else { cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_moe_gelu", il); } break; @@ -750,11 +761,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } - if (gate_exps) { - cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] - cb(cur, "ffn_moe_gate_par", il); - } - experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); @@ -950,11 +956,11 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const { } ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - auto inp = std::make_unique(hparams, kv_state); + auto inp = std::make_unique(hparams, mctx_cur); - const auto n_kv = kv_state->get_n_kv(); + const auto n_kv = mctx_cur->get_n_kv(); auto & cur = inp->pos_bucket; @@ -982,14 +988,14 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t } llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { - const auto * mem_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - auto inp = std::make_unique(hparams, cparams, mem_state); + auto inp = std::make_unique(hparams, cparams, mctx_cur); { GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Hybrid recurrent is not supported with SWA attention layers"); - const auto n_kv = inp->mem_state->get_state_attn()->get_n_kv(); + const auto n_kv = inp->mctx->get_attn()->get_n_kv(); inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask, "KQ_mask", -1); @@ -999,7 +1005,7 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { } { - const auto n_rs = mem_state->get_state_recr()->get_n_rs(); + const auto n_rs = mctx_cur->get_recr()->get_n_rs(); inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs); ggml_set_input(inp->s_copy); @@ -1183,14 +1189,14 @@ ggml_tensor * llm_graph_context::build_attn( } llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - auto inp = std::make_unique(hparams, cparams, kv_state); + auto inp = std::make_unique(hparams, cparams, mctx_cur); { GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); - const auto n_kv = kv_state->get_n_kv(); + const auto n_kv = mctx_cur->get_n_kv(); inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask, "KQ_mask", -1); @@ -1220,19 +1226,19 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); // store to KV cache { - ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il)); + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); + ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); } const auto & kq_mask = inp->get_kq_mask(); ggml_tensor * q = q_cur; - ggml_tensor * k = kv_state->get_k(ctx0, il); - ggml_tensor * v = kv_state->get_v(ctx0, il); + ggml_tensor * k = mctx_cur->get_k(ctx0, il); + ggml_tensor * v = mctx_cur->get_v(ctx0, il); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); @@ -1267,26 +1273,35 @@ ggml_tensor * llm_graph_context::build_attn( // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); - const auto * kv_state_iswa = static_cast(mstate); + if (k_cur) { + ggml_build_forward_expand(gf, k_cur); + } + + if (v_cur) { + ggml_build_forward_expand(gf, v_cur); + } + + const auto * mctx_iswa = static_cast(mctx); const bool is_swa = hparams.is_swa(il); - const auto * kv_state = is_swa ? kv_state_iswa->get_swa() : kv_state_iswa->get_base(); + const auto * mctx_cur = is_swa ? mctx_iswa->get_swa() : mctx_iswa->get_base(); - // store to KV cache - { - ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il)); + // optionally store to KV cache + if (k_cur) { + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); + } + + if (v_cur) { + ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); } const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); ggml_tensor * q = q_cur; - ggml_tensor * k = kv_state->get_k(ctx0, il); - ggml_tensor * v = kv_state->get_v(ctx0, il); + ggml_tensor * k = mctx_cur->get_k(ctx0, il); + ggml_tensor * v = mctx_cur->get_v(ctx0, il); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); @@ -1379,19 +1394,19 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - const auto * kv_state = static_cast(mstate)->get_state_attn(); + const auto * mctx_cur = static_cast(mctx)->get_attn(); // store to KV cache { - ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il)); + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); + ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); } const auto & kq_mask = inp->get_kq_mask(); ggml_tensor * q = q_cur; - ggml_tensor * k = kv_state->get_k(ctx0, il); - ggml_tensor * v = kv_state->get_v(ctx0, il); + ggml_tensor * k = mctx_cur->get_k(ctx0, il); + ggml_tensor * v = mctx_cur->get_v(ctx0, il); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); @@ -1412,12 +1427,12 @@ ggml_tensor * llm_graph_context::build_attn( } llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - auto inp = std::make_unique(hparams, cparams, kv_state); + auto inp = std::make_unique(hparams, cparams, mctx_cur); { - const auto n_kv = kv_state->get_base()->get_n_kv(); + const auto n_kv = mctx_cur->get_base()->get_n_kv(); inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask, "KQ_mask", -1); @@ -1429,7 +1444,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif { GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA"); - const auto n_kv = kv_state->get_swa()->get_n_kv(); + const auto n_kv = mctx_cur->get_swa()->get_n_kv(); inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1); @@ -1485,11 +1500,11 @@ ggml_tensor * llm_graph_context::build_rs( } llm_graph_input_rs * llm_graph_context::build_rs_inp() const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - auto inp = std::make_unique(kv_state); + auto inp = std::make_unique(mctx_cur); - const auto n_rs = kv_state->get_n_rs(); + const auto n_rs = mctx_cur->get_n_rs(); inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs); ggml_set_input(inp->s_copy); @@ -1504,9 +1519,9 @@ ggml_tensor * llm_graph_context::build_rs( int32_t state_size, int32_t n_seqs, bool avoid_copies) const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), avoid_copies); + return build_rs(gf, s, inp->s_copy, state_size, n_seqs, mctx_cur->get_n_rs(), mctx_cur->get_head(), mctx_cur->get_size(), mctx_cur->get_rs_z(), avoid_copies); } ggml_tensor * llm_graph_context::build_rs( @@ -1516,9 +1531,9 @@ ggml_tensor * llm_graph_context::build_rs( int32_t state_size, int32_t n_seqs, bool avoid_copies) const { - const auto * kv_state = static_cast(mstate)->get_state_recr(); + const auto * mctx_cur = static_cast(mctx)->get_recr(); - return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), avoid_copies); + return build_rs(gf, s, inp->s_copy, state_size, n_seqs, mctx_cur->get_n_rs(), mctx_cur->get_head(), mctx_cur->get_size(), mctx_cur->get_rs_z(), avoid_copies); } ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( @@ -1526,13 +1541,13 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( ggml_cgraph * gf, const llama_ubatch & ubatch, int il) const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); const auto token_shift_count = hparams.token_shift_count; const int64_t n_seqs = ubatch.n_seqs; - ggml_tensor * token_shift_all = kv_state->get_r_l(il); + ggml_tensor * token_shift_all = mctx_cur->get_r_l(il); ggml_tensor * token_shift = build_rs( inp, gf, token_shift_all, @@ -1547,19 +1562,19 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store( ggml_tensor * token_shift, const llama_ubatch & ubatch, int il) const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); const auto token_shift_count = hparams.token_shift_count; const auto n_embd = hparams.n_embd; const int64_t n_seqs = ubatch.n_seqs; - const auto kv_head = kv_state->get_head(); + const auto kv_head = mctx_cur->get_head(); return ggml_cpy( ctx0, ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), - ggml_view_1d(ctx0, kv_state->get_r_l(il), hparams.n_embd_r()*n_seqs, hparams.n_embd_r()*kv_head*ggml_element_size(kv_state->get_r_l(il))) + ggml_view_1d(ctx0, mctx_cur->get_r_l(il), hparams.n_embd_r()*n_seqs, hparams.n_embd_r()*kv_head*ggml_element_size(mctx_cur->get_r_l(il))) ); } diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index 9e62fa60720..ceddb6021f1 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -17,12 +17,12 @@ struct ggml_tensor; struct llama_ubatch; struct llama_cparams; -struct llama_memory_state_i; +struct llama_memory_context_i; -class llama_kv_cache_unified_state; -class llama_kv_cache_unified_iswa_state; -class llama_memory_recurrent_state; -class llama_memory_hybrid_state; +class llama_kv_cache_unified_context; +class llama_kv_cache_unified_iswa_context; +class llama_memory_recurrent_context; +class llama_memory_hybrid_context; // certain models (typically multi-modal) can produce different types of graphs enum llm_graph_type { @@ -38,6 +38,7 @@ enum llm_ffn_op_type { LLM_FFN_RELU_SQR, LLM_FFN_SWIGLU, LLM_FFN_GEGLU, + LLM_FFN_REGLU, }; enum llm_ffn_gate_type { @@ -136,7 +137,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i { public: llm_graph_input_pos_bucket_kv( const llama_hparams & hparams, - const llama_kv_cache_unified_state * kv_state) : hparams(hparams), kv_state(kv_state) {} + const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {} virtual ~llm_graph_input_pos_bucket_kv() = default; void set_input(const llama_ubatch * ubatch) override; @@ -144,7 +145,8 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i { ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch] const llama_hparams & hparams; - const llama_kv_cache_unified_state * kv_state; + + const llama_kv_cache_unified_context * mctx; }; class llm_graph_input_out_ids : public llm_graph_input_i { @@ -191,14 +193,14 @@ class llm_graph_input_cls : public llm_graph_input_i { class llm_graph_input_rs : public llm_graph_input_i { public: - llm_graph_input_rs(const llama_memory_recurrent_state * mem_state) : mem_state(mem_state) {} + llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {} virtual ~llm_graph_input_rs() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * s_copy; // I32 [kv_size] - const llama_memory_recurrent_state * mem_state; + const llama_memory_recurrent_context * mctx; }; class llm_graph_input_cross_embd : public llm_graph_input_i { @@ -238,10 +240,10 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { llm_graph_input_attn_kv_unified( const llama_hparams & hparams, const llama_cparams & cparams, - const llama_kv_cache_unified_state * kv_state) : + const llama_kv_cache_unified_context * mctx) : hparams(hparams), cparams(cparams), - kv_state(kv_state) { + mctx(mctx) { } ~llm_graph_input_attn_kv_unified() = default; @@ -255,7 +257,7 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { const llama_hparams & hparams; const llama_cparams & cparams; - const llama_kv_cache_unified_state * kv_state; + const llama_kv_cache_unified_context * mctx; }; class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { @@ -263,10 +265,10 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { llm_graph_input_attn_kv_unified_iswa( const llama_hparams & hparams, const llama_cparams & cparams, - const llama_kv_cache_unified_iswa_state * kv_state) : + const llama_kv_cache_unified_iswa_context * mctx) : hparams(hparams), cparams(cparams), - kv_state(kv_state) { + mctx(mctx) { } ~llm_graph_input_attn_kv_unified_iswa() = default; @@ -283,7 +285,7 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { const llama_hparams & hparams; const llama_cparams & cparams; - const llama_kv_cache_unified_iswa_state * kv_state; + const llama_kv_cache_unified_iswa_context * mctx; }; class llm_graph_input_attn_cross : public llm_graph_input_i { @@ -306,10 +308,10 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i { llm_graph_input_mem_hybrid( const llama_hparams & hparams, const llama_cparams & cparams, - const llama_memory_hybrid_state * mem_state) : + const llama_memory_hybrid_context * mctx) : hparams(hparams), cparams(cparams), - mem_state(mem_state) { + mctx(mctx) { } virtual ~llm_graph_input_mem_hybrid() = default; @@ -325,7 +327,18 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i { const llama_hparams & hparams; const llama_cparams & cparams; - const llama_memory_hybrid_state * mem_state; + const llama_memory_hybrid_context * mctx; +}; + +// TODO: remove this when ggml_scale_add is implemented +class llm_graph_input_one : public llm_graph_input_i { +public: + llm_graph_input_one() {} + virtual ~llm_graph_input_one() = default; + + void set_input(const llama_ubatch *) override; + + ggml_tensor * one = nullptr; // F32 }; // @@ -401,10 +414,10 @@ struct llm_graph_params { ggml_backend_sched_t sched; ggml_backend_t backend_cpu; - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_state_i * mstate; - const llama_cross * cross; + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_context_i * mctx; + const llama_cross * cross; uint32_t n_outputs; @@ -453,16 +466,17 @@ struct llm_graph_context { ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_state_i * mstate; - const llama_cross * cross; + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_context_i * mctx; + const llama_cross * cross; const llm_graph_cb & cb_func; std::unique_ptr res; llm_graph_context(const llm_graph_params & params); + virtual ~llm_graph_context() = default; void cb(ggml_tensor * cur, const char * name, int il) const; @@ -588,14 +602,15 @@ struct llm_graph_context { llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const; + // note: if k_cur or v_cur are not provided, they will not be stored in the memory ggml_tensor * build_attn( llm_graph_input_attn_kv_unified_iswa * inp, ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] - ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] + ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional + ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional ggml_tensor * kq_b, ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index 7b315a9a74b..e85afe145a9 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -143,6 +143,12 @@ struct llama_hparams { uint32_t n_attn_temp_floor_scale = 8192; float f_attn_temp_scale = 0.1; + // gemma3n altup + uint32_t n_altup = 4; // altup_num_inputs + uint32_t i_altup_act = 0; // altup_active_idx + uint32_t laurel_rank = 64; + uint32_t n_embd_altup = 256; + // needed by encoder-decoder models (e.g. T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp index 0ced340dec6..d1f839b63aa 100644 --- a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +++ b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp @@ -95,7 +95,7 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } -llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { +llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { GGML_UNUSED(embd_all); // first try simple split @@ -125,7 +125,7 @@ llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_alloc assert(heads_base.size() == heads_swa.size()); - return std::make_unique( + return std::make_unique( this, std::move(heads_base), std::move(heads_swa), std::move(ubatches)); } while (false); @@ -156,22 +156,22 @@ llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_alloc assert(heads_base.size() == heads_swa.size()); - return std::make_unique( + return std::make_unique( this, std::move(heads_base), std::move(heads_swa), std::move(ubatches)); } while (false); // TODO: if we fail again, we should attempt different splitting strategies // but to do that properly, we first have to refactor the batches to be more flexible - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } -llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() { - return std::make_unique(this); +llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() { + return std::make_unique(this); } -llama_memory_state_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) { - return std::make_unique(this, lctx, optimize); +llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) { + return std::make_unique(this, lctx, optimize); } bool llama_kv_cache_unified_iswa::get_can_shift() const { @@ -197,46 +197,46 @@ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const { } // -// llama_kv_cache_unified_iswa_state +// llama_kv_cache_unified_iswa_context // -llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {} +llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {} -llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state( +llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv) : - state_base(kv->get_base()->init_full()), - state_swa (kv->get_swa ()->init_full()), - status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) { + ctx_base(kv->get_base()->init_full()), + ctx_swa (kv->get_swa ()->init_full()), + status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } -llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state( +llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv, llama_context * lctx, bool optimize) : - state_base(kv->get_base()->init_update(lctx, optimize)), - state_swa (kv->get_swa ()->init_update(lctx, optimize)), - status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) { + ctx_base(kv->get_base()->init_update(lctx, optimize)), + ctx_swa (kv->get_swa ()->init_update(lctx, optimize)), + status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } -llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state( +llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv, std::vector heads_base, std::vector heads_swa, std::vector ubatches) : ubatches(std::move(ubatches)), // note: here we copy the ubatches. not sure if this is ideal - state_base(new llama_kv_cache_unified_state(kv->get_base(), std::move(heads_base), this->ubatches)), - state_swa (new llama_kv_cache_unified_state(kv->get_swa (), std::move(heads_swa), this->ubatches)), - status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) { + ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(heads_base), this->ubatches)), + ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(heads_swa), this->ubatches)), + status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } -llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default; +llama_kv_cache_unified_iswa_context:: ~llama_kv_cache_unified_iswa_context() = default; -bool llama_kv_cache_unified_iswa_state::next() { +bool llama_kv_cache_unified_iswa_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - state_base->next(); - state_swa ->next(); + ctx_base->next(); + ctx_swa ->next(); if (++i_next >= ubatches.size()) { return false; @@ -245,35 +245,35 @@ bool llama_kv_cache_unified_iswa_state::next() { return true; } -bool llama_kv_cache_unified_iswa_state::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); +bool llama_kv_cache_unified_iswa_context::apply() { + assert(!llama_memory_status_is_fail(status)); bool res = true; - res = res & state_base->apply(); - res = res & state_swa ->apply(); + res = res & ctx_base->apply(); + res = res & ctx_swa ->apply(); return res; } -llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const { +llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const { return status; } -const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const { +const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); return ubatches[i_next]; } -const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const { +const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - return static_cast(state_base.get()); + return static_cast(ctx_base.get()); } -const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa() const { +const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_swa() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - return static_cast(state_swa.get()); + return static_cast(ctx_swa.get()); } diff --git a/examples/talk-llama/llama-kv-cache-unified-iswa.h b/examples/talk-llama/llama-kv-cache-unified-iswa.h index 071041585db..46c1ed614f2 100644 --- a/examples/talk-llama/llama-kv-cache-unified-iswa.h +++ b/examples/talk-llama/llama-kv-cache-unified-iswa.h @@ -31,14 +31,14 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { // llama_memory_i // - llama_memory_state_ptr init_batch( + llama_memory_context_ptr init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) override; - llama_memory_state_ptr init_full() override; + llama_memory_context_ptr init_full() override; - llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override; + llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; bool get_can_shift() const override; @@ -72,32 +72,32 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { std::unique_ptr kv_swa; }; -class llama_kv_cache_unified_iswa_state : public llama_memory_state_i { +class llama_kv_cache_unified_iswa_context : public llama_memory_context_i { public: // used for errors - llama_kv_cache_unified_iswa_state(llama_memory_status status); + llama_kv_cache_unified_iswa_context(llama_memory_status status); - // used to create a full-cache state - llama_kv_cache_unified_iswa_state( + // used to create a full-cache context + llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv); - // used to create an update state - llama_kv_cache_unified_iswa_state( + // used to create an update context + llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv, llama_context * lctx, bool optimize); - // used to create a state from a batch - llama_kv_cache_unified_iswa_state( + // used to create a batch processing context from a batch + llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv, std::vector heads_base, std::vector heads_swa, std::vector ubatches); - virtual ~llama_kv_cache_unified_iswa_state(); + virtual ~llama_kv_cache_unified_iswa_context(); // - // llama_memory_state_i + // llama_memory_context_i // bool next() override; @@ -107,11 +107,11 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i { const llama_ubatch & get_ubatch() const override; // - // llama_kv_cache_unified_iswa_state specific API + // llama_kv_cache_unified_iswa_context specific API // - const llama_kv_cache_unified_state * get_base() const; - const llama_kv_cache_unified_state * get_swa() const; + const llama_kv_cache_unified_context * get_base() const; + const llama_kv_cache_unified_context * get_swa() const; private: //llama_kv_cache_unified_iswa * kv; @@ -121,8 +121,8 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i { std::vector ubatches; - const llama_memory_state_ptr state_base; - const llama_memory_state_ptr state_swa; + const llama_memory_context_ptr ctx_base; + const llama_memory_context_ptr ctx_swa; const llama_memory_status status; }; diff --git a/examples/talk-llama/llama-kv-cache-unified.cpp b/examples/talk-llama/llama-kv-cache-unified.cpp index 6897b797153..7f7b162ffd7 100644 --- a/examples/talk-llama/llama-kv-cache-unified.cpp +++ b/examples/talk-llama/llama-kv-cache-unified.cpp @@ -33,13 +33,19 @@ llama_kv_cache_unified::llama_kv_cache_unified( GGML_ASSERT(kv_size % n_pad == 0); + // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] + auto n_layer_cache = hparams.n_layer; + if (model.arch == LLM_ARCH_GEMMA3N) { + n_layer_cache = 20; + } + // create a context for each buffer type std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(2u*n_layer_cache*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -62,7 +68,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( cells.resize(kv_size); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer_cache; il++) { if (filter && !filter(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); continue; @@ -102,6 +108,26 @@ llama_kv_cache_unified::llama_kv_cache_unified( layers.push_back({ il, k, v }); } + // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] + if (model.arch == LLM_ARCH_GEMMA3N) { + LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1); + + for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) { + if (filter && !filter(il)) { + LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); + continue; + } + + const bool is_swa = hparams.is_swa(il); + const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1); + + GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end()); + map_layer_ids[il] = map_layer_ids[il_reuse]; + + LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa); + } + } + // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto it : ctx_map) { auto * buft = it.first; @@ -307,7 +333,7 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { return cells.seq_pos_max(seq_id); } -llama_memory_state_ptr llama_kv_cache_unified::init_batch( +llama_memory_context_ptr llama_kv_cache_unified::init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { @@ -332,18 +358,18 @@ llama_memory_state_ptr llama_kv_cache_unified::init_batch( break; } - return std::make_unique( + return std::make_unique( this, std::move(heads), std::move(ubatches)); } while (false); - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } -llama_memory_state_ptr llama_kv_cache_unified::init_full() { - return std::make_unique(this); +llama_memory_context_ptr llama_kv_cache_unified::init_full() { + return std::make_unique(this); } -llama_memory_state_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) { +llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) { bool do_shift = get_has_shift(); defrag_info dinfo; @@ -373,7 +399,7 @@ llama_memory_state_ptr llama_kv_cache_unified::init_update(llama_context * lctx, } } - return std::make_unique(this, lctx, do_shift, std::move(dinfo)); + return std::make_unique(this, lctx, do_shift, std::move(dinfo)); } llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector & ubatches) { @@ -1710,18 +1736,18 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell } // -// llama_kv_cache_unified_state +// llama_kv_cache_unified_context // -llama_kv_cache_unified_state::llama_kv_cache_unified_state(llama_memory_status status) : status(status) {} +llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {} -llama_kv_cache_unified_state::llama_kv_cache_unified_state( +llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) { n_kv = kv->get_size(); head = 0; } -llama_kv_cache_unified_state::llama_kv_cache_unified_state( +llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv, llama_context * lctx, bool do_shift, @@ -1731,15 +1757,15 @@ llama_kv_cache_unified_state::llama_kv_cache_unified_state( } } -llama_kv_cache_unified_state::llama_kv_cache_unified_state( +llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv, llama_kv_cache_unified::ubatch_heads heads, std::vector ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), heads(std::move(heads)), ubatches(std::move(ubatches)) { } -llama_kv_cache_unified_state::~llama_kv_cache_unified_state() = default; +llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default; -bool llama_kv_cache_unified_state::next() { +bool llama_kv_cache_unified_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); if (++i_next >= ubatches.size()) { @@ -1749,8 +1775,8 @@ bool llama_kv_cache_unified_state::next() { return true; } -bool llama_kv_cache_unified_state::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); +bool llama_kv_cache_unified_context::apply() { + assert(!llama_memory_status_is_fail(status)); // no ubatches -> this is a KV cache update if (ubatches.empty()) { @@ -1767,45 +1793,45 @@ bool llama_kv_cache_unified_state::apply() { return true; } -llama_memory_status llama_kv_cache_unified_state::get_status() const { +llama_memory_status llama_kv_cache_unified_context::get_status() const { return status; } -const llama_ubatch & llama_kv_cache_unified_state::get_ubatch() const { +const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); return ubatches[i_next]; } -uint32_t llama_kv_cache_unified_state::get_n_kv() const { +uint32_t llama_kv_cache_unified_context::get_n_kv() const { return n_kv; } -ggml_tensor * llama_kv_cache_unified_state::get_k(ggml_context * ctx, int32_t il) const { +ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { return kv->get_k(ctx, il, n_kv); } -ggml_tensor * llama_kv_cache_unified_state::get_v(ggml_context * ctx, int32_t il) const { +ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const { return kv->get_v(ctx, il, n_kv); } -ggml_tensor * llama_kv_cache_unified_state::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const { +ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const { return kv->cpy_k(ctx, k_cur, il, head); } -ggml_tensor * llama_kv_cache_unified_state::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const { +ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const { return kv->cpy_v(ctx, v_cur, il, head); } -void llama_kv_cache_unified_state::set_input_k_shift(ggml_tensor * dst) const { +void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const { kv->set_input_k_shift(dst); } -void llama_kv_cache_unified_state::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { +void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { kv->set_input_kq_mask(dst, ubatch, causal_attn); } -void llama_kv_cache_unified_state::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { +void llama_kv_cache_unified_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { kv->set_input_pos_bucket(dst, ubatch); } diff --git a/examples/talk-llama/llama-kv-cache-unified.h b/examples/talk-llama/llama-kv-cache-unified.h index 1560640045c..4c53f1273ab 100644 --- a/examples/talk-llama/llama-kv-cache-unified.h +++ b/examples/talk-llama/llama-kv-cache-unified.h @@ -56,14 +56,14 @@ class llama_kv_cache_unified : public llama_memory_i { // llama_memory_i // - llama_memory_state_ptr init_batch( + llama_memory_context_ptr init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) override; - llama_memory_state_ptr init_full() override; + llama_memory_context_ptr init_full() override; - llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override; + llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; bool get_can_shift() const override; @@ -208,36 +208,36 @@ class llama_kv_cache_unified : public llama_memory_i { bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; -class llama_kv_cache_unified_state : public llama_memory_state_i { +class llama_kv_cache_unified_context : public llama_memory_context_i { public: // some shorthands using ubatch_heads = llama_kv_cache_unified::ubatch_heads; using defrag_info = llama_kv_cache_unified::defrag_info; // used for errors - llama_kv_cache_unified_state(llama_memory_status status); + llama_kv_cache_unified_context(llama_memory_status status); - // used to create a full-cache state - llama_kv_cache_unified_state( + // used to create a full-cache context + llama_kv_cache_unified_context( llama_kv_cache_unified * kv); - // used to create an update state - llama_kv_cache_unified_state( + // used to create an update context + llama_kv_cache_unified_context( llama_kv_cache_unified * kv, llama_context * lctx, bool do_shift, defrag_info dinfo); - // used to create a decode state from a batch - llama_kv_cache_unified_state( + // used to create a batch procesing context from a batch + llama_kv_cache_unified_context( llama_kv_cache_unified * kv, ubatch_heads heads, std::vector ubatches); - virtual ~llama_kv_cache_unified_state(); + virtual ~llama_kv_cache_unified_context(); // - // llama_memory_state_i + // llama_memory_context_i // bool next() override; @@ -247,7 +247,7 @@ class llama_kv_cache_unified_state : public llama_memory_state_i { const llama_ubatch & get_ubatch() const override; // - // llama_kv_cache_unified_state specific API + // llama_kv_cache_unified_context specific API // uint32_t get_n_kv() const; @@ -272,7 +272,7 @@ class llama_kv_cache_unified_state : public llama_memory_state_i { llama_context * lctx; // - // update state + // update context // bool do_shift = false; @@ -280,7 +280,7 @@ class llama_kv_cache_unified_state : public llama_memory_state_i { defrag_info dinfo; // - // batch processing state + // batch processing context // // the index of the next ubatch to process diff --git a/examples/talk-llama/llama-kv-cells.h b/examples/talk-llama/llama-kv-cells.h index 349e9032e24..c95d635948b 100644 --- a/examples/talk-llama/llama-kv-cells.h +++ b/examples/talk-llama/llama-kv-cells.h @@ -7,6 +7,7 @@ #include #include #include +#include // meta information about KV cells that can be part of multiple sequences at the same time // TODO: add unit tests @@ -164,7 +165,7 @@ class llama_kv_cells_unified { assert(seq_id >= 0); seq[i].reset(seq_id); - seq_pos[seq_id].erase(pos[i]); + seq_pos_dec(seq_id, pos[i]); if (seq[i].none()) { pos[i] = -1; @@ -187,7 +188,7 @@ class llama_kv_cells_unified { seq[i].reset(); seq[i].set(seq_id); - seq_pos[seq_id].insert(pos[i]); + seq_pos_inc(seq_id, pos[i]); return false; } @@ -232,7 +233,7 @@ class llama_kv_cells_unified { assert(!seq[i].test(seq_id)); seq[i].set(seq_id); - seq_pos[seq_id].insert(pos[i]); + seq_pos_inc(seq_id, pos[i]); } // return the sequence id of this cell @@ -259,7 +260,9 @@ class llama_kv_cells_unified { return -1; } - return *seq_pos[seq_id].begin(); + assert(seq_pos[seq_id].begin()->second > 0); + + return seq_pos[seq_id].begin()->first; } // the maximum position of sequence seq_id currently present in any of the cells @@ -272,7 +275,9 @@ class llama_kv_cells_unified { return -1; } - return *seq_pos[seq_id].rbegin(); + assert(seq_pos[seq_id].rbegin()->second > 0); + + return seq_pos[seq_id].rbegin()->first; } // note: call only if the cell is not empty @@ -389,17 +394,36 @@ class llama_kv_cells_unified { // the bitset seq[i] tells us which sequences are currently occupying the i-th cell std::vector seq; - // the set seq_pos[s] tells us which positions are currently present for sequence s + // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s + // if the position p is not present, seq_pos[s][p] is not set // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache - std::set seq_pos[LLAMA_MAX_SEQ]; + // + // note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq: + // - during performing a cache reuse via (rm + add) + // - some vision models have input embeddings with repeating positions + // + std::map seq_pos[LLAMA_MAX_SEQ]; // helper functions for updating `seq_pos`, once cell at a time: + void seq_pos_dec(llama_seq_id s, llama_pos p) { + auto it = seq_pos[s].find(p); + assert(it != seq_pos[s].end()); + + if (--it->second == 0) { + seq_pos[s].erase(it); + } + } + + void seq_pos_inc(llama_seq_id s, llama_pos p) { + seq_pos[s][p]++; + } + // remove cell i void seq_pos_rm(uint32_t i) { for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { if (seq[i].test(s)) { - seq_pos[s].erase(pos[i]); + seq_pos_dec(s, pos[i]); } } } @@ -408,7 +432,7 @@ class llama_kv_cells_unified { void seq_pos_add(uint32_t i) { for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { if (seq[i].test(s)) { - seq_pos[s].insert(pos[i]); + seq_pos_inc(s, pos[i]); } } } diff --git a/examples/talk-llama/llama-memory-hybrid.cpp b/examples/talk-llama/llama-memory-hybrid.cpp index 1b16686819e..67cbf955482 100644 --- a/examples/talk-llama/llama-memory-hybrid.cpp +++ b/examples/talk-llama/llama-memory-hybrid.cpp @@ -56,7 +56,7 @@ llama_memory_hybrid::llama_memory_hybrid( n_seq_max )) {} -llama_memory_state_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { +llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { do { balloc.split_reset(); @@ -82,31 +82,31 @@ llama_memory_state_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ball // prepare the recurrent batches first if (!mem_recr->prepare(ubatches)) { - // TODO: will the recurrent cache be in an undefined state at this point? + // TODO: will the recurrent cache be in an undefined context at this point? LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__); - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } // prepare the attention cache auto heads_attn = mem_attn->prepare(ubatches); if (heads_attn.empty()) { LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__); - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } - return std::make_unique( + return std::make_unique( this, std::move(heads_attn), std::move(ubatches)); } while(false); - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } -llama_memory_state_ptr llama_memory_hybrid::init_full() { - return std::make_unique(this); +llama_memory_context_ptr llama_memory_hybrid::init_full() { + return std::make_unique(this); } -llama_memory_state_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) { - return std::make_unique(this, lctx, optimize); +llama_memory_context_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) { + return std::make_unique(this, lctx, optimize); } bool llama_memory_hybrid::get_can_shift() const { @@ -176,39 +176,39 @@ llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const { return mem_recr.get(); } -llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_status status) : status(status) {} +llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_status status) : status(status) {} -llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_hybrid * mem) : - state_attn(mem->get_mem_attn()->init_full()), - state_recr(mem->get_mem_recr()->init_full()), - status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) { +llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_hybrid * mem) : + ctx_attn(mem->get_mem_attn()->init_full()), + ctx_recr(mem->get_mem_recr()->init_full()), + status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { } -llama_memory_hybrid_state::llama_memory_hybrid_state( +llama_memory_hybrid_context::llama_memory_hybrid_context( llama_memory_hybrid * mem, llama_context * lctx, bool optimize) : - state_attn(mem->get_mem_attn()->init_update(lctx, optimize)), - state_recr(mem->get_mem_recr()->init_update(lctx, optimize)), - status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) { + ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)), + ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)), + status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { } -llama_memory_hybrid_state::llama_memory_hybrid_state( +llama_memory_hybrid_context::llama_memory_hybrid_context( llama_memory_hybrid * mem, std::vector heads_attn, std::vector ubatches) : ubatches(std::move(ubatches)), // note: here we copy the ubatches. not sure if this is ideal - state_attn(new llama_kv_cache_unified_state(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)), - state_recr(new llama_memory_recurrent_state(mem->get_mem_recr(), this->ubatches)), - status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) { + ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)), + ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)), + status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { } -bool llama_memory_hybrid_state::next() { +bool llama_memory_hybrid_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - state_attn->next(); - state_recr->next(); + ctx_attn->next(); + ctx_recr->next(); if (++i_next >= ubatches.size()) { return false; @@ -217,30 +217,30 @@ bool llama_memory_hybrid_state::next() { return true; } -bool llama_memory_hybrid_state::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); +bool llama_memory_hybrid_context::apply() { + assert(!llama_memory_status_is_fail(status)); bool res = true; - res = res & state_attn->apply(); - res = res & state_recr->apply(); + res = res & ctx_attn->apply(); + res = res & ctx_recr->apply(); return res; } -llama_memory_status llama_memory_hybrid_state::get_status() const { +llama_memory_status llama_memory_hybrid_context::get_status() const { return status; } -const llama_ubatch & llama_memory_hybrid_state::get_ubatch() const { +const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); return ubatches[i_next]; } -const llama_kv_cache_unified_state * llama_memory_hybrid_state::get_state_attn() const { - return static_cast(state_attn.get()); +const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const { + return static_cast(ctx_attn.get()); } -const llama_memory_recurrent_state * llama_memory_hybrid_state::get_state_recr() const { - return static_cast(state_recr.get()); +const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const { + return static_cast(ctx_recr.get()); } diff --git a/examples/talk-llama/llama-memory-hybrid.h b/examples/talk-llama/llama-memory-hybrid.h index 4d27ab896aa..f0c2420e9a2 100644 --- a/examples/talk-llama/llama-memory-hybrid.h +++ b/examples/talk-llama/llama-memory-hybrid.h @@ -49,14 +49,14 @@ class llama_memory_hybrid : public llama_memory_i { // llama_memory_i // - llama_memory_state_ptr init_batch( + llama_memory_context_ptr init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) override; - llama_memory_state_ptr init_full() override; + llama_memory_context_ptr init_full() override; - llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override; + llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; bool get_can_shift() const override; @@ -90,27 +90,27 @@ class llama_memory_hybrid : public llama_memory_i { const std::unique_ptr mem_recr; }; -class llama_memory_hybrid_state : public llama_memory_state_i { +class llama_memory_hybrid_context : public llama_memory_context_i { public: // init failure - explicit llama_memory_hybrid_state(llama_memory_status status); + explicit llama_memory_hybrid_context(llama_memory_status status); // init full - explicit llama_memory_hybrid_state(llama_memory_hybrid * mem); + explicit llama_memory_hybrid_context(llama_memory_hybrid * mem); // init update - explicit llama_memory_hybrid_state( + explicit llama_memory_hybrid_context( llama_memory_hybrid * mem, llama_context * lctx, bool optimize); // init success - llama_memory_hybrid_state( + llama_memory_hybrid_context( llama_memory_hybrid * mem, std::vector heads_attn, std::vector ubatches); - ~llama_memory_hybrid_state() = default; + ~llama_memory_hybrid_context() = default; bool next() override; bool apply() override; @@ -119,11 +119,11 @@ class llama_memory_hybrid_state : public llama_memory_state_i { const llama_ubatch & get_ubatch() const override; // - // llama_memory_hybrid_state + // llama_memory_hybrid_context // - const llama_kv_cache_unified_state * get_state_attn() const; - const llama_memory_recurrent_state * get_state_recr() const; + const llama_kv_cache_unified_context * get_attn() const; + const llama_memory_recurrent_context * get_recr() const; private: // the index of the next ubatch to process @@ -131,8 +131,8 @@ class llama_memory_hybrid_state : public llama_memory_state_i { std::vector ubatches; - const llama_memory_state_ptr state_attn; - const llama_memory_state_ptr state_recr; + const llama_memory_context_ptr ctx_attn; + const llama_memory_context_ptr ctx_recr; const llama_memory_status status; }; diff --git a/examples/talk-llama/llama-memory-recurrent.cpp b/examples/talk-llama/llama-memory-recurrent.cpp index b064da0084c..6ed84057ccf 100644 --- a/examples/talk-llama/llama-memory-recurrent.cpp +++ b/examples/talk-llama/llama-memory-recurrent.cpp @@ -362,42 +362,47 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { return result; } -llama_memory_state_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { - std::vector ubatches; +llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { + do { + balloc.split_reset(); - while (true) { - llama_ubatch ubatch; + std::vector ubatches; + while (true) { + llama_ubatch ubatch; - if (embd_all) { - // if all tokens are output, split by sequence - ubatch = balloc.split_seq(n_ubatch); - } else { - ubatch = balloc.split_equal(n_ubatch); + if (embd_all) { + // if all tokens are output, split by sequence + ubatch = balloc.split_seq(n_ubatch); + } else { + ubatch = balloc.split_equal(n_ubatch); + } + + if (ubatch.n_tokens == 0) { + break; + } + + ubatches.push_back(std::move(ubatch)); // NOLINT } - if (ubatch.n_tokens == 0) { + if (!prepare(ubatches)) { break; } - ubatches.push_back(std::move(ubatch)); // NOLINT - } - - if (!prepare(ubatches)) { - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); - } + return std::make_unique(this, std::move(ubatches)); + } while (false); - return std::make_unique(this, std::move(ubatches)); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } -llama_memory_state_ptr llama_memory_recurrent::init_full() { - return std::make_unique(this); +llama_memory_context_ptr llama_memory_recurrent::init_full() { + return std::make_unique(this); } -llama_memory_state_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) { +llama_memory_context_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) { GGML_UNUSED(lctx); GGML_UNUSED(optimize); - return std::make_unique(LLAMA_MEMORY_STATUS_NO_UPDATE); + return std::make_unique(LLAMA_MEMORY_STATUS_NO_UPDATE); } bool llama_memory_recurrent::prepare(const std::vector & ubatches) { @@ -1040,22 +1045,22 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell } // -// llama_memory_recurrent_state +// llama_memory_recurrent_context // -llama_memory_recurrent_state::llama_memory_recurrent_state(llama_memory_status status) : status(status) {} +llama_memory_recurrent_context::llama_memory_recurrent_context(llama_memory_status status) : status(status) {} -llama_memory_recurrent_state::llama_memory_recurrent_state( +llama_memory_recurrent_context::llama_memory_recurrent_context( llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), is_full(true) { } -llama_memory_recurrent_state::llama_memory_recurrent_state( +llama_memory_recurrent_context::llama_memory_recurrent_context( llama_memory_recurrent * mem, std::vector ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)) {} -llama_memory_recurrent_state::~llama_memory_recurrent_state() = default; +llama_memory_recurrent_context::~llama_memory_recurrent_context() = default; -bool llama_memory_recurrent_state::next() { +bool llama_memory_recurrent_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); if (++i_next >= ubatches.size()) { @@ -1065,48 +1070,56 @@ bool llama_memory_recurrent_state::next() { return true; } -bool llama_memory_recurrent_state::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); +bool llama_memory_recurrent_context::apply() { + assert(!llama_memory_status_is_fail(status)); + + // no ubatches -> this is an update + if (ubatches.empty()) { + // recurrent cache never performs updates + assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE); + + return true; + } mem->find_slot(ubatches[i_next]); return true; } -llama_memory_status llama_memory_recurrent_state::get_status() const { +llama_memory_status llama_memory_recurrent_context::get_status() const { return status; } -const llama_ubatch & llama_memory_recurrent_state::get_ubatch() const { +const llama_ubatch & llama_memory_recurrent_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); return ubatches[i_next]; } -uint32_t llama_memory_recurrent_state::get_n_rs() const { +uint32_t llama_memory_recurrent_context::get_n_rs() const { return is_full ? mem->size : mem->n; } -uint32_t llama_memory_recurrent_state::get_head() const { +uint32_t llama_memory_recurrent_context::get_head() const { return is_full ? 0 : mem->head; } -int32_t llama_memory_recurrent_state::get_rs_z() const { +int32_t llama_memory_recurrent_context::get_rs_z() const { return is_full ? 0 : mem->rs_z; } -uint32_t llama_memory_recurrent_state::get_size() const { +uint32_t llama_memory_recurrent_context::get_size() const { return mem->size; } -ggml_tensor * llama_memory_recurrent_state::get_r_l(int32_t il) const { +ggml_tensor * llama_memory_recurrent_context::get_r_l(int32_t il) const { return mem->r_l[il]; } -ggml_tensor * llama_memory_recurrent_state::get_s_l(int32_t il) const { +ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const { return mem->s_l[il]; } -int32_t llama_memory_recurrent_state::s_copy(int i) const { +int32_t llama_memory_recurrent_context::s_copy(int i) const { return mem->cells[i + mem->head].src0; } diff --git a/examples/talk-llama/llama-memory-recurrent.h b/examples/talk-llama/llama-memory-recurrent.h index be58dae7cfe..4d094f9a057 100644 --- a/examples/talk-llama/llama-memory-recurrent.h +++ b/examples/talk-llama/llama-memory-recurrent.h @@ -11,8 +11,8 @@ // llama_memory_recurrent // -// TODO: extract the cache state used for graph computation into llama_memory_recurrent_state_i -// see the implementation of llama_kv_cache_unified_state_i for an example how to do it +// TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i +// see the implementation of llama_kv_cache_unified_context_i for an example how to do it class llama_memory_recurrent : public llama_memory_i { public: @@ -34,14 +34,14 @@ class llama_memory_recurrent : public llama_memory_i { // llama_memory_i // - llama_memory_state_ptr init_batch( + llama_memory_context_ptr init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) override; - llama_memory_state_ptr init_full() override; + llama_memory_context_ptr init_full() override; - llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override; + llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; void clear(bool data) override; @@ -125,24 +125,24 @@ class llama_memory_recurrent : public llama_memory_i { bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; -class llama_memory_recurrent_state : public llama_memory_state_i { +class llama_memory_recurrent_context : public llama_memory_context_i { public: // used for errors - llama_memory_recurrent_state(llama_memory_status status); + llama_memory_recurrent_context(llama_memory_status status); - // used to create a full-cache state - llama_memory_recurrent_state( + // used to create a full-cache or update context + llama_memory_recurrent_context( llama_memory_recurrent * mem); - // used to create a state from a batch - llama_memory_recurrent_state( + // used to create a batch processing context from a batch + llama_memory_recurrent_context( llama_memory_recurrent * mem, std::vector ubatches); - virtual ~llama_memory_recurrent_state(); + virtual ~llama_memory_recurrent_context(); // - // llama_memory_state_i + // llama_memory_context_i // bool next() override; @@ -152,7 +152,7 @@ class llama_memory_recurrent_state : public llama_memory_state_i { const llama_ubatch & get_ubatch() const override; // - // llama_memory_recurrent_state specific API + // llama_memory_recurrent_context specific API // uint32_t get_n_rs() const; diff --git a/examples/talk-llama/llama-memory.cpp b/examples/talk-llama/llama-memory.cpp index f1107672c64..ca6844c32a7 100644 --- a/examples/talk-llama/llama-memory.cpp +++ b/examples/talk-llama/llama-memory.cpp @@ -40,3 +40,20 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me // if either status has an update, then the combined status has an update return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE; } + +bool llama_memory_status_is_fail(llama_memory_status status) { + switch (status) { + case LLAMA_MEMORY_STATUS_SUCCESS: + case LLAMA_MEMORY_STATUS_NO_UPDATE: + { + return false; + } + case LLAMA_MEMORY_STATUS_FAILED_PREPARE: + case LLAMA_MEMORY_STATUS_FAILED_COMPUTE: + { + return true; + } + } + + return false; +} diff --git a/examples/talk-llama/llama-memory.h b/examples/talk-llama/llama-memory.h index d2ef0c2a3b4..e8ba336e852 100644 --- a/examples/talk-llama/llama-memory.h +++ b/examples/talk-llama/llama-memory.h @@ -3,7 +3,6 @@ #include "llama.h" #include -#include struct llama_ubatch; @@ -28,23 +27,24 @@ enum llama_memory_status { LLAMA_MEMORY_STATUS_FAILED_COMPUTE, }; -// helper function for combining the status of two memory states +// helper function for combining the status of two memory contexts // useful for implementing hybrid memory types (e.g. iSWA) llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1); -// the interface for managing the memory state during batch processing +// helper function for checking if a memory status indicates a failure +bool llama_memory_status_is_fail(llama_memory_status status); + +// the interface for managing the memory context during batch processing // this interface is implemented per memory type. see: -// - llama_kv_cache_unified_state -// - llama_kv_cache_unified_iswa_state +// - llama_kv_cache_unified_context +// - llama_kv_cache_unified_iswa_context // ... // -// the only method that can mutate the memory and the memory state is llama_memory_i::apply() -// -// TODO: rename to llama_memory_context_i ? -struct llama_memory_state_i { - virtual ~llama_memory_state_i() = default; +// the only method that should mutate the memory and the memory context is llama_memory_i::apply() +struct llama_memory_context_i { + virtual ~llama_memory_context_i() = default; - // consume the current ubatch from the state and proceed to the next one + // consume the current ubatch from the context and proceed to the next one // return false if we are done virtual bool next() = 0; @@ -55,11 +55,11 @@ struct llama_memory_state_i { // get the current ubatch virtual const llama_ubatch & get_ubatch() const = 0; - // get the status of the memory state - used for error handling and checking if any updates would be applied + // get the status of the memory context - used for error handling and checking if any updates would be applied virtual llama_memory_status get_status() const = 0; }; -using llama_memory_state_ptr = std::unique_ptr; +using llama_memory_context_ptr = std::unique_ptr; // general concept of LLM memory // the KV cache is a type of LLM memory, but there can be other types @@ -67,19 +67,19 @@ struct llama_memory_i { virtual ~llama_memory_i() = default; // split the input batch into a set of ubatches and verify that they can fit into the cache - // return a state object containing the ubatches and KV cache state required to process them - // check the llama_memory_state_i::get_status() for the result - virtual llama_memory_state_ptr init_batch( + // return a context object containing the ubatches and memory state required to process them + // check the llama_memory_context_i::get_status() for the result + virtual llama_memory_context_ptr init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) = 0; // simulate full cache, used for allocating worst-case compute buffers - virtual llama_memory_state_ptr init_full() = 0; + virtual llama_memory_context_ptr init_full() = 0; // prepare for any pending memory updates, such as shifts, defrags, etc. // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update - virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0; + virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0; // getters virtual bool get_can_shift() const = 0; diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index e2c82017f68..b15bf73c2a2 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_475M: return "475M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; + case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; @@ -103,6 +104,8 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; + case LLM_TYPE_E2B: return "E2B"; + case LLM_TYPE_E4B: return "E4B"; default: return "?B"; } } @@ -1017,6 +1020,24 @@ void llama_model::load_hparams(llama_model_loader & ml) { ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0))) : 1.0f / std::sqrt(float(hparams.n_embd_head_k)); } break; + case LLM_ARCH_GEMMA3N: + { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.set_swa_pattern(5); + + hparams.rope_freq_base_train_swa = 10000.0f; + hparams.rope_freq_scale_train_swa = 1.0f; + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 30: type = LLM_TYPE_E2B; break; + case 35: type = LLM_TYPE_E4B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_STARCODER2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -1484,6 +1505,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_ERNIE4_5: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 18: type = LLM_TYPE_0_3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -2950,6 +2979,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); } } break; + case LLM_ARCH_GEMMA3N: + { + const int64_t n_altup = hparams.n_altup; + const int64_t laurel_rank = hparams.laurel_rank; + const int64_t n_embd_altup = hparams.n_embd_altup; + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0); + + altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0); + altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0); + per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0); + per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + + // altup & laurel + layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0); + layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0); + layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0); + layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0); + layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0); + layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0); + layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0); + layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0); + layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0); + layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0); + layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_STARCODER2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4268,6 +4353,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; + case LLM_ARCH_ERNIE4_5: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // optional bias tensors + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } @@ -8980,6 +9099,442 @@ struct llm_build_gemma3_iswa : public llm_graph_context { } }; +struct llm_build_gemma3n_iswa : public llm_graph_context { + const llama_model & model; + ggml_cgraph * gf; + + const int64_t n_embd_head; + const int64_t n_embd_altup; + const int64_t n_altup; + const int i_altup_act; + const int n_layer_kv = 20; // number of layers having KV [KV_REUSE] + const int n_layer_sparsity = 10; // number of layers using activation sparsity + const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) + + ggml_tensor * one; // containing single element 1.0f + + llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) + : llm_graph_context(params), + model(model), + gf(gf), + n_embd_head(model.hparams.n_embd_head_k), + n_embd_altup(model.hparams.n_embd_altup), + n_altup(model.hparams.n_altup), + i_altup_act(model.hparams.i_altup_act) { + ggml_tensor * cur; + ggml_tensor * inpL; + + // TODO: remove this when ggml_scale_add is implemented + one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + { + auto inp = std::make_unique(); + inp->one = one; + res->add_input(std::move(inp)); + } + + inpL = build_inp_embd(model.tok_embd); + + // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) + if (ubatch.token) { + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + } + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // TODO: is causal == true correct? might need some changes + auto * inp_attn = build_attn_inp_kv_unified_iswa(); + + // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer] + ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs()); + + // inpL now has only 1 altup, project it to the rest of the altups + // these "added" altups will be concat to the last dim of inpL + { + ggml_tensor * target_magnitude = calc_magnitude(inpL); + ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1); + ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1] + ggml_tensor * new_magnitude = calc_magnitude(altup_added); + altup_added = ggml_div(ctx0, + ggml_mul(ctx0, altup_added, target_magnitude), + new_magnitude); + inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup] + cb(inpL, "inp_stacked", -1); + } + + // inpL now has shape: [n_embd, n_tokens, n_altup] + // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer] + + for (int il = 0; il < n_layer; ++il) { + // this block is made to be closely resemble Gemma3p5DecoderLayer on python code + const bool has_kv = (il < n_layer_kv); + + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + + ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup] + ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup] + + // predicted value will go through self-attention and laurel + ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens] + cur = active_prediction; + cb(cur, "active_prediction", il); + + // norm + cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // laurel + ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens] + + // self-attention + if (has_kv) { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps); + + cb(Qcur, "Qcur_normed", il); + cb(Kcur, "Kcur_normed", il); + cb(Vcur, "Vcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); + } else { + // no KV layers + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens] + cb(cur, "attn_gated", il); + + ggml_tensor * attn_laurel = ggml_scale(ctx0, + ggml_add(ctx0, cur, laurel_out), + 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens] + cb(attn_laurel, "attn_laurel", il); + + cur = build_norm(attn_laurel, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur); + ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur); + + if (il < n_layer_sparsity) { + // apply activation sparsity + gate_proj = gaussian_topk(gate_proj); + } + gate_proj = ggml_gelu(ctx0, gate_proj); + + cur = ggml_mul(ctx0, up_proj, gate_proj); + cur = build_lora_mm(model.layers[il].ffn_down, cur); + cb(cur, "ffn_out", il); + } + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens] + cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il); + + ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup] + + ggml_tensor * first_prediction; // [n_embd, n_tokens] + { + first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens] + first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale); + first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction); + first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens] + cb(first_prediction, "first_prediction_gated", il); + ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens] + first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens] + cb(first_prediction, "first_prediction_scaled", il); + + first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens] + first_prediction = build_norm(first_prediction, + model.layers[il].per_layer_post_norm, NULL, + LLM_NORM_RMS, il); + cb(first_prediction, "first_prediction_out", il); + } + + // equivalent to python code: corrected_predictions[1:] += first_prediction + { + ggml_tensor * slice_first = view_2d_slice(corrected, 0); + ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1, + ggml_row_size(corrected->type, n_embd), + ggml_row_size(corrected->type, n_embd*n_tokens), + n_embd*n_tokens*ggml_element_size(corrected)); + ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1] + corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup] + } + + cur = corrected; // [n_embd, n_tokens, n_altup] + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; // [n_embd, n_tokens, n_altup] + + // cur now has multiple altup(s), we want to merge them back to 1 altup + { + ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens] + // do a view to skip the first slice (active altup) + ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1, + ggml_row_size(cur->type, n_embd), + ggml_row_size(cur->type, n_embd*n_tokens), + n_embd*n_tokens*ggml_element_size(cur)); + ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1] + ggml_tensor * new_magnitude = calc_magnitude(altup_unembd); + altup_unembd = ggml_div(ctx0, + ggml_mul(ctx0, altup_unembd, target_magnitude), + new_magnitude); + cb(altup_unembd, "altup_unembd", -1); + + // equivalent to torch.mean(hidden_states, dim=0) + cur = view_2d_slice(cur, 0); // [n_embd, n_tokens] + for (int i = 0; i < n_altup - 1; ++i) { + cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i)); + } + cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens] + cb(cur, "unembd_merged", -1); + } + + // cur now has shape: [n_embd, n_tokens] + + // TODO: move this to right after the last KV layer + { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + { + // final logit soft-capping + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * calc_magnitude(ggml_tensor * x) { + return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x))); + } + + // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim + ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) { + GGML_ASSERT(idx < (int)x->ne[2]); + return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], + ggml_row_size(x->type, x->ne[0]), + idx * x->ne[0] * x->ne[1] * ggml_element_size(x)); + } + + // equivalent to get_per_layer_inputs() in python code + // output shape: [n_embd_altup, n_layer, n_tokens] + ggml_tensor * get_per_layer_inputs() { + auto inp = std::make_unique(); + ggml_tensor * inp_per_layer; + if (ubatch.token) { + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + ggml_set_input(inp->tokens); + res->t_tokens = inp->tokens; + inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens); + inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); + inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup)); + cb(inp_per_layer, "inp_per_layer_selected", -1); + } else { + GGML_ABORT("TODO: support embd input"); + } + res->add_input(std::move(inp)); + return inp_per_layer; + } + + // equivalent to project_per_layer_inputs() in python code + // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim + // output shape: [n_embd_altup, n_tokens, n_layer] + ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) { + const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd); + const float per_layer_input_scale = 1.0f / sqrtf(2.0f); + + ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds); + per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale); + per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens); + per_layer_proj = build_norm(per_layer_proj, + model.per_layer_proj_norm, NULL, + LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens] + cb(per_layer_proj, "per_layer_proj", -1); + + inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj); + inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); + cb(inp_per_layer, "inp_per_layer", -1); + + // permute to shape: [n_embd_altup, n_tokens, n_layer] + inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3)); + return inp_per_layer; + } + + // input cur shape: [n_altup, n_tokens] + // output shape: [n_altup, n_tokens] + ggml_tensor * laurel(ggml_tensor * cur, int il) { + ggml_tensor * tmp = cur; + tmp = build_lora_mm(model.layers[il].laurel_l, tmp); + tmp = build_lora_mm(model.layers[il].laurel_r, tmp); + tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il); + tmp = ggml_add(ctx0, tmp, cur); + cb(tmp, "laurel_out", il); + return tmp; + } + + // input x shape: [n_embd, n_tokens] + // output shape: [n_embd, n_tokens] + ggml_tensor * gaussian_topk(ggml_tensor * x) { + ggml_tensor * mean = ggml_mean(ctx0, x); + ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0, + ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))), + 1.0f / (float)(x->ne[0] - 1) + )); + ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul)); + return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x)); + } + + // + // altup functions + // + + // equivalent to compute_router_modalities() in python code + // input x shape: [n_embd, n_tokens] + // output shape: [n_altup, n_tokens] + ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) { + ggml_tensor * router_inputs = build_norm(x, + model.layers[il].altup_router_norm, NULL, + LLM_NORM_RMS, il); + + // router_input_scale + router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd); + + ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs); + return ggml_tanh(ctx0, output); // [n_altup, n_tokens] + } + + // input cur shape: [n_embd, n_tokens, n_altup] + // output shape: [n_embd, n_tokens, n_altup] + ggml_tensor * altup_predict(ggml_tensor * cur, int il) { + ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens] + ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens] + cb(modalities, "modalities", il); + + ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities); + cb(all_coefs, "all_coefs", il); + // first dim now having n_altup^2 elements, we reshape it to 2D (so we end up with 3D tensor) + all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens); + + // permute to [n_altup, n_embd, n_tokens] + ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens] + + // final shape must be the same as cur: [n_embd, n_tokens, n_altup] + predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3)); + predictions = ggml_add(ctx0, predictions, cur); + cb(predictions, "predictions", il); + + return predictions; + } + + // input predictions shape: [n_embd, n_tokens, n_altup] + // input activated shape: [n_embd, n_tokens] + // output shape: [n_embd, n_tokens, n_altup] + ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) { + ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens] + cb(modalities, "modalities", il); + + ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); + ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens] + cb(innovation, "innovation", il); + + ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens] + all_coefs = ggml_add(ctx0, all_coefs, one); + cb(all_coefs, "all_coefs", il); + all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup] + all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] + + innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1); + ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup] + corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup] + cb(corrected, "corrected", il); + + return corrected; + } +}; + // TODO: move up next to build_starcoder struct llm_build_starcoder2 : public llm_graph_context { llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { @@ -9171,9 +9726,9 @@ struct llm_build_mamba : public llm_graph_context { ggml_tensor * cur, const llama_ubatch & ubatch, int il) const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); - const auto kv_head = kv_state->get_head(); + const auto kv_head = mctx_cur->get_head(); const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; @@ -9191,8 +9746,8 @@ struct llm_build_mamba : public llm_graph_context { GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - ggml_tensor * conv_states_all = kv_state->get_r_l(il); - ggml_tensor * ssm_states_all = kv_state->get_s_l(il); + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); // (ab)using the KV cache to store the states ggml_tensor * conv = build_rs( @@ -11916,7 +12471,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { ggml_tensor * x_prev, const llama_ubatch & ubatch, int il) const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -11926,7 +12481,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { const auto n_head = n_embd / head_size; const auto n_head_kv = hparams.n_head_kv(il); - const auto kv_head = kv_state->get_head(); + const auto kv_head = mctx_cur->get_head(); const auto & layer = model.layers[il]; @@ -12038,7 +12593,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { } ggml_tensor * wkv_state = build_rs( - inp, gf, kv_state->get_s_l(il), + inp, gf, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); ggml_tensor * wkv_output; @@ -12057,9 +12612,9 @@ struct llm_build_rwkv6_base : public llm_graph_context { wkv_state, ggml_view_1d( ctx0, - kv_state->get_s_l(il), + mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs, - hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il)) + hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il)) ) ) ); @@ -12313,7 +12868,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { ggml_tensor *& first_layer_value, const llama_ubatch & ubatch, int il) const { - const auto * kv_state = static_cast(mstate); + const auto * mctx_cur = static_cast(mctx); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -12322,7 +12877,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { const auto head_count = n_embd / head_size; const auto n_seq_tokens = ubatch.n_seq_tokens; - const auto kv_head = kv_state->get_head(); + const auto kv_head = mctx_cur->get_head(); const auto & layer = model.layers[il]; @@ -12393,7 +12948,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens); ggml_tensor * wkv_state = build_rs( - inp, gf, kv_state->get_s_l(il), + inp, gf, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state); @@ -12407,9 +12962,9 @@ struct llm_build_rwkv7_base : public llm_graph_context { wkv_state, ggml_view_1d( ctx0, - kv_state->get_s_l(il), + mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs, - hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il)) + hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il)) ) ) ); @@ -13613,6 +14168,136 @@ struct llm_build_dots1 : public llm_graph_context { } }; +struct llm_build_ernie4_5 : public llm_graph_context { + llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + // self-attention + { + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_arcee : public llm_graph_context { llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -13974,6 +14659,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_GEMMA3N: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_STARCODER2: { llm = std::make_unique(*this, params, gf); @@ -14119,6 +14808,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_ERNIE4_5: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -14270,6 +14963,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_BAILINGMOE: case LLM_ARCH_NEO_BERT: case LLM_ARCH_ARCEE: + case LLM_ARCH_ERNIE4_5: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -14295,6 +14989,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: + case LLM_ARCH_GEMMA3N: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: @@ -14377,7 +15072,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n // do not extend this list unless absolutely necessary // Mistral-Small-2503 does not have built-in chat template llama_vocab_pre_type pre_type = model->vocab.get_pre_type(); - if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { + if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { return "mistral-v7-tekken"; } diff --git a/examples/talk-llama/llama-model.h b/examples/talk-llama/llama-model.h index 06e6c687943..a958c5997a1 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -39,6 +39,7 @@ enum llm_type { LLM_TYPE_475M, LLM_TYPE_770M, LLM_TYPE_780M, + LLM_TYPE_0_3B, LLM_TYPE_0_5B, LLM_TYPE_0_6B, LLM_TYPE_1B, @@ -95,6 +96,8 @@ enum llm_type { LLM_TYPE_17B_128E, // llama4 Maverick LLM_TYPE_30B_A3B, LLM_TYPE_235B_A22B, + LLM_TYPE_E2B, + LLM_TYPE_E4B, }; std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type); @@ -316,6 +319,19 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; + // altup & laurel + struct ggml_tensor * per_layer_inp_gate = nullptr; + struct ggml_tensor * per_layer_proj = nullptr; + struct ggml_tensor * per_layer_post_norm = nullptr; + struct ggml_tensor * altup_correct_coef = nullptr; + struct ggml_tensor * altup_correct_scale = nullptr; + struct ggml_tensor * altup_predict_coef = nullptr; + struct ggml_tensor * altup_router = nullptr; + struct ggml_tensor * altup_router_norm = nullptr; + struct ggml_tensor * laurel_l = nullptr; + struct ggml_tensor * laurel_r = nullptr; + struct ggml_tensor * laurel_post_norm = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; @@ -354,6 +370,13 @@ struct llama_model { struct ggml_tensor * conv1d = nullptr; struct ggml_tensor * conv1d_b = nullptr; + // gemma3n altup + struct ggml_tensor * tok_embd_per_layer = nullptr; + struct ggml_tensor * altup_proj = nullptr; + struct ggml_tensor * altup_unembd_proj = nullptr; + struct ggml_tensor * per_layer_model_proj = nullptr; + struct ggml_tensor * per_layer_proj_norm = nullptr; + std::vector layers; llama_model_params params; diff --git a/examples/talk-llama/llama-quant.cpp b/examples/talk-llama/llama-quant.cpp index 8cf45732fd6..f4b5713d7dd 100644 --- a/examples/talk-llama/llama-quant.cpp +++ b/examples/talk-llama/llama-quant.cpp @@ -1,5 +1,4 @@ #include "llama-quant.h" - #include "llama-impl.h" #include "llama-model.h" #include "llama-model-loader.h" @@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) { } } +static std::string remap_layer(const std::string & orig_name, const std::vector & prune, std::map & mapped, int & next_id) { + if (prune.empty()) { + return orig_name; + } + + static const std::regex pattern(R"(blk\.(\d+)\.)"); + if (std::smatch match; std::regex_search(orig_name, match, pattern)) { + const int blk = std::stoi(match[1]); + std::string new_name = orig_name; + + if (mapped.count(blk)) { + // Already mapped, do nothing + } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) { + mapped[blk] = ""; + } else if (blk < prune.front()) { + mapped[blk] = std::to_string(blk); + next_id = blk + 1; + } else { + mapped[blk] = std::to_string(next_id); + ++next_id; + } + + return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]); + } + + return orig_name; +} + +static std::string remap_imatrix (const std::string & orig_name, const std::map & mapped) { + if (mapped.empty()) { + return orig_name; + } + + static const std::regex pattern(R"(blk\.(\d+)\.)"); + if (std::smatch match; std::regex_search(orig_name, match, pattern)) { + const std::string blk(match[1]); + std::string new_name = orig_name; + + for (const auto & p : mapped) { + if (p.second == blk) { + LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first); + return new_name.replace(match.position(1), match.length(1), std::to_string(p.first)); + } + } + GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str()); + } + + return orig_name; +} + struct quantize_state_impl { const llama_model & model; const llama_model_quantize_params * params; @@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q6_K; } } - } else if (name == "token_embd.weight") { + } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") { if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { @@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const size_t align = GGUF_DEFAULT_ALIGNMENT; gguf_context_ptr ctx_out { gguf_init_empty() }; + std::vector prune_list = {}; + if (params->prune_layers) { + prune_list = *static_cast *>(params->prune_layers); + } + // copy the KV pairs from the input file gguf_set_kv (ctx_out.get(), ml.meta.get()); gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV @@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + std::map mapped; + int blk_id = 0; + int pruned_attention_w = 0; + // make a list of weights std::vector tensors; tensors.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { + const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id)); + if (remapped_name.empty()) { + if (it.first.find("attn_v.weight") != std::string::npos || + it.first.find("attn_qkv.weight") != std::string::npos || + it.first.find("attn_kv_b.weight") != std::string::npos) { + pruned_attention_w++; + } + LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str()); + continue; + } else if (remapped_name != it.first) { + ggml_set_name(it.second.tensor, remapped_name.c_str()); + LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor)); + } tensors.push_back(&it.second); } + if (!prune_list.empty()) { + gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id); + } // keep_split requires that the weights are sorted by split index if (params->keep_split) { @@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (llama_model_has_encoder(&model)) { n_attn_layer *= 3; } - GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); + GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected"); } size_t total_size_org = 0; @@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (size_t i = 0; i < ctx_outs.size(); ++i) { gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); - gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); + gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size()); } } @@ -756,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + // these are very small (e.g. 4x4) + quantize &= name.find("altup") == std::string::npos; + quantize &= name.find("laurel") == std::string::npos; + + // these are not too big so keep them as it is + quantize &= name.find("per_layer_model_proj") == std::string::npos; + // do not quantize positional embeddings and token types (BERT) quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); @@ -832,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const float * imatrix = nullptr; if (imatrix_data) { - auto it = imatrix_data->find(tensor->name); + auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); if (it == imatrix_data->end()) { LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { @@ -947,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, + /*.prune_layers =*/ nullptr }; return result; diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index b04720bee59..3eda9bc6860 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h @@ -390,6 +390,7 @@ extern "C" { void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types + void * prune_layers; // pointer to vector containing layer indices to prune } llama_model_quantize_params; typedef struct llama_logit_bias { @@ -943,12 +944,14 @@ extern "C" { // Requires the context to have a memory. // For encode-decoder contexts, processes the batch using the decoder. // Positive return values does not mean a fatal error, but rather a warning. - // Upon non-zero return values, the memory state is restored to the state before this call + // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context + // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max() + // Upon other return values, the memory state is restored to the state before this call // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - // 2 - aborted + // 2 - aborted (processed ubatches will remain in the context's memory) // -1 - invalid input batch - // < -1 - error + // < -1 - fatal error (processed ubatches will remain in the context's memory) LLAMA_API int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch); From bca021c9740b267c2973fba56555be052006023a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 12:21:19 +0300 Subject: [PATCH 0046/1348] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 2adba33a32f..37344329c69 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -4fb4fafad74e9a28849408d4f54eb94a244d303e +e3b764668ffa41996815c175ab7e94da87cdb5b1 From d9999d54c868b8bfcd376aa26067e787d53e679e Mon Sep 17 00:00:00 2001 From: Lin Xiaodong Date: Wed, 2 Jul 2025 18:14:29 +0800 Subject: [PATCH 0047/1348] feat: support vad for addon.node (#3301) Co-authored-by: linxiaodong --- examples/addon.node/README.md | 84 ++++++++++- examples/addon.node/__test__/whisper.spec.js | 144 +++++++++++++++---- examples/addon.node/addon.cpp | 130 ++++++++++++++--- examples/addon.node/vad-example.js | 132 +++++++++++++++++ 4 files changed, 444 insertions(+), 46 deletions(-) create mode 100644 examples/addon.node/vad-example.js diff --git a/examples/addon.node/README.md b/examples/addon.node/README.md index 16df7d95870..ffd7720f9e5 100644 --- a/examples/addon.node/README.md +++ b/examples/addon.node/README.md @@ -1,8 +1,10 @@ -# addon +# whisper.cpp Node.js addon This is an addon demo that can **perform whisper model reasoning in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js). It can be used as a reference for using the whisper.cpp project in other node projects. +This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance. + ## Install ```shell @@ -26,12 +28,88 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.c ## Run +### Basic Usage + ```shell cd examples/addon.node node index.js --language='language' --model='model-path' --fname_inp='file-path' ``` -Because this is a simple Demo, only the above parameters are set in the node environment. +### VAD (Voice Activity Detection) Usage + +Run the VAD example with performance comparison: + +```shell +node vad-example.js +``` + +## Voice Activity Detection (VAD) Support + +VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence. + +### VAD Model Setup + +Before using VAD, download a VAD model: + +```shell +# From the whisper.cpp root directory +./models/download-vad-model.sh silero-v5.1.2 +``` + +### VAD Parameters + +All VAD parameters are optional and have sensible defaults: + +- `vad`: Enable VAD (default: false) +- `vad_model`: Path to VAD model file (required when VAD enabled) +- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5) +- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250) +- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100) +- `vad_max_speech_duration_s`: Max speech duration in seconds (default: FLT_MAX) +- `vad_speech_pad_ms`: Speech padding in ms (default: 30) +- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1) + +### JavaScript API Example + +```javascript +const path = require("path"); +const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node")); +const { promisify } = require("util"); + +const whisperAsync = promisify(whisper); + +// With VAD enabled +const vadParams = { + language: "en", + model: path.join(__dirname, "../../models/ggml-base.en.bin"), + fname_inp: path.join(__dirname, "../../samples/jfk.wav"), + vad: true, + vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), + vad_threshold: 0.5, + progress_callback: (progress) => console.log(`Progress: ${progress}%`) +}; + +whisperAsync(vadParams).then(result => console.log(result)); +``` + +## Supported Parameters + +Both traditional whisper.cpp parameters and new VAD parameters are supported: -Other parameters can also be specified in the node environment. +- `language`: Language code (e.g., "en", "es", "fr") +- `model`: Path to whisper model file +- `fname_inp`: Path to input audio file +- `use_gpu`: Enable GPU acceleration (default: true) +- `flash_attn`: Enable flash attention (default: false) +- `no_prints`: Disable console output (default: false) +- `no_timestamps`: Disable timestamps (default: false) +- `detect_language`: Auto-detect language (default: false) +- `audio_ctx`: Audio context size (default: 0) +- `max_len`: Maximum segment length (default: 0) +- `max_context`: Maximum context size (default: -1) +- `prompt`: Initial prompt for decoder +- `comma_in_time`: Use comma in timestamps (default: true) +- `print_progress`: Print progress info (default: false) +- `progress_callback`: Progress callback function +- VAD parameters (see above section) diff --git a/examples/addon.node/__test__/whisper.spec.js b/examples/addon.node/__test__/whisper.spec.js index ce063211efe..b07430aa249 100644 --- a/examples/addon.node/__test__/whisper.spec.js +++ b/examples/addon.node/__test__/whisper.spec.js @@ -1,39 +1,133 @@ -const path = require("path"); -const { whisper } = require(path.join( - __dirname, - "../../../build/Release/addon.node" -)); -const { promisify } = require("util"); +const { join } = require('path'); +const { whisper } = require('../../../build/Release/addon.node'); +const { promisify } = require('util'); const whisperAsync = promisify(whisper); -const whisperParamsMock = { - language: "en", - model: path.join(__dirname, "../../../models/ggml-base.en.bin"), - fname_inp: path.join(__dirname, "../../../samples/jfk.wav"), +const commonParams = { + language: 'en', + model: join(__dirname, '../../../models/ggml-base.en.bin'), + fname_inp: join(__dirname, '../../../samples/jfk.wav'), use_gpu: true, flash_attn: false, no_prints: true, - comma_in_time: false, - translate: true, no_timestamps: false, detect_language: false, audio_ctx: 0, - max_len: 0, - prompt: "", - print_progress: false, - progress_callback: (progress) => { - console.log(`Progress: ${progress}`); - }, - max_context: -1 + max_len: 0 }; -describe("Run whisper.node", () => { - test("it should receive a non-empty value", async () => { - let result = await whisperAsync(whisperParamsMock); - console.log(result); +describe('Whisper.cpp Node.js addon with VAD support', () => { + test('Basic whisper transcription without VAD', async () => { + const params = { + ...commonParams, + vad: false + }; - expect(result['transcription'].length).toBeGreaterThan(0); - }, 10000); + const result = await whisperAsync(params); + + expect(typeof result).toBe('object'); + expect(Array.isArray(result.transcription)).toBe(true); + expect(result.transcription.length).toBeGreaterThan(0); + + // Check that we got some transcription text + const text = result.transcription.map(segment => segment[2]).join(' '); + expect(text.length).toBeGreaterThan(0); + expect(text.toLowerCase()).toContain('ask not'); + }, 30000); + + test('VAD parameters validation', async () => { + // Test with invalid VAD model - should return empty transcription + const invalidParams = { + ...commonParams, + vad: true, + vad_model: 'non-existent-model.bin', + vad_threshold: 0.5 + }; + + // This should handle the error gracefully and return empty transcription + const result = await whisperAsync(invalidParams); + expect(typeof result).toBe('object'); + expect(Array.isArray(result.transcription)).toBe(true); + // When VAD model doesn't exist, it should return empty transcription + expect(result.transcription.length).toBe(0); + }, 10000); + + test('VAD parameter parsing', async () => { + // Test that VAD parameters are properly parsed (even if VAD model doesn't exist) + const vadParams = { + ...commonParams, + vad: false, // Disabled so no model required + vad_threshold: 0.7, + vad_min_speech_duration_ms: 300, + vad_min_silence_duration_ms: 150, + vad_max_speech_duration_s: 45.0, + vad_speech_pad_ms: 50, + vad_samples_overlap: 0.15 + }; + + const result = await whisperAsync(vadParams); + + expect(typeof result).toBe('object'); + expect(Array.isArray(result.transcription)).toBe(true); + }, 30000); + + test('Progress callback with VAD disabled', async () => { + let progressCalled = false; + let lastProgress = 0; + + const params = { + ...commonParams, + vad: false, + progress_callback: (progress) => { + progressCalled = true; + lastProgress = progress; + expect(progress).toBeGreaterThanOrEqual(0); + expect(progress).toBeLessThanOrEqual(100); + } + }; + + const result = await whisperAsync(params); + + expect(progressCalled).toBe(true); + expect(lastProgress).toBe(100); + expect(typeof result).toBe('object'); + }, 30000); + + test('Language detection without VAD', async () => { + const params = { + ...commonParams, + vad: false, + detect_language: true, + language: 'auto' + }; + + const result = await whisperAsync(params); + + expect(typeof result).toBe('object'); + expect(typeof result.language).toBe('string'); + expect(result.language.length).toBeGreaterThan(0); + }, 30000); + + test('Basic transcription with all VAD parameters set', async () => { + // Test with VAD disabled but all parameters set to ensure no crashes + const params = { + ...commonParams, + vad: false, // Disabled so it works without VAD model + vad_model: '', // Empty model path + vad_threshold: 0.6, + vad_min_speech_duration_ms: 200, + vad_min_silence_duration_ms: 80, + vad_max_speech_duration_s: 25.0, + vad_speech_pad_ms: 40, + vad_samples_overlap: 0.08 + }; + + const result = await whisperAsync(params); + + expect(typeof result).toBe('object'); + expect(Array.isArray(result.transcription)).toBe(true); + expect(result.transcription.length).toBeGreaterThan(0); + }, 30000); }); diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 67b1ec92d7b..952e44e3ce7 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -9,6 +9,7 @@ #include #include #include +#include struct whisper_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); @@ -51,6 +52,16 @@ struct whisper_params { std::vector fname_out = {}; std::vector pcmf32 = {}; // mono-channel F32 PCM + + // Voice Activity Detection (VAD) parameters + bool vad = false; + std::string vad_model = ""; + float vad_threshold = 0.5f; + int vad_min_speech_duration_ms = 250; + int vad_min_silence_duration_ms = 100; + float vad_max_speech_duration_s = FLT_MAX; + int vad_speech_pad_ms = 30; + float vad_samples_overlap = 0.1f; }; struct whisper_print_user_data { @@ -333,16 +344,16 @@ class ProgressWorker : public Napi::AsyncWorker { }; wparams.progress_callback_user_data = this; - // Abort mechanism example - { - static bool is_aborted = false; // Note: this should be atomic to avoid data races + // Set VAD parameters + wparams.vad = params.vad; + wparams.vad_model_path = params.vad_model.c_str(); - wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) { - bool is_aborted = *(bool*)user_data; - return !is_aborted; - }; - wparams.encoder_begin_callback_user_data = &is_aborted; - } + wparams.vad_params.threshold = params.vad_threshold; + wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms; + wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms; + wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s; + wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms; + wparams.vad_params.samples_overlap = params.vad_samples_overlap; if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) { fprintf(stderr, "failed to process audio\n"); @@ -385,14 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { std::string language = whisper_params.Get("language").As(); std::string model = whisper_params.Get("model").As(); std::string input = whisper_params.Get("fname_inp").As(); - bool use_gpu = whisper_params.Get("use_gpu").As(); - bool flash_attn = whisper_params.Get("flash_attn").As(); - bool no_prints = whisper_params.Get("no_prints").As(); - bool no_timestamps = whisper_params.Get("no_timestamps").As(); - bool detect_language = whisper_params.Get("detect_language").As(); - int32_t audio_ctx = whisper_params.Get("audio_ctx").As(); - bool comma_in_time = whisper_params.Get("comma_in_time").As(); - int32_t max_len = whisper_params.Get("max_len").As(); + + bool use_gpu = true; + if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) { + use_gpu = whisper_params.Get("use_gpu").As(); + } + + bool flash_attn = false; + if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) { + flash_attn = whisper_params.Get("flash_attn").As(); + } + + bool no_prints = false; + if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) { + no_prints = whisper_params.Get("no_prints").As(); + } + + bool no_timestamps = false; + if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) { + no_timestamps = whisper_params.Get("no_timestamps").As(); + } + + bool detect_language = false; + if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) { + detect_language = whisper_params.Get("detect_language").As(); + } + + int32_t audio_ctx = 0; + if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) { + audio_ctx = whisper_params.Get("audio_ctx").As(); + } + + bool comma_in_time = true; + if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) { + comma_in_time = whisper_params.Get("comma_in_time").As(); + } + + int32_t max_len = 0; + if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) { + max_len = whisper_params.Get("max_len").As(); + } // Add support for max_context int32_t max_context = -1; @@ -408,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { // Add support for print_progress bool print_progress = false; - if (whisper_params.Has("print_progress")) { + if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) { print_progress = whisper_params.Get("print_progress").As(); } // Add support for progress_callback @@ -417,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { progress_callback = whisper_params.Get("progress_callback").As(); } + // Add support for VAD parameters + bool vad = false; + if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) { + vad = whisper_params.Get("vad").As(); + } + + std::string vad_model = ""; + if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) { + vad_model = whisper_params.Get("vad_model").As(); + } + + float vad_threshold = 0.5f; + if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) { + vad_threshold = whisper_params.Get("vad_threshold").As(); + } + + int vad_min_speech_duration_ms = 250; + if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) { + vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As(); + } + + int vad_min_silence_duration_ms = 100; + if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) { + vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As(); + } + + float vad_max_speech_duration_s = FLT_MAX; + if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) { + vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As(); + } + + int vad_speech_pad_ms = 30; + if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) { + vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As(); + } + + float vad_samples_overlap = 0.1f; + if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) { + vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As(); + } + Napi::Value pcmf32Value = whisper_params.Get("pcmf32"); std::vector pcmf32_vec; if (pcmf32Value.IsTypedArray()) { @@ -444,6 +528,16 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { params.prompt = prompt; params.detect_language = detect_language; + // Set VAD parameters + params.vad = vad; + params.vad_model = vad_model; + params.vad_threshold = vad_threshold; + params.vad_min_speech_duration_ms = vad_min_speech_duration_ms; + params.vad_min_silence_duration_ms = vad_min_silence_duration_ms; + params.vad_max_speech_duration_s = vad_max_speech_duration_s; + params.vad_speech_pad_ms = vad_speech_pad_ms; + params.vad_samples_overlap = vad_samples_overlap; + Napi::Function callback = info[1].As(); // Create a new Worker class with progress callback support ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env); diff --git a/examples/addon.node/vad-example.js b/examples/addon.node/vad-example.js new file mode 100644 index 00000000000..a9e0dae7adf --- /dev/null +++ b/examples/addon.node/vad-example.js @@ -0,0 +1,132 @@ +const path = require("path"); +const { whisper } = require(path.join( + __dirname, + "../../build/Release/addon.node" +)); +const { promisify } = require("util"); + +const whisperAsync = promisify(whisper); + +// Example with VAD enabled +const vadParams = { + language: "en", + model: path.join(__dirname, "../../models/ggml-base.en.bin"), + fname_inp: path.join(__dirname, "../../samples/jfk.wav"), + use_gpu: true, + flash_attn: false, + no_prints: false, + comma_in_time: true, + translate: false, + no_timestamps: false, + detect_language: false, + audio_ctx: 0, + max_len: 0, + // VAD parameters + vad: true, + vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model + vad_threshold: 0.5, + vad_min_speech_duration_ms: 250, + vad_min_silence_duration_ms: 100, + vad_max_speech_duration_s: 30.0, + vad_speech_pad_ms: 30, + vad_samples_overlap: 0.1, + progress_callback: (progress) => { + console.log(`VAD Transcription progress: ${progress}%`); + } +}; + +// Example without VAD (traditional approach) +const traditionalParams = { + language: "en", + model: path.join(__dirname, "../../models/ggml-base.en.bin"), + fname_inp: path.join(__dirname, "../../samples/jfk.wav"), + use_gpu: true, + flash_attn: false, + no_prints: false, + comma_in_time: true, + translate: false, + no_timestamps: false, + detect_language: false, + audio_ctx: 0, + max_len: 0, + vad: false, // Explicitly disable VAD + progress_callback: (progress) => { + console.log(`Traditional transcription progress: ${progress}%`); + } +}; + +async function runVADExample() { + try { + console.log("=== Whisper.cpp Node.js VAD Example ===\n"); + + // Check if VAD model exists + const fs = require('fs'); + if (!fs.existsSync(vadParams.vad_model)) { + console.log("⚠️ VAD model not found. Please download the VAD model first:"); + console.log(" ./models/download-vad-model.sh silero-v5.1.2"); + console.log(" Or run: python models/convert-silero-vad-to-ggml.py"); + console.log("\n Falling back to traditional transcription without VAD...\n"); + + // Run without VAD + console.log("🎵 Running traditional transcription..."); + const traditionalResult = await whisperAsync(traditionalParams); + console.log("\n📝 Traditional transcription result:"); + console.log(traditionalResult); + return; + } + + console.log("🎵 Running transcription with VAD enabled..."); + console.log("VAD Parameters:"); + console.log(` - Threshold: ${vadParams.vad_threshold}`); + console.log(` - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`); + console.log(` - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`); + console.log(` - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`); + console.log(` - Speech padding: ${vadParams.vad_speech_pad_ms}ms`); + console.log(` - Samples overlap: ${vadParams.vad_samples_overlap}\n`); + + const startTime = Date.now(); + const vadResult = await whisperAsync(vadParams); + const vadDuration = Date.now() - startTime; + + console.log("\n✅ VAD transcription completed!"); + console.log(`⏱️ Processing time: ${vadDuration}ms`); + console.log("\n📝 VAD transcription result:"); + console.log(vadResult); + + // Compare with traditional approach + console.log("\n🔄 Running traditional transcription for comparison..."); + const traditionalStartTime = Date.now(); + const traditionalResult = await whisperAsync(traditionalParams); + const traditionalDuration = Date.now() - traditionalStartTime; + + console.log("\n✅ Traditional transcription completed!"); + console.log(`⏱️ Processing time: ${traditionalDuration}ms`); + console.log("\n📝 Traditional transcription result:"); + console.log(traditionalResult); + + // Performance comparison + console.log("\n📊 Performance Comparison:"); + console.log(`VAD: ${vadDuration}ms`); + console.log(`Traditional: ${traditionalDuration}ms`); + const speedup = traditionalDuration / vadDuration; + if (speedup > 1) { + console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`); + } else { + console.log(`ℹ️ Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`); + } + + } catch (error) { + console.error("❌ Error during transcription:", error); + } +} + +// Run the example +if (require.main === module) { + runVADExample(); +} + +module.exports = { + runVADExample, + vadParams, + traditionalParams +}; \ No newline at end of file From 869335f2d58d04010535be9ae23a69a9da12a169 Mon Sep 17 00:00:00 2001 From: accessiblepixel <62973371+accessiblepixel@users.noreply.github.com> Date: Mon, 7 Jul 2025 10:51:15 +0100 Subject: [PATCH 0048/1348] server : add dtw.params for v3-large-turbo (#3307) * Add DTW model large-v3-turbo parameters to server.cpp example DTW support is available in whispercpp and the large-v3-turbo model has already been added to the sources, but the large-v3-turbo model hasn't been added to the server.cpp file to make use of it. This commit hopefully corrects that issue. * match original linebreak of original server.cpp file after adding large.v3.turbo dtw --- examples/server/server.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8b6c5a96720..643d08a799a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -674,7 +674,10 @@ int main(int argc, char ** argv) { if (params.dtw == "large.v3") { cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3; } - + if (params.dtw == "large.v3.turbo") { + cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO; + } + if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) { fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str()); return 3; From 5ea5c5876897ef10c2d573df1cfcf085f1d25f5f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 2 Jul 2025 13:55:32 +0200 Subject: [PATCH 0049/1348] ggml : add version function to get lib version (ggml/1286) * ggml : add version function to get lib version This commit adds a function `ggml_version()` to the ggml library that returns the version of the library as a string. The motivation for this is that it can be useful to be able to programmatically check the version of the ggml library being used. Usage: ```c printf("GGML version: %s\n", ggml_version()); ``` Output: ```console GGML version: 0.0.2219 ``` * ggml : add ggml_commit() --------- Co-authored-by: Georgi Gerganov --- ggml/CMakeLists.txt | 7 +++++++ ggml/include/ggml.h | 3 +++ ggml/src/ggml.c | 8 ++++++++ 3 files changed, 18 insertions(+) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 215eb234868..675f3bf7567 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -360,6 +360,13 @@ write_basic_package_version_file( VERSION ${GGML_INSTALL_VERSION} COMPATIBILITY SameMajorVersion) +target_compile_definitions(ggml-base PRIVATE + GGML_VERSION="${GGML_INSTALL_VERSION}" + GGML_COMMIT="${GGML_BUILD_COMMIT}" +) +message(STATUS "ggml version: ${GGML_INSTALL_VERSION}") +message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}") + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f8238f3159b..abbfcc831b0 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -639,6 +639,9 @@ extern "C" { // misc + GGML_API const char * ggml_version(void); + GGML_API const char * ggml_commit(void); + GGML_API void ggml_time_init(void); // call this once at the beginning of the program GGML_API int64_t ggml_time_ms(void); GGML_API int64_t ggml_time_us(void); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c51cb57cc68..4d31b2f0722 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -458,6 +458,14 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; } +const char * ggml_version(void) { + return GGML_VERSION; +} + +const char * ggml_commit(void) { + return GGML_COMMIT; +} + // // timing // From f98cb6607b0087bdefc69500907d953019e4cac0 Mon Sep 17 00:00:00 2001 From: Acly Date: Thu, 3 Jul 2025 19:47:15 +0200 Subject: [PATCH 0050/1348] vulkan : implement ggml_roll (ggml/1290) * vulkan : implement ggml_roll * vulkan : refactor vk_op_unary_push_constants initialization --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 174 ++++++++---------- ggml/src/ggml-vulkan/vulkan-shaders/roll.comp | 46 +++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 + 3 files changed, 127 insertions(+), 95 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/roll.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4696f1fe46e..0d12dd97547 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -417,6 +417,7 @@ struct vk_device_struct { vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; + vk_pipeline pipeline_roll_f32; vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16; vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16; @@ -687,6 +688,37 @@ struct vk_op_unary_push_constants { }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); +static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) { + GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst))); + ne = ne != 0 ? ne : ggml_nelements(dst); + GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); + + vk_op_unary_push_constants p{}; + p.ne = (uint32_t)ne; + + size_t src0_tsize = ggml_type_size(src0->type); + p.ne00 = (uint32_t)src0->ne[0]; + p.ne01 = (uint32_t)src0->ne[1]; + p.ne02 = (uint32_t)src0->ne[2]; + p.ne03 = (uint32_t)src0->ne[3]; + p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); + p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); + p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); + p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); + + size_t dst_tsize = ggml_type_size(dst->type); + p.ne10 = (uint32_t)dst->ne[0]; + p.ne11 = (uint32_t)dst->ne[1]; + p.ne12 = (uint32_t)dst->ne[2]; + p.ne13 = (uint32_t)dst->ne[3]; + p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); + p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); + p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); + p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); + + return p; // fastdiv values and offsets are initialized later in ggml_vk_op +} + // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. // Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) @@ -2753,6 +2785,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -6425,6 +6459,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_pad_f32; } return nullptr; + case GGML_OP_ROLL: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_roll_f32; + } + return nullptr; case GGML_OP_REPEAT: if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { return ctx->device->pipeline_repeat_f32; @@ -6965,6 +7004,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_CPY: @@ -7416,117 +7456,60 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - float * op_params = (float *)dst->op_params; - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - op_params[0], 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun); } static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - float * op_params = (float *)dst->op_params; - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); + p.param2 = ggml_get_op_params_f32(dst, 1); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - op_params[0], op_params[1], - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun); } static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun); +} - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); +static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + const int32_t s0 = ggml_get_op_params_i32(dst, 0); + const int32_t s1 = ggml_get_op_params_i32(dst, 1); + const int32_t s2 = ggml_get_op_params_i32(dst, 2); + const int32_t s3 = ggml_get_op_params_i32(dst, 3); + const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000); + const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000); + + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + memcpy(&p.param1, &s01_packed, sizeof(float)); + memcpy(&p.param2, &s23_packed, sizeof(float)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun); } static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun); } static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun); } static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { @@ -7544,14 +7527,8 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const } } - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { - ne, - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun); } static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -8862,6 +8839,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -9031,6 +9009,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_PAD: ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_ROLL: + ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_CPY: case GGML_OP_CONT: @@ -9247,6 +9229,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -10368,6 +10351,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_CONCAT: case GGML_OP_SCALE: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp new file mode 100644 index 00000000000..b9abe8dedcf --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp @@ -0,0 +1,46 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +uint wrap_idx(int i, uint ne) { + if (i < 0) { + return i + ne; + } else if (i >= ne) { + return i - ne; + } + return i; +} + +void main() { + const uint idx = get_idx(); + if (idx >= p.ne) { + return; + } + + const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; + const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L); + const uint i2_offset = i2*p.ne11*p.ne10; + const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L); + const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; + + const uint p1 = floatBitsToUint(p.param1); + const uint p2 = floatBitsToUint(p.param2); + const int s0 = int(p1 >> 16) - 0x8000; + const int s1 = int(p1 & 0xFFFF) - 0x8000; + const int s2 = int(p2 >> 16) - 0x8000; + const int s3 = int(p2 & 0xFFFF) - 0x8000; + + const uint i00 = wrap_idx(int(i0) - s0, p.ne10); + const uint i01 = wrap_idx(int(i1) - s1, p.ne11); + const uint i02 = wrap_idx(int(i2) - s2, p.ne12); + const uint i03 = wrap_idx(int(i3) - s3, p.ne13); + + const uint a_idx = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; + const uint d_idx = i3 *p.nb13 + i2 *p.nb12 + i1 *p.nb11 + i0 *p.nb10; + + data_d[get_doffset() + d_idx] = D_TYPE(data_a[get_aoffset() + a_idx]); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 23fc50bf295..5b527b104cc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -642,6 +642,8 @@ void process_shaders() { string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); + string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + for (auto &c : compiles) { c.wait(); } From 19b9aaf0444563d7fd449e2596c653c56b9fae10 Mon Sep 17 00:00:00 2001 From: Acly Date: Thu, 3 Jul 2025 19:58:12 +0200 Subject: [PATCH 0051/1348] vulkan : implement bilinear interpolation for ggml_upscale/ggml_interpolate (ggml/1291) * supports GGML_SCALE_MODE_BILINEAR and GGML_SCALE_FLAG_ALIGN_CORNERS --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 35 ++++++--- .../ggml-vulkan/vulkan-shaders/upscale.comp | 74 +++++++++++++++++-- 2 files changed, 95 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0d12dd97547..110348bf065 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -410,7 +410,7 @@ struct vk_device_struct { vk_pipeline pipeline_div_norepeat[2][2][2]; vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; - vk_pipeline pipeline_upscale_f32; + vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_sin_f32; @@ -880,6 +880,7 @@ struct vk_op_conv2d_dw_push_constants { struct vk_op_upscale_push_constants { uint32_t ne; uint32_t a_offset; uint32_t d_offset; + uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; float sf0; float sf1; float sf2; float sf3; @@ -2773,7 +2774,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1); ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -6425,8 +6428,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_UPSCALE: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) { - return ctx->device->pipeline_upscale_f32; + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + int mode = ggml_get_op_params_i32(dst, 0); + switch (mode) { + case GGML_SCALE_MODE_NEAREST: + return ctx->device->pipeline_upscale_nearest_f32; + case GGML_SCALE_MODE_BILINEAR: + return ctx->device->pipeline_upscale_bilinear_f32; + case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS: + return ctx->device->pipeline_upscale_bilinear_ac_f32; + } } return nullptr; case GGML_OP_SCALE: @@ -7441,14 +7452,21 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); - const float sf0 = (float)dst->ne[0] / src0->ne[0]; - const float sf1 = (float)dst->ne[1] / src0->ne[1]; - const float sf2 = (float)dst->ne[2] / src0->ne[2]; - const float sf3 = (float)dst->ne[3] / src0->ne[3]; + float sf0 = (float)dst->ne[0] / src0->ne[0]; + float sf1 = (float)dst->ne[1] / src0->ne[1]; + float sf2 = (float)dst->ne[2] / src0->ne[2]; + float sf3 = (float)dst->ne[3] / src0->ne[3]; + + if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1); + sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1); + } ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { (uint32_t)ggml_nelements(dst), 0, 0, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], sf0, sf1, sf2, sf3, @@ -10346,7 +10364,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_UPSCALE: - return op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_ACC: case GGML_OP_CONCAT: case GGML_OP_SCALE: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp index 6f607380df8..d78c6190d93 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp @@ -3,6 +3,7 @@ layout (push_constant) uniform parameter { uint ne; uint a_offset; uint d_offset; + uint ne00; uint ne01; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; float sf0; float sf1; float sf2; float sf3; @@ -15,6 +16,61 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; +// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag +#define NEAREST 0 +#define BILINEAR 1 +#define ALIGN_CORNERS (1 << 8) + +layout (constant_id = 0) const uint scale_mode = 0; + +float fetch_nearest(uint i10, uint i11, uint i12, uint i13) { + const uint i00 = uint(i10 / p.sf0); + const uint i01 = uint(i11 / p.sf1); + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); + + return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]; +} + +float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) { + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); + const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02; + + const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00]; + const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00]; + const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00]; + const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00]; + + return + v00 * (1.0-d.x) * (1.0-d.y) + + v01 * d.x * (1.0-d.y) + + v10 * (1.0-d.x) * d.y + + v11 * d.x * d.y; +} + +float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) { + const ivec2 ne0 = ivec2(p.ne00, p.ne01); + + const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5; + const vec2 c0f = floor(c); + const vec2 d = c - c0f; + const ivec2 c0 = max(ivec2(c0f), 0); + const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1); + + return fetch_bilinear(c0, c1, d, i12, i13); +} + +float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) { + const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1); + const vec2 c0f = floor(c); + const vec2 d = c - c0f; + const ivec2 c0 = ivec2(c0f); + const ivec2 c1 = c0 + 1; + + return fetch_bilinear(c0, c1, d, i12, i13); +} + void main() { const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; @@ -27,10 +83,18 @@ void main() { const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; - const uint i00 = uint(i10 / p.sf0); - const uint i01 = uint(i11 / p.sf1); - const uint i02 = uint(i12 / p.sf2); - const uint i03 = uint(i13 / p.sf3); + float result; + switch (scale_mode) { + case NEAREST: + result = fetch_nearest(i10, i11, i12, i13); + break; + case BILINEAR: + result = interpolate_bilinear(i10, i11, i12, i13); + break; + case BILINEAR | ALIGN_CORNERS: + result = interpolate_bilinear_align_corners(i10, i11, i12, i13); + break; + } - data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); + data_d[p.d_offset + idx] = D_TYPE(result); } From 6fc80e84568e84a6d896f7676012064977f02f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 1 Jul 2025 10:14:21 +0200 Subject: [PATCH 0052/1348] add GELU_ERF (llama/14455) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 11 ++++++ .../ggml-vulkan/vulkan-shaders/gelu_erf.comp | 39 +++++++++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 + 3 files changed, 52 insertions(+) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 110348bf065..fae7851a616 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -432,6 +432,7 @@ struct vk_device_struct { // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_gelu[2]; + vk_pipeline pipeline_gelu_erf[2]; vk_pipeline pipeline_gelu_quick[2]; vk_pipeline pipeline_silu[2]; vk_pipeline pipeline_relu[2]; @@ -2798,6 +2799,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); CREATE_UNARY(gelu) + CREATE_UNARY(gelu_erf) CREATE_UNARY(gelu_quick) CREATE_UNARY(silu) CREATE_UNARY(relu) @@ -6531,6 +6533,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU: return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_GELU_ERF: + return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU_QUICK: return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_RELU: @@ -8822,6 +8826,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: @@ -9072,6 +9077,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: @@ -9290,6 +9296,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: @@ -10096,6 +10103,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: @@ -10836,6 +10844,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { case GGML_UNARY_OP_GELU: tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]); break; + case GGML_UNARY_OP_GELU_ERF: + tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]); + break; case GGML_UNARY_OP_GELU_QUICK: tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]); break; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp new file mode 100644 index 00000000000..5fd5a5e703a --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp @@ -0,0 +1,39 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation + // ref: https://www.johndcook.com/blog/python_erf/ + const float p_erf = 0.3275911f; + const float a1_erf = 0.254829592f; + const float a2_erf = -0.284496736f; + const float a3_erf = 1.421413741f; + const float a4_erf = -1.453152027f; + const float a5_erf = 1.061405429f; + + const float SQRT_2_INV = 0.70710678118654752440084436210484f; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + const float a = float(data_a[i]); + const float a_div_sqr2 = a * SQRT_2_INV; + const float sign_x = sign(a_div_sqr2); + const float x = abs(a_div_sqr2); + const float t = 1.0f / (1.0f + p_erf * x); + const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + const float erf_approx = sign_x * y; + + data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx)); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 5b527b104cc..4658ad0c9c2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -574,6 +574,8 @@ void process_shaders() { string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("gelu_erf_f32", "gelu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("silu_f16", "silu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); From 3c7939cfe5c6432777ec287e38c50e65a1232dce Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 1 Jul 2025 03:43:08 -0500 Subject: [PATCH 0053/1348] vulkan: Split large mul_mat_id to fit in shared memory (llama/14451) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 +++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index fae7851a616..09157bf5497 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -6003,7 +6003,30 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } else { - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); + // Split based on number of ids, to fit in shared memory + const uint32_t nei0 = (uint32_t)src2->ne[0]; + const uint32_t nei1 = (uint32_t)src2->ne[1]; + + GGML_ASSERT(nei0 <= 4096); + const uint32_t split_size = std::min(nei1, 4096u / nei0); + + ggml_tensor src1_copy = *src1; + ggml_tensor src2_copy = *src2; + ggml_tensor dst_copy = *dst; + + for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) { + const uint32_t n_tokens = std::min(split_size, nei1 - token_start); + + src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2]; + src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1]; + dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2]; + + src1_copy.ne[2] = n_tokens; + src2_copy.ne[1] = n_tokens; + dst_copy.ne[2] = n_tokens; + + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun); + } } } @@ -10136,9 +10159,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm ggml_type src0_type = op->src[0]->type; ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); - if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) { - // If there's not enough shared memory for row_ids and the result tile, fallback to CPU - return false; + if (op->op == GGML_OP_MUL_MAT_ID) { + if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) { + // If there's not enough shared memory for row_ids and the result tile, fallback to CPU + return false; + } + // Check against size of shared memory variable + if (op->src[2]->ne[0] > 4096) { + return false; + } } switch (src0_type) { case GGML_TYPE_F32: From 32d7c10766667e6e5da36c0e730cdc3985c7e345 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Tue, 1 Jul 2025 16:47:30 +0800 Subject: [PATCH 0054/1348] CANN: update aclnnGroupedMatmulV2 to aclnnGroupedMatmulV3 (llama/14411) * [CANN]update to aclnnGroupedMatmulV2 Signed-off-by: noemotiovon <757486878@qq.com> * Support MUL_MAT_ID on 310p Signed-off-by: noemotiovon <757486878@qq.com> * fix editorconfig Signed-off-by: noemotiovon <757486878@qq.com> --------- Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 69 ++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 437ece2d4a3..69483de8f36 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include #include @@ -2654,6 +2654,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb)); } +#ifdef ASCEND_310P + ggml_tensor src0_row = *src0; + ggml_tensor src1_row = *src1; + ggml_tensor dst_row = *dst; + + if (src0->type == GGML_TYPE_F16) { + src0_row.type = GGML_TYPE_F32; + } + + // src0_row [D, M, 1, 1] weight without permute + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[0] = ori_src0_nb[0]; + src0_row.nb[1] = ori_src0_nb[1]; + src0_row.nb[2] = ori_src0_nb[1]; + src0_row.nb[3] = ori_src0_nb[1]; + + // src1_row [D, 1, 1, 1] -> input + src1_row.ne[1] = 1; + src1_row.ne[2] = 1; + src1_row.ne[3] = 1; + src1_row.nb[2] = nb11; + src1_row.nb[3] = nb11; + + // dst_row [M, 1, 1, 1] -> out + dst_row.ne[1] = 1; + dst_row.ne[2] = 1; + dst_row.ne[3] = 1; + dst_row.nb[2] = nb1; + dst_row.nb[3] = nb1; + + //create weight for one row + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + // expert index + int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + // If B = 1 (broadcast), always use 0; otherwise, use id. + int64_t i11 = (ne11 == 1 ? 0 : id); + int64_t i12 = iid1; + + int64_t i1 = id; + int64_t i2 = i12; + + void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2]; + void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12; + void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2; + + src0_row.data = src0_tmp_ptr; + src1_row.data = src1_tmp_ptr; + dst_row.data = dst_tmp_ptr; + dst_row.src[0] = &src0_row; + dst_row.src[1] = &src1_row; + + ggml_cann_mul_mat(ctx, &dst_row); + } + } + return; +#endif + std::vector src0_tensor_vec; std::vector src1_tensor_vec; std::vector dst_tensor_vec; @@ -2701,9 +2762,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* } size_t GROUP_SIZE = 128; - // GroupedMatmulV2 required tensor_list.size < 128 + // GroupedMatmulV3 required tensor_list.size < 128 for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) { - // split and call GroupedMatmulV2 + // split and call GroupedMatmulV3 size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size()); std::vector src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end); std::vector src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end); @@ -2713,7 +2774,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size()); aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size()); - GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list, + GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list); ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list); From cdf5e72163517e1c271ed36a02a2af2e2f44167c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 18:04:08 +0300 Subject: [PATCH 0055/1348] ci : disable fast-math for Metal GHA CI (llama/14478) * ci : disable fast-math for Metal GHA CI ggml-ci * cont : remove -g flag ggml-ci --- ggml/src/ggml-metal/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index 77187efc175..0ca8a3c55ec 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -71,7 +71,9 @@ else() # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 # note: unfortunately, we have to call it default.metallib instead of ggml.metallib # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) + # note: adding -g causes segmentation fault during compile + #set(XC_FLAGS -fno-fast-math -fno-inline -g) + set(XC_FLAGS -fno-fast-math -fno-inline) else() set(XC_FLAGS -O3) endif() @@ -90,7 +92,7 @@ else() add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - | - xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal DEPENDS ggml-metal.metal ${METALLIB_COMMON} From ab850d4680f415f7b804a0a5a89defca03f3b833 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ganster?= Date: Wed, 2 Jul 2025 07:19:31 +0200 Subject: [PATCH 0056/1348] ggml : Callback before abort (llama/14481) * Add a callback that will be called just before abort. This allows apps without a console to display a message to the user and save data if needed. * Return previous callback to allow callback chaining * style fixes --------- Co-authored-by: Diego Devesa --- ggml/include/ggml.h | 7 +++++++ ggml/src/ggml.c | 23 +++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index abbfcc831b0..5ceccb8fce1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -314,6 +314,13 @@ extern "C" { #endif + // Function type used in fatal error callbacks + typedef void (*ggml_abort_callback_t)(const char * error_message); + + // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout) + // Returns the old callback for chaining + GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback); + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4d31b2f0722..814f01b7bc8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -202,19 +202,34 @@ void ggml_print_backtrace(void) { } #endif +static ggml_abort_callback_t g_abort_callback = NULL; + +// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout) +GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback) { + ggml_abort_callback_t ret_val = g_abort_callback; + g_abort_callback = callback; + return ret_val; +} + void ggml_abort(const char * file, int line, const char * fmt, ...) { fflush(stdout); - fprintf(stderr, "%s:%d: ", file, line); + char message[2048]; + int offset = snprintf(message, sizeof(message), "%s:%d: ", file, line); va_list args; va_start(args, fmt); - vfprintf(stderr, fmt, args); + vsnprintf(message + offset, sizeof(message) - offset, fmt, args); va_end(args); - fprintf(stderr, "\n"); + if (g_abort_callback) { + g_abort_callback(message); + } else { + // default: print error and backtrace to stderr + fprintf(stderr, "%s\n", message); + ggml_print_backtrace(); + } - ggml_print_backtrace(); abort(); } From e43c38f9f1ea816232a6307873566154fae8f34a Mon Sep 17 00:00:00 2001 From: lhez Date: Wed, 2 Jul 2025 00:07:42 -0700 Subject: [PATCH 0057/1348] opencl : update upscale to support align corners (llama/14488) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 56 +++++++++++++++---------- ggml/src/ggml-opencl/kernels/upscale.cl | 5 +-- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 496e47575e9..518a18ef3a7 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4453,7 +4453,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF); cl_kernel kernel = nullptr; if (mode == GGML_SCALE_MODE_NEAREST) { @@ -4484,18 +4485,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3]; - const int ne00_src = src0->ne[0]; - const int ne01_src = src0->ne[1]; + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; - const int ne10_dst = dst->ne[0]; - const int ne11_dst = dst->ne[1]; - const int ne12_dst = dst->ne[2]; - const int ne13_dst = dst->ne[3]; + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; + + float sf0 = (float)ne0 / ne00; + float sf1 = (float)ne1 / ne01; + float sf2 = (float)ne2 / ne02; + float sf3 = (float)ne3 / ne03; - const float sf0 = (float)dst->ne[0] / src0->ne[0]; - const float sf1 = (float)dst->ne[1] / src0->ne[1]; - const float sf2 = (float)dst->ne[2] / src0->ne[2]; - const float sf3 = (float)dst->ne[3] / src0->ne[3]; + float pixel_offset = 0.5f; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); @@ -4507,29 +4512,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03)); if (mode == GGML_SCALE_MODE_NEAREST) { - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3)); } else if (mode == GGML_SCALE_MODE_BILINEAR) { - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst)); + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = (float)(ne0 - 1) / (ne00 - 1); + sf1 = (float)(ne1 - 1) / (ne01 - 1); + pixel_offset = 0.0f; + } + + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset)); } - size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst; + size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3; if (dst_total_elements == 0) { return; } diff --git a/ggml/src/ggml-opencl/kernels/upscale.cl b/ggml/src/ggml-opencl/kernels/upscale.cl index 219d31dbb92..25c68351bae 100644 --- a/ggml/src/ggml-opencl/kernels/upscale.cl +++ b/ggml/src/ggml-opencl/kernels/upscale.cl @@ -60,7 +60,8 @@ kernel void kernel_upscale_bilinear( float sf0, float sf1, float sf2, - float sf3 + float sf3, + float pixel_offset ) { global const char * src_base = (global const char *)p_src0 + off_src0; global float * dst_base = (global float *)((global char *)p_dst + off_dst); @@ -80,8 +81,6 @@ kernel void kernel_upscale_bilinear( int i02_src = (int)(i12_dst / sf2); int i03_src = (int)(i13_dst / sf3); - const float pixel_offset = 0.5f; - float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; long y0_src = (long)floor(y_src_f); long y1_src = y0_src + 1; From cd87a2f7e0c09eff56800d5a64c6a46b6f0a2bf6 Mon Sep 17 00:00:00 2001 From: Eric Zhang <34133756+EZForever@users.noreply.github.com> Date: Wed, 2 Jul 2025 19:00:04 +0800 Subject: [PATCH 0058/1348] opencl : skip empty nodes on cgraph compute (llama/14491) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 518a18ef3a7..f747f756816 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2187,7 +2187,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm // dependencies. sync_with_other_backends(backend); - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } From 2e04b81f3ece0465ce66fd984dcf71636f40b338 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Jul 2025 20:38:10 +0800 Subject: [PATCH 0059/1348] opencl : fix possible buffer overflow in dump_tensor (llama/14490) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index f747f756816..71829c05a5b 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -3199,7 +3199,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso // Open file and dump. char fname[512]; - sprintf(fname, "./tensor-dumps/%s.txt", tensor->name); + snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name); FILE * f = fopen(fname, "w"); if (!f) { printf("Failed to open %s\n", fname); From d1286cf32b1b18219d33b692ee52383c25428e7f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 12 Jul 2025 14:33:49 +0300 Subject: [PATCH 0060/1348] ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (llama/14435) --- ggml/CMakeLists.txt | 7 - ggml/include/ggml.h | 26 +- ggml/src/ggml-cann/ggml-cann.cpp | 7 +- ggml/src/ggml-cpu/ops.cpp | 109 ++++----- ggml/src/ggml-cuda/ggml-cuda.cu | 11 +- ggml/src/ggml-metal/ggml-metal-impl.h | 20 +- ggml/src/ggml-metal/ggml-metal.m | 23 +- ggml/src/ggml-metal/ggml-metal.metal | 72 +++--- ggml/src/ggml-sycl/ggml-sycl.cpp | 10 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 222 +++++++++--------- .../ggml-vulkan/vulkan-shaders/upscale.comp | 74 +----- .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 - ggml/src/ggml.c | 20 +- 13 files changed, 301 insertions(+), 302 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 675f3bf7567..215eb234868 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -360,13 +360,6 @@ write_basic_package_version_file( VERSION ${GGML_INSTALL_VERSION} COMPATIBILITY SameMajorVersion) -target_compile_definitions(ggml-base PRIVATE - GGML_VERSION="${GGML_INSTALL_VERSION}" - GGML_COMMIT="${GGML_BUILD_COMMIT}" -) -message(STATUS "ggml version: ${GGML_INSTALL_VERSION}") -message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}") - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 5ceccb8fce1..a92495dbb00 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -646,9 +646,6 @@ extern "C" { // misc - GGML_API const char * ggml_version(void); - GGML_API const char * ggml_commit(void); - GGML_API void ggml_time_init(void); // call this once at the beginning of the program GGML_API int64_t ggml_time_ms(void); GGML_API int64_t ggml_time_us(void); @@ -1513,8 +1510,14 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // a [ne0, ne01, ne02, ne03] + // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional + // + // broadcast: + // ne02 % ne12 == 0 + // ne03 % ne13 == 0 + // // fused soft_max(a*scale + mask*(ALiBi slope)) - // mask is optional // max_bias = 0.0f for no ALiBi GGML_API struct ggml_tensor * ggml_soft_max_ext( struct ggml_context * ctx, @@ -1977,11 +1980,16 @@ extern "C" { #define GGML_KQ_MASK_PAD 64 - // q: [n_embd_k, n_batch, n_head, 1] - // k: [n_embd_k, n_kv, n_head_kv, 1] - // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !! - // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !! - // res: [n_embd_v, n_head, n_batch, 1] !! permuted !! + // q: [n_embd_k, n_batch, n_head, ne3] + // k: [n_embd_k, n_kv, n_head_kv, ne3] + // v: [n_embd_v, n_kv, n_head_kv, ne3] !! not transposed !! + // mask: [n_kv, n_batch_pad, ne32, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !! + // res: [n_embd_v, n_head, n_batch, ne3] !! permuted !! + // + // broadcast: + // n_head % n_head_kv == 0 + // ne3 % ne32 == 0 + // GGML_API struct ggml_tensor * ggml_flash_attn_ext( struct ggml_context * ctx, struct ggml_tensor * q, diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index d1a0ad374d6..8a3d2026ed8 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2187,7 +2187,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_SQRT: case GGML_OP_CLAMP: case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: @@ -2205,6 +2204,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_PAD_REFLECT_1D: case GGML_OP_COUNT_EQUAL: return true; + case GGML_OP_SOFT_MAX: + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 + return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1); case GGML_OP_FLASH_ATTN_EXT:{ // derived from [ggml-cuda.cu] if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){ @@ -2227,6 +2230,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, // DeepSeek MLA return false; } + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 if (op->src[0]->ne[3] != 1) { return false; } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index dd83efde714..2ae0721e9d1 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5232,14 +5232,17 @@ static void ggml_compute_forward_soft_max_f32( memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); - // TODO: handle transposed/permuted matrices - const int ith = params->ith; const int nth = params->nth; GGML_TENSOR_UNARY_OP_LOCALS - //const int64_t ne11 = src1 ? src1->ne[1] : 1; + const int64_t nb11 = src1 ? src1->nb[1] : 1; + const int64_t nb12 = src1 ? src1->nb[2] : 1; + const int64_t nb13 = src1 ? src1->nb[3] : 1; + + const int64_t ne12 = src1 ? src1->ne[2] : 1; + const int64_t ne13 = src1 ? src1->ne[3] : 1; // TODO: is this supposed to be ceil instead of floor? // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 @@ -5249,68 +5252,66 @@ static void ggml_compute_forward_soft_max_f32( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + float * wp = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); - for (int i1 = ir0; i1 < ir1; i1++) { - // ALiBi - const uint32_t h = (i1/ne01)%ne02; // head - const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; - - float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); - float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); - - // broadcast the mask across rows - ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; - float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; - - ggml_vec_cpy_f32 (nc, wp, sp); - ggml_vec_scale_f32(nc, wp, scale); - if (mp_f32) { - if (use_f16) { - for (int i = 0; i < nc; ++i) { - wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]); - } - } else { - for (int i = 0; i < nc; ++i) { - wp[i] += slope*mp_f32[i]; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const int64_t i11 = i01; + const int64_t i12 = i02%ne12; + const int64_t i13 = i03%ne13; + + // ALiBi + const uint32_t h = i02; // head + const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; + + float * sp = (float *)((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + float * dp = (float *)((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + // broadcast the mask across rows + ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL; + float * mp_f32 = src1 ? (float *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL; + + ggml_vec_cpy_f32 (ne00, wp, sp); + ggml_vec_scale_f32(ne00, wp, scale); + if (mp_f32) { + if (use_f16) { + for (int i = 0; i < ne00; ++i) { + wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]); + } + } else { + for (int i = 0; i < ne00; ++i) { + wp[i] += slope*mp_f32[i]; + } + } } - } - } #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(wp[i])); - } + for (int i = 0; i < ne00; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(wp[i])); + } #endif - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, wp); + float max = -INFINITY; + ggml_vec_max_f32(ne00, &max, wp); - ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max); - assert(sum > 0.0); + ggml_float sum = ggml_vec_soft_max_f32(ne00, dp, wp, max); + assert(sum > 0.0); - sum = 1.0/sum; - ggml_vec_scale_f32(nc, dp, sum); + sum = 1.0/sum; + ggml_vec_scale_f32(ne00, dp, sum); #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - assert(!isnan(dp[i])); - assert(!isinf(dp[i])); - } + for (int i = 0; i < ne00; ++i) { + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); + } #endif + } + } } } @@ -7766,7 +7767,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_type const k_vec_dot_type = ggml_get_type_traits_cpu(k->type)->vec_dot_type; + ggml_type const k_vec_dot_type = ggml_get_type_traits_cpu(k->type)->vec_dot_type; ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits_cpu(k_vec_dot_type)->from_float; ggml_vec_dot_t const kq_vec_dot = ggml_get_type_traits_cpu(k->type)->vec_dot; ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float; @@ -7798,7 +7799,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( memset(VKQ32, 0, DV*sizeof(float)); } - const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL; + const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq3%mask->ne[2])*mask->nb[2]) : NULL; // k indices const int ik3 = iq3 / rk3; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 086f9a56c4a..186492be340 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3327,8 +3327,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CONT: return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: return true; + case GGML_OP_SOFT_MAX: + // TODO: support batching + if (op->src[0]->ne[3] != 1) { + return false; + } + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 + return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1); case GGML_OP_SOFT_MAX_BACK: { float max_bias = 0.0f; memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float)); @@ -3375,6 +3382,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g if (op->src[0]->ne[0] == 192) { return false; } + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 if (op->src[0]->ne[3] != 1) { return false; } diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 7a9aab31684..8c2a9719274 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -229,7 +229,9 @@ typedef struct { uint64_t nb21; uint64_t nb22; uint64_t nb23; + int32_t ne32; uint64_t nb31; + uint64_t nb32; int32_t ne1; int32_t ne2; float scale; @@ -461,9 +463,21 @@ typedef struct { } ggml_metal_kargs_sum_rows; typedef struct { - int64_t ne00; - int64_t ne01; - int64_t ne02; + int32_t ne00; + int32_t ne01; + int32_t ne02; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + int32_t ne11; + int32_t ne12; + int32_t ne13; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; float scale; float max_bias; float m0; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 12a36695789..1a0968ed4d2 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1725,7 +1725,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_MEAN: case GGML_OP_SOFT_MAX: case GGML_OP_GROUP_NORM: - return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); + return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]); case GGML_OP_RMS_NORM: case GGML_OP_L2_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); @@ -2644,10 +2644,7 @@ static bool ggml_metal_encode_node( memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(scale)); memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias)); - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; - - const uint32_t n_head = nrows_x/nrows_y; + const uint32_t n_head = src0->ne[2]; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); @@ -2707,6 +2704,18 @@ static bool ggml_metal_encode_node( /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.ne13 =*/ ne13, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, /*.scale =*/ scale, /*.max_bias =*/ max_bias, /*.m0 =*/ m0, @@ -2726,7 +2735,7 @@ static bool ggml_metal_encode_node( [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DIAG_MASK_INF: { @@ -4979,7 +4988,9 @@ static bool ggml_metal_encode_node( /*.nb21 =*/ nb21, /*.nb22 =*/ nb22, /*.nb23 =*/ nb23, + /*.ne32 =*/ ne32, /*.nb31 =*/ nb31, + /*.nb32 =*/ nb32, /*.ne1 =*/ ne1, /*.ne2 =*/ ne2, /*.scale =*/ scale, diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index dac45c7a99b..dfff669726b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1320,24 +1320,28 @@ kernel void kernel_soft_max( device char * dst, constant ggml_metal_kargs_soft_max & args, threadgroup float * buf [[threadgroup(0)]], - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]], - uint ntg[[threads_per_threadgroup]]) { - const int64_t i03 = (tgpig) / (args.ne02*args.ne01); - const int64_t i02 = (tgpig - i03*args.ne02*args.ne01) / args.ne01; - const int64_t i01 = (tgpig - i03*args.ne02*args.ne01 - i02*args.ne01); + uint3 tptg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + const int32_t i01 = tgpig.x; + + const int32_t i13 = i03%args.ne13; + const int32_t i12 = i02%args.ne12; + const int32_t i11 = i01; - device const float * psrc0 = (device const float *) src0 + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00); - device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*args.ne00 : nullptr; - device float * pdst = (device float *) dst + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00); + device const float * psrc0 = (device const float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr; + device float * pdst = (device float *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3); float slope = 1.0f; // ALiBi if (args.max_bias > 0.0f) { - const int64_t h = i02; + const int32_t h = i02; const float base = h < args.n_head_log2 ? args.m0 : args.m1; const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; @@ -1348,13 +1352,13 @@ kernel void kernel_soft_max( // parallel max float lmax = -INFINITY; - for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { lmax = MAX(lmax, psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)); } // find the max value in the block float max_val = simd_max(lmax); - if (ntg > N_SIMDWIDTH) { + if (tptg.x > N_SIMDWIDTH) { if (sgitg == 0) { buf[tiisg] = -INFINITY; } @@ -1373,7 +1377,7 @@ kernel void kernel_soft_max( // parallel sum float lsum = 0.0f; - for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { const float exp_psrc0 = exp((psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val); lsum += exp_psrc0; pdst[i00] = exp_psrc0; @@ -1385,7 +1389,7 @@ kernel void kernel_soft_max( float sum = simd_sum(lsum); - if (ntg > N_SIMDWIDTH) { + if (tptg.x > N_SIMDWIDTH) { if (sgitg == 0) { buf[tiisg] = 0.0f; } @@ -1404,7 +1408,7 @@ kernel void kernel_soft_max( const float inv_sum = 1.0f/sum; - for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) { pdst[i00] *= inv_sum; } } @@ -1416,23 +1420,27 @@ kernel void kernel_soft_max_4( device char * dst, constant ggml_metal_kargs_soft_max & args, threadgroup float * buf [[threadgroup(0)]], - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]], - uint ntg[[threads_per_threadgroup]]) { - const int64_t i03 = (tgpig) / (args.ne02*args.ne01); - const int64_t i02 = (tgpig - i03*args.ne02*args.ne01) / args.ne01; - const int64_t i01 = (tgpig - i03*args.ne02*args.ne01 - i02*args.ne01); + uint3 tptg[[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + const int32_t i01 = tgpig.x; + + const int32_t i13 = i03%args.ne13; + const int32_t i12 = i02%args.ne12; + const int32_t i11 = i01; - device const float4 * psrc4 = (device const float4 *) src0 + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00)/4; - device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*args.ne00/4 : nullptr; - device float4 * pdst4 = (device float4 *) dst + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00)/4; + device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr; + device float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3); float slope = 1.0f; if (args.max_bias > 0.0f) { - const int64_t h = i02; + const int32_t h = i02; const float base = h < args.n_head_log2 ? args.m0 : args.m1; const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1; @@ -1443,14 +1451,14 @@ kernel void kernel_soft_max_4( // parallel max float4 lmax4 = -INFINITY; - for (int i00 = tpitg; i00 < args.ne00/4; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { lmax4 = fmax(lmax4, psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); float max_val = simd_max(lmax); - if (ntg > N_SIMDWIDTH) { + if (tptg.x > N_SIMDWIDTH) { if (sgitg == 0) { buf[tiisg] = -INFINITY; } @@ -1469,7 +1477,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; - for (int i00 = tpitg; i00 < args.ne00/4; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { const float4 exp_psrc4 = exp((psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; @@ -1483,7 +1491,7 @@ kernel void kernel_soft_max_4( float sum = simd_sum(lsum); - if (ntg > N_SIMDWIDTH) { + if (tptg.x > N_SIMDWIDTH) { if (sgitg == 0) { buf[tiisg] = 0.0f; } @@ -1502,7 +1510,7 @@ kernel void kernel_soft_max_4( const float inv_sum = 1.0f/sum; - for (int i00 = tpitg; i00 < args.ne00/4; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) { pdst4[i00] *= inv_sum; } } @@ -3776,7 +3784,7 @@ kernel void kernel_flash_attn_ext( // load the mask in shared memory #pragma unroll(Q) for (short j = 0; j < Q; ++j) { - device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31); + device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq3%args.ne32)*args.nb32); const float m = pm[ic + tiisg]; @@ -4262,7 +4270,7 @@ kernel void kernel_flash_attn_ext_vec( const bool has_mask = mask != q; // pointer to the mask - device const half * pm = (device const half *) (mask + iq1*args.nb31); + device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq3%args.ne32)*args.nb32); float slope = 1.0f; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ae5e062572e..1d41d7a4be8 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4395,9 +4395,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; case GGML_OP_CONT: return op->src[0]->type != GGML_TYPE_BF16; - case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: - return true; + // TODO: support batching + if (op->src[0]->ne[3] != 1) { + return false; + } + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 + return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1); + case GGML_OP_DIAG_MASK_INF: case GGML_OP_ROPE: case GGML_OP_IM2COL: return true; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 09157bf5497..b8e25ba2098 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -410,14 +410,13 @@ struct vk_device_struct { vk_pipeline pipeline_div_norepeat[2][2][2]; vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; - vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32; + vk_pipeline pipeline_upscale_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_sin_f32; vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; - vk_pipeline pipeline_roll_f32; vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16; vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16; @@ -689,37 +688,6 @@ struct vk_op_unary_push_constants { }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); -static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) { - GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst))); - ne = ne != 0 ? ne : ggml_nelements(dst); - GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); - - vk_op_unary_push_constants p{}; - p.ne = (uint32_t)ne; - - size_t src0_tsize = ggml_type_size(src0->type); - p.ne00 = (uint32_t)src0->ne[0]; - p.ne01 = (uint32_t)src0->ne[1]; - p.ne02 = (uint32_t)src0->ne[2]; - p.ne03 = (uint32_t)src0->ne[3]; - p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); - p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); - p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); - p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); - - size_t dst_tsize = ggml_type_size(dst->type); - p.ne10 = (uint32_t)dst->ne[0]; - p.ne11 = (uint32_t)dst->ne[1]; - p.ne12 = (uint32_t)dst->ne[2]; - p.ne13 = (uint32_t)dst->ne[3]; - p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); - p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); - p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); - p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); - - return p; // fastdiv values and offsets are initialized later in ggml_vk_op -} - // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. // Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) @@ -881,7 +849,6 @@ struct vk_op_conv2d_dw_push_constants { struct vk_op_upscale_push_constants { uint32_t ne; uint32_t a_offset; uint32_t d_offset; - uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; float sf0; float sf1; float sf2; float sf3; @@ -2775,9 +2742,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -2789,8 +2754,6 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -6453,16 +6416,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_UPSCALE: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - int mode = ggml_get_op_params_i32(dst, 0); - switch (mode) { - case GGML_SCALE_MODE_NEAREST: - return ctx->device->pipeline_upscale_nearest_f32; - case GGML_SCALE_MODE_BILINEAR: - return ctx->device->pipeline_upscale_bilinear_f32; - case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS: - return ctx->device->pipeline_upscale_bilinear_ac_f32; - } + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) { + return ctx->device->pipeline_upscale_f32; } return nullptr; case GGML_OP_SCALE: @@ -6495,11 +6450,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_pad_f32; } return nullptr; - case GGML_OP_ROLL: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_roll_f32; - } - return nullptr; case GGML_OP_REPEAT: if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { return ctx->device->pipeline_repeat_f32; @@ -7042,7 +6992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: - case GGML_OP_ROLL: case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_CPY: @@ -7479,21 +7428,14 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); - float sf0 = (float)dst->ne[0] / src0->ne[0]; - float sf1 = (float)dst->ne[1] / src0->ne[1]; - float sf2 = (float)dst->ne[2] / src0->ne[2]; - float sf3 = (float)dst->ne[3] / src0->ne[3]; - - if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { - sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1); - sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1); - } + const float sf0 = (float)dst->ne[0] / src0->ne[0]; + const float sf1 = (float)dst->ne[1] / src0->ne[1]; + const float sf2 = (float)dst->ne[2] / src0->ne[2]; + const float sf3 = (float)dst->ne[3] / src0->ne[3]; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { (uint32_t)ggml_nelements(dst), 0, 0, - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], sf0, sf1, sf2, sf3, @@ -7501,60 +7443,117 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = ggml_get_op_params_f32(dst, 0); + float * op_params = (float *)dst->op_params; + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + op_params[0], 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - p.param1 = ggml_get_op_params_f32(dst, 0); - p.param2 = ggml_get_op_params_f32(dst, 1); + float * op_params = (float *)dst->op_params; + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + op_params[0], op_params[1], + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun); -} - -static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const int32_t s0 = ggml_get_op_params_i32(dst, 0); - const int32_t s1 = ggml_get_op_params_i32(dst, 1); - const int32_t s2 = ggml_get_op_params_i32(dst, 2); - const int32_t s3 = ggml_get_op_params_i32(dst, 3); - const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000); - const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000); - - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); - memcpy(&p.param1, &s01_packed, sizeof(float)); - memcpy(&p.param2, &s23_packed, sizeof(float)); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { @@ -7572,8 +7571,14 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const } } - vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { + ne, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -8885,7 +8890,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: - case GGML_OP_ROLL: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -9055,10 +9059,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_PAD: ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun); - break; - case GGML_OP_ROLL: - ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun); - break; case GGML_OP_CPY: case GGML_OP_CONT: @@ -9276,7 +9276,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: - case GGML_OP_ROLL: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -10249,6 +10248,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { return false; } + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 + if (op->src[0]->ne[3] != 1) { + return false; + } // It's straightforward to support different K/V dequant, but would // significantly increase the number of pipelines if (op->src[1]->type != op->src[2]->type) { @@ -10401,13 +10405,21 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_UPSCALE: + return op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_ACC: case GGML_OP_CONCAT: case GGML_OP_SCALE: case GGML_OP_PAD: - case GGML_OP_ROLL: case GGML_OP_DIAG_MASK_INF: + return true; case GGML_OP_SOFT_MAX: + // TODO: support batching + if (op->src[0]->ne[3] != 1) { + return false; + } + // TODO: support broadcast + // ref: https://github.com/ggml-org/llama.cpp/pull/14435 + return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1); case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ARGSORT: case GGML_OP_SUM: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp index d78c6190d93..6f607380df8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp @@ -3,7 +3,6 @@ layout (push_constant) uniform parameter { uint ne; uint a_offset; uint d_offset; - uint ne00; uint ne01; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; float sf0; float sf1; float sf2; float sf3; @@ -16,61 +15,6 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; -// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag -#define NEAREST 0 -#define BILINEAR 1 -#define ALIGN_CORNERS (1 << 8) - -layout (constant_id = 0) const uint scale_mode = 0; - -float fetch_nearest(uint i10, uint i11, uint i12, uint i13) { - const uint i00 = uint(i10 / p.sf0); - const uint i01 = uint(i11 / p.sf1); - const uint i02 = uint(i12 / p.sf2); - const uint i03 = uint(i13 / p.sf3); - - return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]; -} - -float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) { - const uint i02 = uint(i12 / p.sf2); - const uint i03 = uint(i13 / p.sf3); - const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02; - - const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00]; - const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00]; - const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00]; - const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00]; - - return - v00 * (1.0-d.x) * (1.0-d.y) + - v01 * d.x * (1.0-d.y) + - v10 * (1.0-d.x) * d.y + - v11 * d.x * d.y; -} - -float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) { - const ivec2 ne0 = ivec2(p.ne00, p.ne01); - - const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5; - const vec2 c0f = floor(c); - const vec2 d = c - c0f; - const ivec2 c0 = max(ivec2(c0f), 0); - const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1); - - return fetch_bilinear(c0, c1, d, i12, i13); -} - -float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) { - const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1); - const vec2 c0f = floor(c); - const vec2 d = c - c0f; - const ivec2 c0 = ivec2(c0f); - const ivec2 c1 = c0 + 1; - - return fetch_bilinear(c0, c1, d, i12, i13); -} - void main() { const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; @@ -83,18 +27,10 @@ void main() { const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; - float result; - switch (scale_mode) { - case NEAREST: - result = fetch_nearest(i10, i11, i12, i13); - break; - case BILINEAR: - result = interpolate_bilinear(i10, i11, i12, i13); - break; - case BILINEAR | ALIGN_CORNERS: - result = interpolate_bilinear_align_corners(i10, i11, i12, i13); - break; - } + const uint i00 = uint(i10 / p.sf0); + const uint i01 = uint(i11 / p.sf1); + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); - data_d[p.d_offset + idx] = D_TYPE(result); + data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 4658ad0c9c2..297a2a77119 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -644,8 +644,6 @@ void process_shaders() { string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); - string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); - for (auto &c : compiles) { c.wait(); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 814f01b7bc8..f1245c50c92 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -473,14 +473,6 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; } -const char * ggml_version(void) { - return GGML_VERSION; -} - -const char * ggml_commit(void) { - return GGML_COMMIT; -} - // // timing // @@ -3674,9 +3666,11 @@ static struct ggml_tensor * ggml_soft_max_impl( if (mask) { GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguous(mask)); - GGML_ASSERT(ggml_is_matrix(mask)); + GGML_ASSERT(ggml_is_3d(mask)); GGML_ASSERT(mask->ne[0] == a->ne[0]); GGML_ASSERT(mask->ne[1] >= a->ne[1]); + GGML_ASSERT(a->ne[2]%mask->ne[2] == 0); + GGML_ASSERT(a->ne[3]%mask->ne[3] == 0); } if (max_bias > 0.0f) { @@ -4697,13 +4691,17 @@ struct ggml_tensor * ggml_flash_attn_ext( GGML_ASSERT(ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) + GGML_ASSERT(q->ne[3] == k->ne[3]); + GGML_ASSERT(q->ne[3] == v->ne[3]); + if (mask) { GGML_ASSERT(ggml_is_contiguous(mask)); - GGML_ASSERT(mask->ne[2] == 1); - GGML_ASSERT(mask->ne[3] == 1); + GGML_ASSERT(mask->ne[2] == q->ne[3]); GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) && "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big"); //GGML_ASSERT(ggml_can_repeat_rows(mask, qk)); + + GGML_ASSERT(q->ne[3] % mask->ne[2] == 0); } if (max_bias > 0.0f) { From 1b3e06a400b3356b6fce528b98601f1257ac40f4 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 1 Jul 2025 03:32:56 -0500 Subject: [PATCH 0061/1348] vulkan: support softmax/FA batch and broadcast (llama/14449) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 51 ++++++++++--------- .../vulkan-shaders/flash_attn.comp | 12 +++-- .../vulkan-shaders/flash_attn_base.comp | 2 +- .../vulkan-shaders/flash_attn_cm1.comp | 12 +++-- .../vulkan-shaders/flash_attn_cm2.comp | 13 +++-- .../flash_attn_split_k_reduce.comp | 10 ++-- .../ggml-vulkan/vulkan-shaders/soft_max.comp | 24 +++++++-- 7 files changed, 80 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index b8e25ba2098..25f70127a62 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -633,6 +633,7 @@ struct vk_flash_attn_push_constants { uint32_t nev2; uint32_t nev3; uint32_t nem1; + uint32_t nem2; uint32_t nb01; uint32_t nb02; @@ -643,7 +644,6 @@ struct vk_flash_attn_push_constants { uint32_t nb21; uint32_t nb22; uint32_t nb23; - uint32_t nb31; float scale; float max_bias; @@ -658,6 +658,7 @@ struct vk_flash_attn_push_constants { uint32_t split_kv; uint32_t k_num; }; +static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128"); struct vk_op_push_constants { uint32_t KX; @@ -756,6 +757,14 @@ struct vk_op_rope_push_constants { struct vk_op_soft_max_push_constants { uint32_t KX; uint32_t KY; + uint32_t ne00; + uint32_t ne01; + uint32_t ne02; + uint32_t ne12; + uint32_t ne13; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; float scale; float max_bias; float m0; @@ -6040,7 +6049,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const uint32_t nem1 = mask ? mask->ne[1] : 0; - const uint32_t nbm1 = mask ? mask->nb[1] : 0; + const uint32_t nem2 = mask ? mask->ne[2] : 0; const uint32_t D = neq0; uint32_t N = neq1; @@ -6203,7 +6212,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // Try to use split_k when KV is large enough to be worth the overhead if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) { // Try to run two workgroups per SM. - split_k = ctx->device->shader_core_count * 2 / workgroups_y; + split_k = ctx->device->shader_core_count * 2 / (workgroups_y * workgroups_z); if (split_k > 1) { // Try to evenly split KV into split_k chunks, but it needs to be a multiple // of "align", so recompute split_k based on that. @@ -6213,9 +6222,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } } - // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1) - // and the per-row m and L values (ne1 rows). - const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0; + // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1) + // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows. + const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0; if (split_k_size > ctx->device->max_memory_allocation_size) { GGML_ABORT("Requested preallocation size is too large"); } @@ -6307,11 +6316,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, - nem1, + nem1, nem2, q_stride, (uint32_t)nbq2, (uint32_t)nbq3, k_stride, (uint32_t)nbk2, (uint32_t)nbk3, v_stride, (uint32_t)nbv2, (uint32_t)nbv3, - nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1, gqa_ratio, split_kv, split_k }; @@ -6334,13 +6342,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); ggml_vk_sync_buffers(subctx); - const std::array pc2 = { D, (uint32_t)ne1, split_k }; + const std::array pc2 = { D, (uint32_t)ne1, (uint32_t)ne3, split_k }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, { vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - pc2, { (uint32_t)ne1, 1, 1 }); + pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 }); } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { @@ -7666,7 +7674,13 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const uint32_t nrows_x = (uint32_t)ggml_nrows(src0); const uint32_t nrows_y = (uint32_t)src0->ne[1]; - const uint32_t n_head_kv = nrows_x/nrows_y; + const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u; + const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u; + const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u; + const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u; + const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u; + + const uint32_t n_head_kv = src0->ne[2]; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); @@ -7675,6 +7689,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, { ncols, src1 != nullptr ? nrows_y : (uint32_t)0, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], + ne12, ne13, + nb11, nb12, nb13, scale, max_bias, m0, m1, n_head_log2, @@ -10248,11 +10265,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { return false; } - // TODO: support broadcast - // ref: https://github.com/ggml-org/llama.cpp/pull/14435 - if (op->src[0]->ne[3] != 1) { - return false; - } // It's straightforward to support different K/V dequant, but would // significantly increase the number of pipelines if (op->src[1]->type != op->src[2]->type) { @@ -10413,13 +10425,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_DIAG_MASK_INF: return true; case GGML_OP_SOFT_MAX: - // TODO: support batching - if (op->src[0]->ne[3] != 1) { - return false; - } - // TODO: support broadcast - // ref: https://github.com/ggml-org/llama.cpp/pull/14435 - return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1); case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ARGSORT: case GGML_OP_SUM: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index ce230a8f7d9..6f80101d1c4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -99,6 +99,10 @@ void main() { uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2; uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2; #endif + uint32_t m_offset = 0; + if (p.nem2 != 1) { + m_offset = (iq3 % p.nem2) * p.nem1 * KV; + } [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { @@ -150,7 +154,7 @@ void main() { uint32_t c = (idx + tid) % Bc; uint32_t r = (idx + tid) / Bc; if (idx + tid < Bc * Br) { - masksh[c][r] = float(data_m[(i * Br + r) * m_stride + (j * Bc + c)]); + masksh[c][r] = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]); } } barrier(); @@ -277,7 +281,7 @@ void main() { // If there is split_k, then the split_k resolve shader does the final // division by L. Store the intermediate O value and per-row m and L values. if (p.k_num > 1) { - uint32_t o_offset = D * p.ne1 * split_k_index; + uint32_t o_offset = D * p.ne1 * (split_k_index + iq3 * p.k_num); [[unroll]] for (uint32_t r = 0; r < Br; ++r) { if (r < N) { @@ -289,7 +293,7 @@ void main() { } } - o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2; + o_offset = D * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; [[unroll]] for (uint32_t r = 0; r < Br; ++r) { if (r < N) { perElemOpStoreCol0(r, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N); @@ -311,7 +315,7 @@ void main() { } } - uint32_t o_offset = iq3*p.ne2*p.ne1; + uint32_t o_offset = iq3*p.ne2*p.ne1*D; if (p.gqa_ratio > 1) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp index 61d90e2d8ed..67b935cf57e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -24,6 +24,7 @@ layout (push_constant) uniform parameter { uint32_t nev2; uint32_t nev3; uint32_t nem1; + uint32_t nem2; uint32_t nb01; uint32_t nb02; @@ -34,7 +35,6 @@ layout (push_constant) uniform parameter { uint32_t nb21; uint32_t nb22; uint32_t nb23; - uint32_t nb31; float scale; float max_bias; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index da478be24fb..26fe50c7a81 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -123,6 +123,10 @@ void main() { uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2; uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2; #endif + uint32_t m_offset = 0; + if (p.nem2 != 1) { + m_offset = (iq3 % p.nem2) * p.nem1 * KV; + } [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { @@ -181,7 +185,7 @@ void main() { uint32_t c = (idx + tid) % Bc; uint32_t r = (idx + tid) / Bc; if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) { - sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[(i * Br + r) * m_stride + (j * Bc + c)])); + sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)])); } } barrier(); @@ -300,7 +304,7 @@ void main() { // If there is split_k, then the split_k resolve shader does the final // division by L. Store the intermediate O value and per-row m and L values. if (p.k_num > 1) { - uint32_t o_offset = D * p.ne1 * split_k_index; + uint32_t o_offset = D * p.ne1 * (split_k_index + iq3 * p.k_num); [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { if (tile_row(r) < N) { @@ -312,7 +316,7 @@ void main() { } } - o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2; + o_offset = D * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { if (tile_row(r) < N) { perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N); @@ -334,7 +338,7 @@ void main() { } } - uint32_t o_offset = iq3*p.ne2*p.ne1; + uint32_t o_offset = iq3*p.ne2*p.ne1*D; if (p.gqa_ratio > 1) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 6acf67a03a4..cf47bd53e28 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -130,6 +130,11 @@ void main() { coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2); } + uint32_t m_offset = 0; + if (p.nem2 != 1) { + m_offset = (iq3 % p.nem2) * p.nem1 * KV * 2 /*sizeof(float16_t)*/; + } + [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { @@ -155,7 +160,7 @@ void main() { coopmat mv; - coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); + coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); S += slopeMat*coopmat(mv); } @@ -229,10 +234,10 @@ void main() { if (p.k_num > 1) { coopmat O_D = coopmat(O); - uint32_t o_offset = D * p.ne1 * split_k_index; + uint32_t o_offset = D * p.ne1 * (split_k_index + iq3 * p.k_num); coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); - o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2; + o_offset = D * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N); coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N); return; @@ -250,7 +255,7 @@ void main() { O = Ldiag*O; - uint32_t o_offset = iq3*p.ne2*p.ne1; + uint32_t o_offset = iq3*p.ne2*p.ne1*D; coopmat O_D = coopmat(O); if (p.gqa_ratio > 1) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp index a7e3956854c..599cef072e9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp @@ -12,6 +12,7 @@ layout (binding = 1) writeonly buffer D {float data_d[];}; layout (push_constant) uniform parameter { uint D; uint N; + uint ne3; uint k_num; } p; @@ -19,13 +20,14 @@ void main() { // Each workgroup handles a row const uint n = gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; + const uint iq3 = gl_WorkGroupID.z; uint D = p.D; uint N = p.N; uint k_num = p.k_num; - uint l_offset = D * N * k_num + n; - uint m_offset = D * N * k_num + N + n; + uint l_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + n; + uint m_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + N + n; uint lm_stride = N * 2; // Compute the max m value for the row @@ -49,11 +51,11 @@ void main() { for (uint d = tid; d < D; d += BLOCK_SIZE) { float O = 0.0; [[unroll]] for (uint k = 0; k < k_num; ++k) { - uint o_offset = D * N * k + D * n + d; + uint o_offset = D * N * (k + iq3 * k_num) + D * n + d; float m = data_a[m_offset + k * lm_stride]; O += exp(m - m_max) * data_a[o_offset]; } O *= L; - data_d[D * n + d] = O; + data_d[iq3 * D * N + D * n + d] = O; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index 51fc2dc7ed4..5bcd3b1e3dd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -6,6 +6,14 @@ layout (push_constant) uniform parameter { uint KX; uint KY; + uint ne00; + uint ne01; + uint ne02; + uint ne12; + uint ne13; + uint nb11; + uint nb12; + uint nb13; float scale; float max_bias; float m0; @@ -31,7 +39,15 @@ shared FLOAT_TYPE vals[BLOCK_SIZE]; void soft_max(uint num_iters) { const uint tid = gl_LocalInvocationID.x; const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; - const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } if (rowx >= p.nrows_x) { return; @@ -41,7 +57,7 @@ void soft_max(uint num_iters) { // ALiBi if (p.max_bias > 0.0f) { - const uint h = rowx/p.KY; // head index + const uint h = (rowx / p.ne01) % p.ne02; // head index const float base = h < p.n_head_log2 ? p.m0 : p.m1; const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1; @@ -67,7 +83,7 @@ void soft_max(uint num_iters) { FLOAT_TYPE b = FLOAT_TYPE(0); if (p.KY > 0 && col < p.KX) { - b = data_b[rowy * p.KX + col]; + b = data_b[rowy_start + col]; } FLOAT_TYPE v = a * p.scale + slope * b; @@ -111,7 +127,7 @@ void soft_max(uint num_iters) { if (idx < DATA_CACHE_SIZE) { val = exp(data_cache[idx] - max_val); } else { - val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val); + val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val); } sum += val; if (idx < DATA_CACHE_SIZE) { From 70515ed728976d2eb64936417359ded81eaab2bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 2 Jul 2025 13:42:12 +0200 Subject: [PATCH 0062/1348] CUDA: broadcasting for FlashAttention mask (llama/14500) --- ggml/src/ggml-cuda/fattn-common.cuh | 5 ++++- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 12 ++++++++---- ggml/src/ggml-cuda/fattn-tile-f16.cu | 10 ++++++---- ggml/src/ggml-cuda/fattn-tile-f32.cu | 10 ++++++---- ggml/src/ggml-cuda/fattn-vec-f16.cuh | 8 +++++--- ggml/src/ggml-cuda/fattn-vec-f32.cuh | 9 ++++++--- ggml/src/ggml-cuda/fattn-wmma-f16.cu | 14 ++++++++------ 7 files changed, 43 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index cfab2b5ebac..075f14a49e9 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -32,7 +32,9 @@ typedef void (* fattn_kernel_t)( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -851,7 +853,8 @@ void launch_fattn( scale, max_bias, m0, m1, n_head_log2, logit_softcap, Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], K->ne[0], K->ne[1], K->ne[2], K->ne[3], - mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, + mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, + mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, Q->nb[1], Q->nb[2], Q->nb[3], nb11, nb12, nb13, nb21, nb22, nb23, diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index e230f6d494d..709589854f0 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1223,7 +1223,9 @@ static __global__ void flash_attn_ext_f16( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -1288,7 +1290,8 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); - const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr; + const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr : + (const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1); float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); @@ -1327,7 +1330,8 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); - const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr; + const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr : + (const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1); float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); @@ -1348,8 +1352,8 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); - GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 9283560d5c4..0c967f178e7 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -6,7 +6,7 @@ template // D == head size #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(nwarps*WARP_SIZE, 1) +__launch_bounds__(nwarps*WARP_SIZE, 2) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_tile_ext_f16( const char * __restrict__ Q, @@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f16( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -64,7 +66,7 @@ static __global__ void flash_attn_tile_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0); const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio)); const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; + const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); const int stride_KV2 = nb11 / sizeof(half2); @@ -288,8 +290,8 @@ static __global__ void flash_attn_tile_ext_f16( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 32673adb57f..124d5d3e891 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -6,7 +6,7 @@ template // D == head size #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(nwarps*WARP_SIZE, 1) +__launch_bounds__(nwarps*WARP_SIZE, 2) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_tile_ext_f32( const char * __restrict__ Q, @@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f32( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -58,8 +60,8 @@ static __global__ void flash_attn_tile_ext_f32( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); @@ -76,7 +78,7 @@ static __global__ void flash_attn_tile_ext_f32( const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0); const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio)); const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; + const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); const int stride_KV2 = nb11 / sizeof(half2); diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 35e649cb3c8..e78fb181919 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f16( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -68,7 +70,7 @@ static __global__ void flash_attn_vec_ext_f16( K += nb12*(blockIdx.z / gqa_ratio); V += nb22*(blockIdx.z / gqa_ratio); - const half * maskh = (const half *) mask + ne11*ic0; + const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); const half slopeh = __float2half(slopef); @@ -342,8 +344,8 @@ static __global__ void flash_attn_vec_ext_f16( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index 95396791779..c22baf41764 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f32( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -51,8 +53,8 @@ static __global__ void flash_attn_vec_ext_f32( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); @@ -79,7 +81,8 @@ static __global__ void flash_attn_vec_ext_f32( Q += nb02* blockIdx.z + nb01*ic0; K += nb12*(blockIdx.z / gqa_ratio); V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape - const half * maskh = (const half *) mask + ne11*ic0; + + const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index f3b794c3644..c95ca7b1f28 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -46,7 +46,9 @@ static __global__ void flash_attn_ext_f16( const int ne12, const int ne13, const int ne31, + const int ne32, const int nb31, + const int nb32, const int nb01, const int nb02, const int nb03, @@ -94,11 +96,11 @@ static __global__ void flash_attn_ext_f16( constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0); - const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0; - const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2); + const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0); + const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio)); + const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); + const half2 * mask2 = (const half2 *) maskh; const int stride_Q = nb01 / sizeof(float); const int stride_KV = nb11 / sizeof(half); @@ -440,7 +442,7 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); From fb5c4095ee439be18c3ba02a3919b12f775578cd Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 2 Jul 2025 20:34:24 +0800 Subject: [PATCH 0063/1348] CUDA: add softmax broadcast (llama/14475) * CUDA: add softmax broadcast * Pass by const ref * Review: Use blockDims for indexing, remove designated initializers * Add TODO for noncontigous input/output --- ggml/src/ggml-cuda/ggml-cuda.cu | 8 +-- ggml/src/ggml-cuda/softmax.cu | 119 ++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 186492be340..0fa7fb953d8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3329,13 +3329,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_DIAG_MASK_INF: return true; case GGML_OP_SOFT_MAX: - // TODO: support batching - if (op->src[0]->ne[3] != 1) { - return false; - } - // TODO: support broadcast - // ref: https://github.com/ggml-org/llama.cpp/pull/14435 - return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1); + return true; case GGML_OP_SOFT_MAX_BACK: { float max_bias = 0.0f; memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float)); diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index aac6e099988..e0eb921d85d 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -13,6 +13,29 @@ __device__ float __forceinline__ t2f32(half val) { return __half2float(val); } +struct soft_max_params { + + int64_t nheads; + uint32_t n_head_log2; + int64_t ncols; + int64_t nrows_x; + int64_t nrows_y; + int64_t ne00; + int64_t ne01; + int64_t ne02; + int64_t ne03; + int64_t nb11; + int64_t nb12; + int64_t nb13; + + int64_t ne12; + int64_t ne13; + float scale; + float max_bias; + float m0; + float m1; +}; + // When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled. // As we want to keep pragma unroll for all other cases we supress the clang transformation warning here. #ifdef __clang__ @@ -21,16 +44,24 @@ __device__ float __forceinline__ t2f32(half val) { #endif // __clang__ template static __global__ void soft_max_f32( - const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, - const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { - const int ncols = ncols_template == 0 ? ncols_par : ncols_template; + const float * x, const T * mask, float * dst, const soft_max_params p) { + const int ncols = ncols_template == 0 ? p.ncols : ncols_template; const int tid = threadIdx.x; - const int rowx = blockIdx.x; - const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension + + const int64_t i03 = blockIdx.z; + const int64_t i02 = blockIdx.y; + const int64_t i01 = blockIdx.x; + + //TODO: noncontigous inputs/outputs + const int rowx = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; + + const int64_t i11 = i01; + const int64_t i12 = i02 % p.ne12; + const int64_t i13 = i03 % p.ne13; x += int64_t(rowx)*ncols; - mask += int64_t(rowy)*ncols * (mask != nullptr); + mask += (i11*p.nb11 + i12*p.nb12 + i13*p.nb13) / sizeof(T) * (mask != nullptr); dst += int64_t(rowx)*ncols; const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; @@ -38,7 +69,7 @@ static __global__ void soft_max_f32( const int warp_id = threadIdx.x / WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE; - const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1); + const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1); extern __shared__ float data_soft_max_f32[]; float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication @@ -55,7 +86,7 @@ static __global__ void soft_max_f32( break; } - const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f); + const float val = x[col]*p.scale + (mask ? slope*t2f32(mask[col]) : 0.0f); vals[col] = val; max_val = max(max_val, val); @@ -151,63 +182,60 @@ static __global__ void soft_max_back_f32( } template -static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { +static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const soft_max_params & params, cudaStream_t stream) { int nth = WARP_SIZE; + const int64_t ncols_x = params.ncols; + while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; const dim3 block_dims(nth, 1, 1); - const dim3 block_nums(nrows_x, 1, 1); + const dim3 block_nums(params.ne01, params.ne02, params.ne03); const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); - const uint32_t n_head = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); // FIXME: this limit could be raised by ~2-4x on Ampere or newer if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { switch (ncols_x) { case 32: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 64: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 128: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 256: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 512: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 1024: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 2048: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; case 4096: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; default: soft_max_f32<<>> - (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + (x, mask, dst, params); break; } } else { const size_t nbytes_shared_low = WARP_SIZE*sizeof(float); - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>>(x, mask, dst, params); } } @@ -235,10 +263,11 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional - const int64_t ne00 = src0->ne[0]; const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_y = src0->ne[1]; + const int64_t ne00 = src0->ne[0]; + float scale = 1.0f; float max_bias = 0.0f; @@ -247,10 +276,44 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); + const int64_t nb11 = src1 ? src1->nb[1] : 1; + const int64_t nb12 = src1 ? src1->nb[2] : 1; + const int64_t nb13 = src1 ? src1->nb[3] : 1; + + const int64_t ne12 = src1 ? src1->ne[2] : 1; + const int64_t ne13 = src1 ? src1->ne[3] : 1; + + const uint32_t n_head = src0->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + + soft_max_params params = {}; + params.nheads = src0->ne[2]; + params.n_head_log2 = n_head_log2; + params.ncols = ne00; + params.nrows_x = nrows_x; + params.nrows_y = nrows_y; + params.ne00 = src0->ne[0]; + params.ne01 = src0->ne[1]; + params.ne02 = src0->ne[2]; + params.ne03 = src0->ne[3]; + params.nb11 = nb11; + params.nb12 = nb12; + params.nb13 = nb13; + params.ne12 = ne12; + params.ne13 = ne13; + params.scale = scale; + params.max_bias = max_bias; + params.m0 = m0; + params.m1 = m1; + if (use_f16) { - soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); + soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, params, stream); } else { - soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); + soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, params, stream); } } From d2d120c25628d688a2170ebb872723ce83f396e3 Mon Sep 17 00:00:00 2001 From: compilade Date: Wed, 2 Jul 2025 13:10:24 -0400 Subject: [PATCH 0064/1348] llama : initial Mamba-2 support (llama/9126) * llama : initial Mamba-2 support * ggml : SIMD ggml_ssm_scan for Mamba-2 * ggml : improve ggml_mul speed when masking recurrent states * llama : support running Mamba-Codestral-7B-v0.1 * llama : fix Mamba-2 conv state saving * ggml : make the ggml_mul fast broadcast path more consistently formatted * llama : remove unused variable * llama : add missing break * convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present The tokenzier.json of Mamba-Codestral-7B-v0.1 otherwise requires workarounds to work correctly. * llama : avoid redundant state copy for Mamba 1 and 2 * metal : attempt to adapt SSM_SCAN for Mamba-2 * metal : fix SSM_SCAN pipeline scope * metal : use log and exp instead of log1pf and expf in SSM_SCAN * metal : remove unused arguments for SSM_SCAN The max index is 31, so trimming the arguments is necessary. * metal : add back n_seqs to SSM_SCAN args Whoops, this is needed for the offset in the concatenated output. * metal : fix SSM_SCAN state head offset * metal : fix wrong number of tokens per sequence in SSM_SCAN * ggml : remove unused fast broadcast path in GGML_MUL This was initially added because states were masked with ggml_mul, but this is no longer done and so this "optimisation" is no longer necessary, or at least not worth the additional code complexity. * ggml : avoid multiply by D in GGML_OP_SSM_SCAN This makes the weight buft detection in src/llama.cpp simpler. * convert : transpose Mamba-2 A, D and reshape SSM_NORM This breaks existing conversions of Mamba-2 models to avoid some reshapes. Not sure if it's a good idea, but it makes the graph slightly cleaner. * llama : more appropriate SSM_SCAN and SSM_CONV buft support checks * convert : fix flake8 lint * metal : fix confusion between ; and , * metal : add missing args for nb references in ssm_scan_f32_group * metal : single-user mamba2 inference works * kv-cache : remove const_cast when setting inputs for s_copy And also fix multi-user inference for recurrent models by using cell_id instead of i as the kv cell index when populating s_copy. * convert : avoid AutoConfig for Mamba and Mamba2 hparams * kv-cache : allow context shift for recurrent models * graph : fix recurrent state copies when avoiding copies Works, but using lambda functions might not be that clean. * ggml : fix mamba2 ssm scan when compiled with SVE * ggml-cpu : reorder SVE FMA for consistency with other SIMD arches * cuda : implement ssm scan for Mamba2 There is still room for improvement, but it works! * cuda : adapt Mamba1 ssm scan to shape changes from Mamba2 * mamba : fix mismatched new and delete size for llm_build_mamba Subclasses of llm_graph_context cannot have extra fields, because the called destructor is not the one from the subclass. This otherwise would cause problems when runnning Mamba-(1|2) inference when compiled -DGGML_SANITIZE_ADDRESS=ON * cuda : graceful fallback for Mamba-1 models with weird embd size --- ggml/include/ggml.h | 3 +- ggml/src/ggml-cpu/ops.cpp | 278 +++++++++++++++++--------- ggml/src/ggml-cpu/simd-mappings.h | 2 +- ggml/src/ggml-cpu/vec.cpp | 18 +- ggml/src/ggml-cpu/vec.h | 18 +- ggml/src/ggml-cuda/ggml-cuda.cu | 17 +- ggml/src/ggml-cuda/ssm-scan.cu | 231 ++++++++++++++++----- ggml/src/ggml-metal/ggml-metal-impl.h | 11 +- ggml/src/ggml-metal/ggml-metal.m | 93 ++++++--- ggml/src/ggml-metal/ggml-metal.metal | 119 ++++++++--- ggml/src/ggml.c | 49 +++-- 11 files changed, 593 insertions(+), 246 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a92495dbb00..fbe072a3e0c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2028,7 +2028,8 @@ extern "C" { struct ggml_tensor * dt, struct ggml_tensor * A, struct ggml_tensor * B, - struct ggml_tensor * C); + struct ggml_tensor * C, + struct ggml_tensor * ids); // partition into non-overlapping windows with padding if needed // example: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 2ae0721e9d1..2f153419542 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8337,120 +8337,210 @@ void ggml_compute_forward_ssm_conv( static void ggml_compute_forward_ssm_scan_f32( const ggml_compute_params * params, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // s - const ggml_tensor * src1 = dst->src[1]; // x - const ggml_tensor * src2 = dst->src[2]; // dt - const ggml_tensor * src3 = dst->src[3]; // A - const ggml_tensor * src4 = dst->src[4]; // B - const ggml_tensor * src5 = dst->src[5]; // C + const ggml_tensor * src0 = dst->src[0]; // s {d_state, dim, n_head, n_seqs+} + const ggml_tensor * src1 = dst->src[1]; // x {dim, n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src2 = dst->src[2]; // dt {n_head, n_seq_tokens, n_seqs} + const ggml_tensor * src3 = dst->src[3]; // A {d_state, n_head} or {1, n_head} + const ggml_tensor * src4 = dst->src[4]; // B {d_state, n_group, n_seq_tokens, n_seqs} + const ggml_tensor * src5 = dst->src[5]; // C {d_state, n_group, n_seq_tokens, n_seqs} + const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs} const int ith = params->ith; const int nth = params->nth; - const int64_t nc = src0->ne[0]; // d_state - const int64_t nr = src0->ne[1]; // d_inner - const int64_t n_t = src1->ne[1]; // number of tokens per sequence - const int64_t n_s = src0->ne[2]; // number of sequences in the batch + const int64_t nc = src0->ne[0]; // d_state + const int64_t nr = src0->ne[1]; // dim + const int64_t nh = src1->ne[1]; // n_head + const int64_t ng = src4->ne[1]; + const int64_t nt = src1->ne[2]; // number of tokens per sequence + const int64_t ns = src1->ne[3]; // number of sequences in the batch - GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); + // can't use ggml_nbytes because src1 is not necessarily contiguous + const int64_t s_off = ggml_nelements(src1) * ggml_element_size(src1); + + GGML_ASSERT(ggml_nelements(src1) + nc*nr*nh*ns == ggml_nelements(dst)); GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src1->nb[0] == sizeof(float)); GGML_ASSERT(src2->nb[0] == sizeof(float)); GGML_ASSERT(src3->nb[0] == sizeof(float)); GGML_ASSERT(src4->nb[0] == sizeof(float)); GGML_ASSERT(src5->nb[0] == sizeof(float)); - // required for the dot product between s and C - GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - // required for per-sequence offsets for states - GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); - // required to get correct offset for state destination (i.e. src1->nb[3]) - GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float)); + GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); + // allows optimizing the modulo since n_group should be a power of 2 + GGML_ASSERT((ng & -ng) == ng); - // rows per thread - const int dr = (nr + nth - 1)/nth; + // heads per thread + const int dh = (nh + nth - 1)/nth; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - const int ir = ir1 - ir0; + // head range for this thread + const int ih0 = dh*ith; + const int ih1 = MIN(ih0 + dh, nh); + + const int32_t * ids = (const int32_t *) src6->data; + + for (int i3 = 0; i3 < ns; ++i3) { + const float * s0 = (const float *) ((const char *) src0->data + ids[i3]*(src0->nb[3])); // {d_state, dim, nh, ns} + float * s = ( float *) (( char *) dst->data + i3*(src0->nb[3]) + s_off); // {d_state, dim, nh, ns} + + for (int i2 = 0; i2 < nt; ++i2) { + const float * x = (const float *) ((const char *) src1->data + i2*(src1->nb[2]) + i3*(src1->nb[3])); // {dim, nh, nt, ns} + const float * dt = (const float *) ((const char *) src2->data + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {nh, nt, ns} + const float * A = (const float *) ((const char *) src3->data); // {d_state, nh} or {1, nh} + const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[2]) + i3*(src4->nb[3])); // {d_state, ng, nt, ns} + const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[2]) + i3*(src5->nb[3])); // {d_state, ng, nt, ns} + float * y = ( float *) (( char *) dst->data + i2*(nh*nr*sizeof(float)) + i3*(nt*nh*nr*sizeof(float))); // {dim, nh, nt, ns} + + if (src3->ne[0] == 1) { + // Mamba-2 has a scalar decay factor per head; dA can be outside the state-wise loop + + // n_head + for (int h = ih0; h < ih1; ++h) { + // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16 + const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h]; + const float dA = expf(dt_soft_plus * A[h]); + + // dim + for (int i1 = 0; i1 < nr; ++i1) { + const int ii = i1 + h*nr; + const float x_dt = x[ii] * dt_soft_plus; + float sumf = 0.0f; +#if defined(GGML_SIMD) + #if defined(__ARM_FEATURE_SVE) + const int ggml_f32_epr = svcntw(); + const int ggml_f32_step = 1 * ggml_f32_epr; + + const int np = (nc & ~(ggml_f32_step - 1)); + + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + + GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA); + GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt); + + for (int i = 0; i < np; i += ggml_f32_step) { + // TODO: maybe unroll more? + for (int j = 0; j < 1; j++) { + GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc); + GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc); + GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc); + + t0 = GGML_F32_VEC_MUL(t0, adA); + t1 = GGML_F32_VEC_MUL(t1, axdt); + + t0 = GGML_F32_VEC_ADD(t0, t1); + + sum = GGML_F32_VEC_FMA(sum, t0, t2); + + GGML_F32_VEC_STORE(s + i + j*ggml_f32_epr + ii*nc, t0); + } + } + + sumf = GGML_F32xt_REDUCE_ONE(sum); + #else + const int np = (nc & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + + GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA); + GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt); + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + GGML_F32_VEC az[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc); + ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc); + az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc); + + ax[j] = GGML_F32_VEC_MUL(ax[j], adA); + ay[j] = GGML_F32_VEC_MUL(ay[j], axdt); + + ax[j] = GGML_F32_VEC_ADD(ax[j], ay[j]); - #ifdef __ARM_FEATURE_SVE - for (int i3 = 0; i3 < n_s; ++i3) { - for (int i2 = 0; i2 < n_t; ++i2) { - const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} - const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} - const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} - const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} - float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} - - // use the output as the source for the next token-wise iterations - if (i2 > 0) { s0 = s; } - - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; - float x_dt = x[i1] * dt_soft_plus; - svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt); - svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus); - svfloat32_t r1_vector = GGML_F32_VEC_ZERO; - - for (int64_t k = 0; k < nc; k += svcntw()) { - svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]); - svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]); - svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]); - svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]); - - svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA); - t1 = exp_ps_sve(svptrue_b32(), t1); - svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB); - - vs0 = GGML_F32_VEC_FMA(vs0, t1, t2); - r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector); - - GGML_F32_VEC_STORE(&s[i1*nc + k], vs0); + sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], az[j]); + + GGML_F32_VEC_STORE(s + i + j*GGML_F32_EPR + ii*nc, ax[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F32_VEC_REDUCE(sumf, sum); + #endif +#else + const int np = 0; +#endif + // d_state + for (int i0 = np; i0 < nc; ++i0) { + const int i = i0 + ii*nc; + const int ig = i0 + (h & (ng - 1))*nc; + // state = prev_state * dA + dB * x + const float state = (s0[i] * dA) + (B[ig] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state * C[ig]; + s[i] = state; + } + y[ii] = sumf; } - y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector); } - } - } - #else - for (int i3 = 0; i3 < n_s; ++i3) { - for (int i2 = 0; i2 < n_t; ++i2) { - const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} - const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} - const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} - const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} - float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} - - // use the output as the source for the next token-wise iterations - if (i2 > 0) { s0 = s; } - - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 - float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; - float x_dt = x[i1] * dt_soft_plus; - float sumf = 0.0f; - // d_state - for (int i0 = 0; i0 < nc; ++i0) { - int i = i0 + i1*nc; - // state = prev_state * dA + dB * x - float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); - // y = rowwise_dotprod(state, C) - sumf += state * C[i0]; - s[i] = state; + } else { + // Mamba-1 has an element-wise decay factor for the states + + // n_head + for (int h = ih0; h < ih1; ++h) { + // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16 + const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h]; + + // dim + for (int i1 = 0; i1 < nr; ++i1) { + const int ii = i1 + h*nr; + const float x_dt = x[ii] * dt_soft_plus; +#if defined(__ARM_FEATURE_SVE) + svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt); + svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus); + svfloat32_t r1_vector = GGML_F32_VEC_ZERO; + + // d_state + // TODO: what happens when (d_state % svcntw()) != 0? + for (int64_t k = 0; k < nc; k += svcntw()) { + svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]); + svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]); + svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]); + svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]); + + svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA); + t1 = exp_ps_sve(svptrue_b32(), t1); + svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB); + + vs0 = GGML_F32_VEC_FMA(t2, vs0, t1); + r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector); + + GGML_F32_VEC_STORE(&s[ii*nc + k], vs0); + } + y[ii] = GGML_F32xt_REDUCE_ONE(r1_vector); +#else + float sumf = 0.0f; + // NOTE: can't really use GGML_SIMD here because d_state is usually 16 + // and also because expf is used within the loop. + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + const int i = i0 + ii*nc; + const int ig = i0 + (h & (ng - 1))*nc; + // state = prev_state * dA + dB * x + const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state * C[ig]; + s[i] = state; + } + y[ii] = sumf; +#endif } - y[i1] = sumf; } } + // use the output as the source when it's not the first token-wise iteration + s0 = s; } - #endif + } } void ggml_compute_forward_ssm_scan( diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index b68ac0dd68b..b4ad68c9fd6 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__) #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b) #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c) +#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a) #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__) #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b) #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__) diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index ed5d7aefc35..a8156011eba 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G for (int i = 0; i < np; i += ggml_f32_step) { ax1 = GGML_F32_VEC_LOAD(x + i); ay1 = GGML_F32_VEC_LOAD(y + i); - sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1); + sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); - sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2); + sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2); ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); - sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3); + sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3); ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); - sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4); + sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4); ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); - sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5); + sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5); ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); - sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6); + sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6); ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); - sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7); + sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7); ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); - sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8); + sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8); } // leftovers // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop @@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G for (int i = np; i < np2; i += ggml_f32_epr) { ax1 = GGML_F32_VEC_LOAD(x + i); ay1 = GGML_F32_VEC_LOAD(y + i); - sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1); + sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); } // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only if (np2 < n) { diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index d5507d75646..c432c990818 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const ax1 = GGML_F32_VEC_LOAD(x + i); ay1 = GGML_F32_VEC_LOAD(y + i); - ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1); + ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); GGML_F32_VEC_STORE(y + i, ay1); ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); - ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2); + ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx); GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); - ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3); + ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx); GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3); ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); - ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4); + ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx); GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4); ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); - ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5); + ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx); GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5); ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); - ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6); + ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx); GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6); ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); - ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7); + ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx); GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7); ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); - ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8); + ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx); GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8); } @@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const for (int i = np; i < np2; i += ggml_f32_epr) { ax1 = GGML_F32_VEC_LOAD(x + i); ay1 = GGML_F32_VEC_LOAD(y + i); - ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1); + ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); GGML_F32_VEC_STORE(y + i, ay1); } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 0fa7fb953d8..121e83641b2 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3321,9 +3321,22 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_LOG: - case GGML_OP_SSM_SCAN: - case GGML_OP_SSM_CONV: return true; + case GGML_OP_SSM_SCAN: { + if (op->src[3]->ne[0] == 1) { + // Mamba2 + // (kernel only supports d_state == 128 && d_head % 16 == 0) + return op->src[0]->ne[0] == 128 && op->src[0]->ne[1] % 16 == 0; + } else { + // Mamba + // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1) + return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1; + } + } + case GGML_OP_SSM_CONV: { + // assumes d_inner % threads == 0 + return op->src[0]->ne[1] % 128 == 0; + } case GGML_OP_CONT: return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index 2d34b836054..dc3b1a9a8cb 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -4,16 +4,15 @@ template __global__ void __launch_bounds__(splitD, 2) ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2, const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5, - const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2, - const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1, - const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2, - float * __restrict__ dst, const int64_t L) { - GGML_UNUSED(src1_nb0); - GGML_UNUSED(src2_nb0); + const int32_t * __restrict__ src6, float * __restrict__ dst, + const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3, + const int src2_nb1, const int src2_nb2, const int src3_nb1, + const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3, + const int64_t s_off, const int64_t d_inner, const int64_t L) { constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - const int bidx = blockIdx.x; // split along B - const int bidy = blockIdx.y; // split along D + const int bidx = blockIdx.x; // split along B (sequences) + const int bidy = blockIdx.y; // split along D (d_inner) const int tid = threadIdx.x; const int wid = tid / 32; const int wtid = tid % 32; @@ -24,23 +23,23 @@ __global__ void __launch_bounds__(splitD, 2) float * smem_A = smem; float * smem_s0 = smem_A + splitD * stride_sA; - const float * s0_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1); - const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float)); + const float * s0_block = (const float *) ((const char *) src0 + src6[bidx] * src0_nb3 + bidy * splitD * src0_nb2); + const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb3) + bidy * splitD * sizeof(float)); const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float)); const float * A_block = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1); - const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb2)); - const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb2)); - float * y_block = (float *) ((char *) dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float)); - float * s_block = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1); + const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb3)); + const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb3)); + float * y_block = (float *) ((char *) dst + (bidx * d_inner * L * sizeof(float)) + bidy * splitD * sizeof(float)); + float * s_block = (float *) ((char *) dst + s_off + bidx * src0_nb3 + bidy * splitD * src0_nb2); - const int stride_s0 = src0_nb1 / sizeof(float); - const int stride_x = src1_nb1 / sizeof(float); + const int stride_s0 = src0_nb2 / sizeof(float); + const int stride_x = src1_nb2 / sizeof(float); const int stride_dt = src2_nb1 / sizeof(float); const int stride_A = src3_nb1 / sizeof(float); - const int stride_B = src4_nb1 / sizeof(float); - const int stride_C = src5_nb1 / sizeof(float); + const int stride_B = src4_nb2 / sizeof(float); + const int stride_C = src5_nb2 / sizeof(float); const int stride_s = stride_s0; - const int stride_y = stride_x; + const int stride_y = d_inner; // can N not be 16? for example 32? if (N == 16) { @@ -84,24 +83,156 @@ __global__ void __launch_bounds__(splitD, 2) } } +// assumes as many threads as d_state +template +__global__ void __launch_bounds__(d_state, 1) + ssm_scan_f32_group( + const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2, + const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5, + const int32_t * __restrict__ src6, float * __restrict__ dst, + const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3, + const int src2_nb1, const int src2_nb2, const int src3_nb1, + const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3, + const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok) { + + const int head_idx = (blockIdx.x * splitH) / d_head; + const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float); + const int seq_idx = blockIdx.y; + + const int group_off = (head_idx & (n_group - 1)) * d_state * sizeof(float); + + const float * s0_block = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state); + const float * x_block = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + blockIdx.x * splitH * sizeof(float)); + const float * dt_block = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float)); + const float * A_block = (const float *) ((const char *) src3 + head_idx * src3_nb1); + const float * B_block = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off)); + const float * C_block = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off)); + float * y_block = dst + (seq_idx * n_tok * n_head * d_head) + blockIdx.x * splitH; + float * s_block = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state); + + // strides across n_seq_tokens + const int stride_x = src1_nb2 / sizeof(float); + const int stride_dt = src2_nb1 / sizeof(float); + const int stride_B = src4_nb2 / sizeof(float); + const int stride_C = src5_nb2 / sizeof(float); + const int stride_y = n_head * d_head; + + float state[splitH]; + // for the parallel accumulation + __shared__ float stateC[splitH * d_state]; + +#pragma unroll + for (int j = 0; j < splitH; j++) { + state[j] = s0_block[j * d_state + threadIdx.x]; + } + + for (int64_t i = 0; i < n_tok; i++) { + // TODO: only calculate dA and dt_soft_plus once per head instead of every splitH head elements + // TODO: only calculate B and C once per head group + // NOTE: dt_soft_plus, dA and x_dt have the same value across threads here. + float dt_soft_plus = dt_block[i * stride_dt]; + if (dt_soft_plus <= 20.0f) { + dt_soft_plus = log1pf(expf(dt_soft_plus)); + } + const float dA = expf(dt_soft_plus * A_block[0]); + const float B = B_block[i * stride_B + threadIdx.x]; + const float C = C_block[i * stride_C + threadIdx.x]; + + // across d_head +#pragma unroll + for (int j = 0; j < splitH; j++) { + const float x_dt = x_block[i * stride_x + j] * dt_soft_plus; + + state[j] = (state[j] * dA) + (B * x_dt); + + stateC[j * d_state + threadIdx.x] = state[j] * C; + } + + __syncthreads(); + + // parallel accumulation for stateC + // TODO: simplify + { + static_assert((d_state & -d_state) == d_state, "the state size has to be a power of 2"); + static_assert((splitH & -splitH) == splitH, "splitH has to be a power of 2"); + + // reduce until w matches the warp size + // TODO: does this work even when the physical warp size is 64? +#pragma unroll + for (int w = d_state; w > WARP_SIZE; w >>= 1) { + // (assuming there are d_state threads) +#pragma unroll + for (int j = 0; j < ((w >> 1) * splitH + d_state - 1) / d_state; j++) { + // TODO: check for bank conflicts + const int k = (threadIdx.x % (w >> 1)) + (d_state * (threadIdx.x / (w >> 1))) + j * d_state * (d_state / (w >> 1)); + stateC[k] += stateC[k + (w >> 1)]; + + } + __syncthreads(); + } + + static_assert(splitH >= d_state / WARP_SIZE); + +#pragma unroll + for (int j = 0; j < splitH / (d_state / WARP_SIZE); j++) { + float y = stateC[(threadIdx.x % WARP_SIZE) + d_state * (threadIdx.x / WARP_SIZE) + j * d_state * (d_state / WARP_SIZE)]; + y = warp_reduce_sum(y); + + // store the above accumulations + if (threadIdx.x % WARP_SIZE == 0) { + const int k = threadIdx.x / WARP_SIZE + j * (d_state / WARP_SIZE); + y_block[i * stride_y + k] = y; + } + } + } + } + + // write back the state +#pragma unroll + for (int j = 0; j < splitH; j++) { + s_block[j * d_state + threadIdx.x] = state[j]; + } +} + static void ssm_scan_f32_cuda(const float * src0, const float * src1, const float * src2, const float * src3, - const float * src4, const float * src5, const int src0_nb1, const int src0_nb2, - const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3, - const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1, - const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2, - float * dst, const int64_t N, const int64_t D, const int64_t L, const int64_t B, + const float * src4, const float * src5, const int32_t * src6, float * dst, + const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3, const int src2_nb1, + const int src2_nb2, const int src3_nb1, const int src4_nb2, const int src4_nb3, const int src5_nb2, + const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim, + const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq, cudaStream_t stream) { const int threads = 128; - // todo: consider D cannot be divided,does this situation exist? - GGML_ASSERT(D % threads == 0); - const dim3 blocks(B, (D + threads - 1) / threads, 1); - const int smem_size = (threads * (N + 1) * 2) * sizeof(float); - if (N == 16) { - ssm_scan_f32<128, 16><<>>( - src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0, - src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, L); + // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition! + if (src3_nb1 == sizeof(float)) { + // Mamba-2 + if (d_state == 128) { + GGML_ASSERT(d_state % threads == 0); + // NOTE: can be any power of two between 4 and 64 + const int splitH = 16; + GGML_ASSERT(head_dim % splitH == 0); + const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1); + ssm_scan_f32_group<16, 128><<>>( + src0, src1, src2, src3, src4, src5, src6, dst, + src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, + src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); + } else { + GGML_ABORT("doesn't support d_state!=128."); + } } else { - GGML_ABORT("doesn't support N!=16."); + // Mamba-1 + GGML_ASSERT(n_head % threads == 0); + GGML_ASSERT(head_dim == 1); + GGML_ASSERT(n_group == 1); + const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1); + const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float); + if (d_state == 16) { + ssm_scan_f32<128, 16><<>>( + src0, src1, src2, src3, src4, src5, src6, dst, + src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, + src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); + } else { + GGML_ABORT("doesn't support d_state!=16."); + } } } @@ -112,30 +243,25 @@ void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const struct ggml_tensor * src3 = dst->src[3]; // A const struct ggml_tensor * src4 = dst->src[4]; // B const struct ggml_tensor * src5 = dst->src[5]; // C - - // const int64_t d_state = src0->ne[0]; - // const int64_t d_inner = src0->ne[1]; - // const int64_t l = src1->ne[1]; - // const int64_t b = src0->ne[2]; + const struct ggml_tensor * src6 = dst->src[6]; // ids const int64_t nc = src0->ne[0]; // d_state - const int64_t nr = src0->ne[1]; // d_inner - const int64_t n_t = src1->ne[1]; // number of tokens per sequence - const int64_t n_s = src0->ne[2]; // number of sequences in the batch + const int64_t nr = src0->ne[1]; // head_dim or 1 + const int64_t nh = src1->ne[1]; // n_head + const int64_t ng = src4->ne[1]; // n_group + const int64_t n_t = src1->ne[2]; // number of tokens per sequence + const int64_t n_s = src1->ne[3]; // number of sequences in the batch + + const int64_t s_off = ggml_nelements(src1) * sizeof(float); - GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); + GGML_ASSERT(ggml_nelements(src1) + nc*nr*nh*n_s == ggml_nelements(dst)); GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src1->nb[0] == sizeof(float)); GGML_ASSERT(src2->nb[0] == sizeof(float)); GGML_ASSERT(src3->nb[0] == sizeof(float)); GGML_ASSERT(src4->nb[0] == sizeof(float)); GGML_ASSERT(src5->nb[0] == sizeof(float)); - // required for the dot product between s and C - GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float)); - // required for per-sequence offsets for states - GGML_ASSERT(src0->nb[2] == src0->ne[0] * src0->ne[1] * sizeof(float)); - // required to get correct offset for state destination (i.e. src1->nb[3]) - GGML_ASSERT(src1->nb[3] == src1->ne[0] * src1->ne[1] * src1->ne[2] * sizeof(float)); + GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); const float * src0_d = (const float *) src0->data; const float * src1_d = (const float *) src1->data; @@ -143,13 +269,16 @@ void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const float * src3_d = (const float *) src3->data; const float * src4_d = (const float *) src4->data; const float * src5_d = (const float *) src5->data; + const int32_t * src6_d = (const int32_t *) src6->data; float * dst_d = (float *) dst->data; cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src6->type == GGML_TYPE_I32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src0->nb[1], src0->nb[2], src1->nb[0], - src1->nb[1], src1->nb[2], src1->nb[3], src2->nb[0], src2->nb[1], src2->nb[2], src3->nb[1], - src4->nb[1], src4->nb[2], src5->nb[1], src5->nb[2], dst_d, nc, nr, n_t, n_s, stream); + ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src6_d, dst_d, + src0->nb[2], src0->nb[3], src1->nb[2], src1->nb[3], src2->nb[1], src2->nb[2], + src3->nb[1], src4->nb[2], src4->nb[3], src5->nb[2], src5->nb[3], + s_off, nc, nr, nh, ng, n_t, n_s, stream); } diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 8c2a9719274..adb65c0e7b7 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -513,26 +513,25 @@ typedef struct { typedef struct { int64_t d_state; int64_t d_inner; + int64_t n_head; + int64_t n_group; int64_t n_seq_tokens; int64_t n_seqs; - uint64_t nb00; uint64_t nb01; uint64_t nb02; - uint64_t nb10; + uint64_t nb03; uint64_t nb11; uint64_t nb12; uint64_t nb13; - uint64_t nb20; uint64_t nb21; uint64_t nb22; - uint64_t nb30; uint64_t nb31; - uint64_t nb40; uint64_t nb41; uint64_t nb42; - uint64_t nb50; + uint64_t nb43; uint64_t nb51; uint64_t nb52; + uint64_t nb53; } ggml_metal_kargs_ssm_scan; typedef struct { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 1a0968ed4d2..de40430ef35 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -217,6 +217,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_NORM, GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, + GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, @@ -1196,6 +1197,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); @@ -2809,71 +2811,91 @@ static bool ggml_metal_encode_node( struct ggml_tensor * src3 = node->src[3]; struct ggml_tensor * src4 = node->src[4]; struct ggml_tensor * src5 = node->src[5]; + struct ggml_tensor * src6 = node->src[6]; GGML_ASSERT(src3); GGML_ASSERT(src4); GGML_ASSERT(src5); + GGML_ASSERT(src6); size_t offs_src3 = 0; size_t offs_src4 = 0; size_t offs_src5 = 0; + size_t offs_src6 = 0; id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; id id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil; id id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil; + id id_src6 = src6 ? ggml_metal_get_buffer(src6, &offs_src6) : nil; - const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30); + const int64_t ne30 = src3->ne[0]; const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31); - const uint64_t nb30 = src3->nb[0]; + const uint64_t nb30 = src3->nb[0]; GGML_UNUSED(nb30); const uint64_t nb31 = src3->nb[1]; const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40); - const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41); + const int64_t ne41 = src4->ne[1]; const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42); + const int64_t ne43 = src4->ne[3]; GGML_UNUSED(ne43); - const uint64_t nb40 = src4->nb[0]; + const uint64_t nb40 = src4->nb[0]; GGML_UNUSED(nb40); const uint64_t nb41 = src4->nb[1]; const uint64_t nb42 = src4->nb[2]; + const uint64_t nb43 = src4->nb[3]; const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50); const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51); const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52); + const int64_t ne53 = src5->ne[3]; GGML_UNUSED(ne53); - const uint64_t nb50 = src5->nb[0]; + const uint64_t nb50 = src5->nb[0]; GGML_UNUSED(nb50); const uint64_t nb51 = src5->nb[1]; const uint64_t nb52 = src5->nb[2]; + const uint64_t nb53 = src5->nb[3]; + + const int64_t ne60 = src6->ne[0]; GGML_UNUSED(ne60); + + const uint64_t nb60 = src6->nb[0]; GGML_UNUSED(nb60); const int64_t d_state = ne00; const int64_t d_inner = ne01; - const int64_t n_seq_tokens = ne11; - const int64_t n_seqs = ne02; + const int64_t n_head = ne02; + const int64_t n_group = ne41; + const int64_t n_seq_tokens = ne12; + const int64_t n_seqs = ne13; + + id pipeline = nil; - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline; + if (ne30 == 1) { + // Mamba-2 + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline; + } ggml_metal_kargs_ssm_scan args = { - /*.d_state =*/ d_state, - /*.d_inner =*/ d_inner, + /*.d_state =*/ d_state, + /*.d_inner =*/ d_inner, + /*.n_head =*/ n_head, + /*.n_group =*/ n_group, /*.n_seq_tokens =*/ n_seq_tokens, - /*.n_seqs =*/ n_seqs, - /*.nb00 =*/ nb00, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.nb20 =*/ nb20, - /*.nb21 =*/ nb21, - /*.nb22 =*/ nb22, - /*.nb30 =*/ nb30, - /*.nb31 =*/ nb31, - /*.nb40 =*/ nb40, - /*.nb41 =*/ nb41, - /*.nb42 =*/ nb42, - /*.nb50 =*/ nb50, - /*.nb51 =*/ nb51, - /*.nb52 =*/ nb52, + /*.n_seqs =*/ n_seqs, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.nb21 =*/ nb21, + /*.nb22 =*/ nb22, + /*.nb31 =*/ nb31, + /*.nb41 =*/ nb41, + /*.nb42 =*/ nb42, + /*.nb43 =*/ nb43, + /*.nb51 =*/ nb51, + /*.nb52 =*/ nb52, + /*.nb53 =*/ nb53, }; [encoder setComputePipelineState:pipeline]; @@ -2883,10 +2905,17 @@ static bool ggml_metal_encode_node( [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:6]; - [encoder setBytes:&args length:sizeof(args) atIndex:7]; + [encoder setBuffer:id_src6 offset:offs_src6 atIndex:6]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:7]; + [encoder setBytes:&args length:sizeof(args) atIndex:8]; - [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + if (ne30 == 1) { + // Mamba-2 + [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + GGML_ASSERT(d_inner == 1); + [encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } } break; case GGML_OP_RWKV_WKV6: { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index dfff669726b..a7657e0fc3c 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1596,7 +1596,7 @@ kernel void kernel_ssm_conv_f32( x[0] = sumf; } -// ref: ggml.c:ggml_compute_forward_ssm_scan_f32 +// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-1 part kernel void kernel_ssm_scan_f32( device const void * src0, device const void * src1, @@ -1604,46 +1604,119 @@ kernel void kernel_ssm_scan_f32( device const void * src3, device const void * src4, device const void * src5, + device const void * src6, device float * dst, constant ggml_metal_kargs_ssm_scan & args, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { - const int64_t ir = tgpig.x; - const int64_t i3 = tgpig.y; + const int64_t i1 = 0; + const int64_t ir = tgpig.x; // current head + const int64_t i3 = tgpig.y; // current seq + + const uint64_t nb00 = sizeof(float); + const uint64_t nb10 = sizeof(float); + const uint64_t nb20 = sizeof(float); const int64_t nc = args.d_state; - // const int64_t nr = args.d_inner; + const int64_t nr = args.d_inner; + const int64_t nh = args.n_head; + const int64_t ng = args.n_group; const int64_t n_t = args.n_seq_tokens; - // const int64_t n_s = args.n_seqs; + + const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float); + + device const int32_t * ids = (device const int32_t *) src6; + + device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); + device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); for (int64_t i2 = 0; i2 < n_t; ++i2) { - device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb01 + i3*args.nb02); - device const float * x = (device const float *) ((device const char *) src1 + ir*args.nb10 + i2*args.nb11 + i3*args.nb12); - device const float * dt = (device const float *) ((device const char *) src2 + ir*args.nb20 + i2*args.nb21 + i3*args.nb22); - device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); - device const float * B = (device const float *) ((device const char *) src4 + i2*args.nb41 + i3*args.nb42); - device const float * C = (device const float *) ((device const char *) src5 + i2*args.nb51 + i3*args.nb52); - device float * y = (device float *) ((device char *) dst + ir*args.nb10 + i2*args.nb11 + i3*args.nb12); // TODO: do not use src1 strides - device float * s = (device float *) ((device char *) dst + ir*args.nb01 + i3*args.nb02 + args.nb13); - - if (i2 > 0) { - s0 = s; - } - - // i1 == 0 - float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0]; - float x_dt = x[0] * dt_soft_plus; + device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns} + device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns} + device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {d_state, nh} + device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns} + device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns} + device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns} + + const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0]; + const float x_dt = x[0] * dt_soft_plus; float sumf = 0.0f; for (int64_t i0 = 0; i0 < nc; ++i0) { - int64_t i = i0; - float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt); + const int64_t i = i0 + i1*nc; + const float state = (s0[i] * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt); sumf += state * C[i0]; s[i] = state; } y[0] = sumf; + + // recurse + s0 = s; + } +} + +// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part +// TODO: optimize (e.g. by parallelizing over d_state) +kernel void kernel_ssm_scan_f32_group( + device const void * src0, + device const void * src1, + device const void * src2, + device const void * src3, + device const void * src4, + device const void * src5, + device const void * src6, + device float * dst, + constant ggml_metal_kargs_ssm_scan & args, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i1 = tgpig.x; + const int64_t ir = tgpig.y; // current head + const int64_t i3 = tgpig.z; // current seq + + const uint64_t nb00 = sizeof(float); + const uint64_t nb10 = sizeof(float); + const uint64_t nb20 = sizeof(float); + + const int64_t nc = args.d_state; + const int64_t nr = args.d_inner; + const int64_t nh = args.n_head; + const int64_t ng = args.n_group; + const int64_t n_t = args.n_seq_tokens; + + const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float); + + device const int32_t * ids = (device const int32_t *) src6; + + device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); + device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); + + for (int64_t i2 = 0; i2 < n_t; ++i2) { + device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns} + device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns} + device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh} + device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns} + device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns} + device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns} + + const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0]; + const float x_dt = x[0] * dt_soft_plus; + const float dA = exp(dt_soft_plus * A[0]); + float sumf = 0.0f; + + for (int64_t i0 = 0; i0 < nc; ++i0) { + const int64_t i = i0 + i1*nc; + const float state = (s0[i] * dA) + (B[i0] * x_dt); + sumf += state * C[i0]; + s[i] = state; + } + + y[0] = sumf; + + // recurse + s0 = s; } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f1245c50c92..90c121339fb 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4829,7 +4829,6 @@ struct ggml_tensor * ggml_ssm_conv( const int64_t n_s = sx->ne[2]; // TODO: maybe support other strides than 1? - // FIXME: this is always true? GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t); GGML_ASSERT(sx->ne[1] == d_inner); GGML_ASSERT(n_t >= 0); @@ -4852,36 +4851,49 @@ struct ggml_tensor * ggml_ssm_scan( struct ggml_tensor * dt, struct ggml_tensor * A, struct ggml_tensor * B, - struct ggml_tensor * C) { + struct ggml_tensor * C, + struct ggml_tensor * ids) { GGML_ASSERT(ggml_is_contiguous(s)); - GGML_ASSERT(ggml_is_contiguous(x)); GGML_ASSERT(ggml_is_contiguous(dt)); GGML_ASSERT(ggml_is_contiguous(A)); - GGML_ASSERT(ggml_is_matrix(A)); - GGML_ASSERT(ggml_is_3d(B)); - GGML_ASSERT(ggml_is_3d(s)); + GGML_ASSERT(x->nb[0] == ggml_type_size(x->type)); GGML_ASSERT(B->nb[0] == ggml_type_size(B->type)); GGML_ASSERT(C->nb[0] == ggml_type_size(C->type)); - GGML_ASSERT(ggml_are_same_shape(x, dt)); + GGML_ASSERT(x->nb[1] == x->ne[0]*x->nb[0]); + GGML_ASSERT(B->nb[1] == B->ne[0]*B->nb[0]); + GGML_ASSERT(C->nb[1] == C->ne[0]*C->nb[0]); GGML_ASSERT(ggml_are_same_shape(B, C)); + GGML_ASSERT(ids->type == GGML_TYPE_I32); { const int64_t d_state = s->ne[0]; - const int64_t d_inner = s->ne[1]; - const int64_t n_seq_tokens = x->ne[1]; - const int64_t n_seqs = x->ne[2]; - - GGML_ASSERT(s->ne[2] == n_seqs); - GGML_ASSERT(x->ne[0] == d_inner); - GGML_ASSERT(A->ne[0] == d_state); - GGML_ASSERT(A->ne[1] == d_inner); + const int64_t head_dim = x->ne[0]; + const int64_t n_head = x->ne[1]; + const int64_t n_seq_tokens = x->ne[2]; + const int64_t n_seqs = x->ne[3]; + + GGML_ASSERT(dt->ne[0] == n_head); + GGML_ASSERT(dt->ne[1] == n_seq_tokens); + GGML_ASSERT(dt->ne[2] == n_seqs); + GGML_ASSERT(ggml_is_3d(dt)); + GGML_ASSERT(s->ne[1] == head_dim); + GGML_ASSERT(s->ne[2] == n_head); GGML_ASSERT(B->ne[0] == d_state); - GGML_ASSERT(B->ne[1] == n_seq_tokens); - GGML_ASSERT(B->ne[2] == n_seqs); + GGML_ASSERT(B->ne[2] == n_seq_tokens); + GGML_ASSERT(B->ne[3] == n_seqs); + GGML_ASSERT(ids->ne[0] == n_seqs); + GGML_ASSERT(ggml_is_vector(ids)); + GGML_ASSERT(A->ne[1] == n_head); + GGML_ASSERT(ggml_is_matrix(A)); + + if (A->ne[0] != 1) { + // Mamba-1 has more granular decay factors + GGML_ASSERT(A->ne[0] == d_state); + } } // concatenated y + ssm_states - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s)); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + s->ne[0]*s->ne[1]*s->ne[2]*ids->ne[0]); result->op = GGML_OP_SSM_SCAN; result->src[0] = s; @@ -4890,6 +4902,7 @@ struct ggml_tensor * ggml_ssm_scan( result->src[3] = A; result->src[4] = B; result->src[5] = C; + result->src[6] = ids; return result; } From 9cde908c0a50d530b5e4fb4d7a06ae69bfc62294 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 3 Jul 2025 07:45:11 +0800 Subject: [PATCH 0065/1348] CUDA: add dynamic shared mem to softmax, refactor general usage (llama/14497) --- ggml/src/ggml-cuda/common.cuh | 14 +++++ ggml/src/ggml-cuda/cross-entropy-loss.cu | 16 +---- ggml/src/ggml-cuda/mmq.cuh | 10 +-- ggml/src/ggml-cuda/softmax.cu | 78 ++++++++++++------------ 4 files changed, 56 insertions(+), 62 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index ea203550238..954f74d408f 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -175,6 +175,20 @@ static const char * cu_get_error_str(CUresult err) { #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) #endif +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) +#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \ + do { \ + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \ + const int id = ggml_cuda_get_device(); \ + if (!shared_memory_limit_raised[id]) { \ + CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \ + shared_memory_limit_raised[id] = true; \ + } \ + } while (0) +#else +#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) do {} while (0) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) #define GGML_CUDA_ASSUME(x) __builtin_assume(x) #else diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index 0ce4afbb222..0c8b0819724 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -123,13 +123,7 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_cuda_pool_alloc dst_tmp(pool, blocks_num.x); if (nbytes_shared <= smpbo) { -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shared_memory_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo)); - shared_memory_limit_raised[id] = true; - } -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32), smpbo); cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); } else { cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); @@ -175,13 +169,7 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten const size_t smpbo = ggml_cuda_info().devices[id].smpbo; if (nbytes_shared <= smpbo) { -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shared_memory_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo)); - shared_memory_limit_raised[id] = true; - } -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_back_f32), smpbo); cross_entropy_loss_back_f32<<>>(grad_d, src0f_d, src1f_d, dst_d, ne00); } else { cross_entropy_loss_back_f32<<>>(grad_d, src0f_d, src1f_d, dst_d, ne00); diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 80baf459c15..9696a320462 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -3016,14 +3016,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc); -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shared_memory_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); - shared_memory_limit_raised[id] = true; - } -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); + CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x; diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index e0eb921d85d..14543e978cf 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -2,6 +2,7 @@ #include "ggml.h" #include "softmax.cuh" #include +#include template static __device__ __forceinline__ float t2f32(T val) { @@ -181,6 +182,37 @@ static __global__ void soft_max_back_f32( } } +template +static void launch_soft_max_kernels(const float * x, const T * mask, float * dst, + const soft_max_params & p, cudaStream_t stream, dim3 block_dims, dim3 block_nums, size_t nbytes_shared) +{ + const int id = ggml_cuda_get_device(); + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; + + auto launch_kernel = [=](auto I) -> bool { + constexpr int ncols = decltype(I)::value; + constexpr int block = (ncols > 1024 ? 1024 : ncols); + + if (p.ncols == ncols) { + CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32), smpbo); + soft_max_f32<<>> + (x, mask, dst, p); + return true; + } + return false; + }; + + // unary fold over launch_kernel + if ((launch_kernel(std::integral_constant{}) || ...)) { + return; + } + + //default case + CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32), smpbo); + soft_max_f32<<>>(x, mask, dst, p); +} + + template static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const soft_max_params & params, cudaStream_t stream) { int nth = WARP_SIZE; @@ -193,46 +225,12 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); - // FIXME: this limit could be raised by ~2-4x on Ampere or newer - if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { - switch (ncols_x) { - case 32: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 64: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 128: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 256: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 512: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 1024: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 2048: - soft_max_f32<<>> - (x, mask, dst, params); - break; - case 4096: - soft_max_f32<<>> - (x, mask, dst, params); - break; - default: - soft_max_f32<<>> - (x, mask, dst, params); - break; - } + const int id = ggml_cuda_get_device(); + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; + + + if (nbytes_shared <= smpbo) { + launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, dst, params, stream, block_dims, block_nums, nbytes_shared); } else { const size_t nbytes_shared_low = WARP_SIZE*sizeof(float); soft_max_f32<<>>(x, mask, dst, params); From 0abd0660e14b828b31cf05da1f364c7474f03287 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 3 Jul 2025 10:46:57 +0300 Subject: [PATCH 0066/1348] ggml : fix FA mask dim 2 and 3 (llama/14505) * ggml : fix FA mask dim 2 and 3 ggml-ci * backends : unsupport batched FA in CUDA and Vulkan ggml-ci * vulkan : disable FA for mask->ne[2] != 1 --- ggml/include/ggml.h | 13 +++++++------ ggml/src/ggml-cpu/ops.cpp | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 3 ++- ggml/src/ggml-metal/ggml-metal-impl.h | 2 ++ ggml/src/ggml-metal/ggml-metal.m | 2 ++ ggml/src/ggml-metal/ggml-metal.metal | 4 ++-- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++++++ ggml/src/ggml.c | 5 ++--- 8 files changed, 24 insertions(+), 13 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index fbe072a3e0c..d25bfcc9f45 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1980,15 +1980,16 @@ extern "C" { #define GGML_KQ_MASK_PAD 64 - // q: [n_embd_k, n_batch, n_head, ne3] - // k: [n_embd_k, n_kv, n_head_kv, ne3] - // v: [n_embd_v, n_kv, n_head_kv, ne3] !! not transposed !! - // mask: [n_kv, n_batch_pad, ne32, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !! - // res: [n_embd_v, n_head, n_batch, ne3] !! permuted !! + // q: [n_embd_k, n_batch, n_head, ne3 ] + // k: [n_embd_k, n_kv, n_head_kv, ne3 ] + // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !! + // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !! + // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !! // // broadcast: // n_head % n_head_kv == 0 - // ne3 % ne32 == 0 + // n_head % ne32 == 0 + // ne3 % ne33 == 0 // GGML_API struct ggml_tensor * ggml_flash_attn_ext( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 2f153419542..0fb2c08b5bc 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7799,7 +7799,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( memset(VKQ32, 0, DV*sizeof(float)); } - const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq3%mask->ne[2])*mask->nb[2]) : NULL; + const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]) : NULL; // k indices const int ik3 = iq3 / rk3; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 121e83641b2..1c04bba52e8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3390,7 +3390,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } // TODO: support broadcast - // ref: https://github.com/ggml-org/llama.cpp/pull/14435 + // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but + // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505 if (op->src[0]->ne[3] != 1) { return false; } diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index adb65c0e7b7..752d55c2166 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -230,8 +230,10 @@ typedef struct { uint64_t nb22; uint64_t nb23; int32_t ne32; + int32_t ne33; uint64_t nb31; uint64_t nb32; + uint64_t nb33; int32_t ne1; int32_t ne2; float scale; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index de40430ef35..5e5467e88a1 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -5018,8 +5018,10 @@ static bool ggml_metal_encode_node( /*.nb22 =*/ nb22, /*.nb23 =*/ nb23, /*.ne32 =*/ ne32, + /*.ne33 =*/ ne33, /*.nb31 =*/ nb31, /*.nb32 =*/ nb32, + /*.nb33 =*/ nb33, /*.ne1 =*/ ne1, /*.ne2 =*/ ne2, /*.scale =*/ scale, diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index a7657e0fc3c..ebde005f33e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -3857,7 +3857,7 @@ kernel void kernel_flash_attn_ext( // load the mask in shared memory #pragma unroll(Q) for (short j = 0; j < Q; ++j) { - device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq3%args.ne32)*args.nb32); + device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33); const float m = pm[ic + tiisg]; @@ -4343,7 +4343,7 @@ kernel void kernel_flash_attn_ext_vec( const bool has_mask = mask != q; // pointer to the mask - device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq3%args.ne32)*args.nb32); + device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33); float slope = 1.0f; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 25f70127a62..ccb0d47b675 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -10265,6 +10265,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { return false; } + // TODO: support broadcast + // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14449, but + // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505 + if (op->src[0]->ne[3] != 1 || (op->src[3] && op->src[3]->ne[2] != 1)) { + return false; + } // It's straightforward to support different K/V dequant, but would // significantly increase the number of pipelines if (op->src[1]->type != op->src[2]->type) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 90c121339fb..b89a68db83c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3666,7 +3666,6 @@ static struct ggml_tensor * ggml_soft_max_impl( if (mask) { GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguous(mask)); - GGML_ASSERT(ggml_is_3d(mask)); GGML_ASSERT(mask->ne[0] == a->ne[0]); GGML_ASSERT(mask->ne[1] >= a->ne[1]); GGML_ASSERT(a->ne[2]%mask->ne[2] == 0); @@ -4696,12 +4695,12 @@ struct ggml_tensor * ggml_flash_attn_ext( if (mask) { GGML_ASSERT(ggml_is_contiguous(mask)); - GGML_ASSERT(mask->ne[2] == q->ne[3]); GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) && "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big"); //GGML_ASSERT(ggml_can_repeat_rows(mask, qk)); - GGML_ASSERT(q->ne[3] % mask->ne[2] == 0); + GGML_ASSERT(q->ne[2] % mask->ne[2] == 0); + GGML_ASSERT(q->ne[3] % mask->ne[3] == 0); } if (max_bias > 0.0f) { From ed639c7f2286bd5b8dd2427dd57832fb613d1510 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 3 Jul 2025 10:53:35 +0300 Subject: [PATCH 0067/1348] kv-cache : use ggml_set_rows (llama/14285) * kv-cache : use ggml_set_rows ggml-ci * graph : separate k and v indices ggml-ci * cont : remove redundant ifs ggml-ci * kv-cache : improve find_slot impl * kv-cache : bounds-check when accessing slot_info indices * kv-cache : add comments ggml-ci * ggml : add TODOs for adding GGML_OP_SET_ROWS support in the backends ggml-ci --- ggml/src/ggml-cann/ggml-cann.cpp | 6 ++++++ ggml/src/ggml-opencl/ggml-opencl.cpp | 6 ++++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 6 ++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++++++ 4 files changed, 24 insertions(+) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 8a3d2026ed8..eae575cc040 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2086,6 +2086,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, return false; } } break; + case GGML_OP_SET_ROWS: + { + // TODO: add support + // ref: https://github.com/ggml-org/llama.cpp/pull/14274 + return false; + } break; case GGML_OP_CPY: { ggml_tensor *src = op->src[0]; if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) || diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 71829c05a5b..9436e6ea9a0 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2222,6 +2222,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te default: return false; } + case GGML_OP_SET_ROWS: + { + // TODO: add support + // ref: https://github.com/ggml-org/llama.cpp/pull/14274 + return false; + } break; case GGML_OP_CPY: case GGML_OP_DUP: case GGML_OP_CONT: diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 1d41d7a4be8..ee4c8f7b2fa 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4285,6 +4285,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return false; } } + case GGML_OP_SET_ROWS: + { + // TODO: add support + // ref: https://github.com/ggml-org/llama.cpp/pull/14274 + return false; + } break; case GGML_OP_CPY: { ggml_type src0_type = op->src[0]->type; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ccb0d47b675..8114cb42046 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -10339,6 +10339,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return false; } } break; + case GGML_OP_SET_ROWS: + { + // TODO: add support + // ref: https://github.com/ggml-org/llama.cpp/pull/14274 + return false; + } break; case GGML_OP_CONT: case GGML_OP_CPY: case GGML_OP_DUP: From 0ca760433c29b037532910db18660a0622782593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Thu, 3 Jul 2025 11:00:03 +0200 Subject: [PATCH 0068/1348] Fix conditional enabling following arch checks for ggml-sycl (llama/14504) Signed-off-by: nscipione --- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ee4c8f7b2fa..b063a698ded 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -83,7 +83,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - info.devices[i].opt_feature.reorder = !device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); + info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); } From 4aaf8114e7953715ef9880463d0de46dd294e13d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 3 Jul 2025 17:05:18 +0200 Subject: [PATCH 0069/1348] ggml: backward pass for split swiglu (llama/14483) --- ggml/src/ggml.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b89a68db83c..68768842904 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6042,13 +6042,28 @@ static void ggml_compute_backward( } GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented"); } break; + case GGML_OP_GLU: { + switch (ggml_get_glu_op(tensor)) { + case GGML_GLU_OP_SWIGLU: { + if (src0_needs_grads) { + GGML_ASSERT(src1 && "backward pass only implemented for split swiglu"); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, ggml_mul(ctx, grad, src1), src0)); + } + if (src1_needs_grads) { + ggml_add_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, ggml_silu(ctx, src0), grad)); + } + } break; + default: { + GGML_ABORT("unsupported glu op for backward pass: %s", ggml_glu_op_name(ggml_get_glu_op(tensor))); + } //break; + } + } break; case GGML_OP_NONE: { // noop } break; case GGML_OP_COUNT: default: { - fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op)); - GGML_ABORT("fatal error"); + GGML_ABORT("%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op)); } //break; } From a432929d585993de8ee8ca97a5d2e7f43e05756e Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 3 Jul 2025 13:21:14 -0500 Subject: [PATCH 0070/1348] vulkan: support mixed/deepseekR1 FA head sizes (llama/14509) * vulkan: better parameterize FA by head sizes * vulkan: support mixed/deepseekR1 FA head sizes --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 232 ++++++++++-------- .../vulkan-shaders/flash_attn.comp | 43 ++-- .../vulkan-shaders/flash_attn_base.comp | 8 +- .../vulkan-shaders/flash_attn_cm1.comp | 62 ++--- .../vulkan-shaders/flash_attn_cm2.comp | 50 ++-- 5 files changed, 210 insertions(+), 185 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8114cb42046..c0032cba218 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -224,6 +224,21 @@ enum vk_device_architecture { INTEL_XE2, }; +// HSK x HSV +enum FaHeadSizes { + FA_HEAD_SIZE_64, + FA_HEAD_SIZE_80, + FA_HEAD_SIZE_96, + FA_HEAD_SIZE_112, + FA_HEAD_SIZE_128, + FA_HEAD_SIZE_192, + FA_HEAD_SIZE_192_128, + FA_HEAD_SIZE_256, + FA_HEAD_SIZE_576_512, + FA_HEAD_SIZE_UNSUPPORTED, + FA_HEAD_SIZE_COUNT = FA_HEAD_SIZE_UNSUPPORTED, +}; + static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { vk::PhysicalDeviceProperties props = device.getProperties(); @@ -467,26 +482,11 @@ struct vk_device_struct { vk_pipeline pipeline_conv2d_dw_cwhn_f32; // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} - vk_pipeline pipeline_flash_attn_f32_f16_D64_cm2[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D80_cm2[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D96_cm2[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D112_cm2[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D128_cm2[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D256_cm2[GGML_TYPE_COUNT][2][2][2]; - - vk_pipeline pipeline_flash_attn_f32_f16_D64_cm1[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D80_cm1[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D96_cm1[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D112_cm1[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D128_cm1[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D256_cm1[GGML_TYPE_COUNT][2][2][2]; - - vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; - vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_cm2[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2]; + + vk_pipeline pipeline_flash_attn_f32_f16_cm1[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2]; + + vk_pipeline pipeline_flash_attn_f32_f16[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2]; vk_pipeline pipeline_flash_attn_split_k_reduce; @@ -1003,7 +1003,7 @@ struct ggml_backend_vk_context { // number of additional consecutive nodes that are being fused with the // node currently being processed - uint32_t num_additional_fused_ops {}; + int num_additional_fused_ops {}; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -1699,6 +1699,35 @@ enum FaCodePath { FA_COOPMAT2, }; +static FaHeadSizes fa_get_head_sizes(uint32_t hsk, uint32_t hsv) { + if (hsk != 192 && hsk != 576 && hsk != hsv) { + return FA_HEAD_SIZE_UNSUPPORTED; + } + switch (hsk) { + case 64: return FA_HEAD_SIZE_64; + case 80: return FA_HEAD_SIZE_80; + case 96: return FA_HEAD_SIZE_96; + case 112: return FA_HEAD_SIZE_112; + case 128: return FA_HEAD_SIZE_128; + case 192: + if (hsv == 192) { + return FA_HEAD_SIZE_192; + } else if (hsv == 128) { + return FA_HEAD_SIZE_192_128; + } else { + return FA_HEAD_SIZE_UNSUPPORTED; + } + case 256: return FA_HEAD_SIZE_256; + case 576: + if (hsv == 512) { + return FA_HEAD_SIZE_576_512; + } else { + return FA_HEAD_SIZE_UNSUPPORTED; + } + default: return FA_HEAD_SIZE_UNSUPPORTED; + } +} + // number of rows/cols for flash attention shader static constexpr uint32_t flash_attention_num_small_rows = 32; static constexpr uint32_t scalar_flash_attention_num_small_rows = 1; @@ -1719,8 +1748,9 @@ static uint32_t get_fa_num_small_rows(FaCodePath path) { } } -static std::array fa_rows_cols(FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) { +static std::array fa_rows_cols(FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows) { GGML_UNUSED(clamp); + GGML_UNUSED(hsv); if (path == FA_SCALAR) { if (small_rows) { @@ -1744,7 +1774,7 @@ static std::array fa_rows_cols(FaCodePath path, uint32_t D, uint32_ } // small cols to reduce register count - if (ggml_is_quantized(type) || D == 256) { + if (ggml_is_quantized(type) || hsk >= 256) { return {64, 32}; } return {64, 64}; @@ -2037,19 +2067,21 @@ static void ggml_vk_load_shaders(vk_device& device) { parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); }; - auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { - return {fa_rows_cols(path, D, clamp, type, small_rows)[0], 1, 1}; + auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { + return {fa_rows_cols(path, hsk, hsv, clamp, type, small_rows)[0], 1, 1}; }; - auto const &fa_spec_constants = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector { + auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector { // For large number of rows, 128 invocations seems to work best. // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we // can't use 256 for D==80. // For scalar, use 128 (arbitrary) + // The same D_split value is used for both HSK and HSV, so just base it on the union of the LSBs. + const uint32_t D = (hsk|hsv); uint32_t wg_size = (path == FA_SCALAR || path == FA_COOPMAT1) ? scalar_flash_attention_workgroup_size : ((small_rows && (D % 32) == 0) ? 256 : 128); - auto rows_cols = fa_rows_cols(path, D, clamp, type, small_rows); + auto rows_cols = fa_rows_cols(path, hsk, hsv, clamp, type, small_rows); // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it. // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader. @@ -2058,26 +2090,29 @@ static void ggml_vk_load_shaders(vk_device& device) { // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0); - return {wg_size, rows_cols[0], rows_cols[1], (D), clamp, D_split}; + return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split}; }; -#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, D) \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true), fa_rows_cols(FAPATH,D,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true), fa_rows_cols(FAPATH,D,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ +#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, HSK, HSV, HEAD_SIZES) \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ #define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256) + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64, 64, 64) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80, 80, 80) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96, 96, 96) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112, 112, 112) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128, 128, 128) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 192, 192, 192) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 192, 128, 192_128) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256, 256, 256) \ + CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 576, 512, 576_512) CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, ) @@ -3688,7 +3723,6 @@ static void ggml_vk_instance_init() { } - size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan @@ -6002,24 +6036,47 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx } } -static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t D, bool f32acc) { +static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const uint32_t hsk, uint32_t hsv) { // Needs to be kept up to date on shader changes + GGML_UNUSED(hsv); const uint32_t wg_size = scalar_flash_attention_workgroup_size; const uint32_t Br = scalar_flash_attention_num_large_rows; const uint32_t Bc = scalar_flash_attention_Bc; + const uint32_t tmpsh = wg_size * sizeof(float); + const uint32_t tmpshv4 = wg_size * 4 * sizeof(float); + + const uint32_t masksh = Bc * Br * sizeof(float); + + const uint32_t Qf = Br * (hsk / 4 + 2) * 4 * sizeof(float); + + const uint32_t total_size = tmpsh + tmpshv4 + masksh + Qf; + const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; + + VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", total_size=" << total_size << ", supported=" << supported); + + return supported; +} + +static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t hsk, uint32_t hsv, bool f32acc) { + // Needs to be kept up to date on shader changes + GGML_UNUSED(hsv); + const uint32_t wg_size = scalar_flash_attention_workgroup_size; + const uint32_t Br = coopmat1_flash_attention_num_large_rows; + const uint32_t Bc = scalar_flash_attention_Bc; + const uint32_t acctype = f32acc ? 4 : 2; const uint32_t f16vec4 = 8; const uint32_t tmpsh = wg_size * sizeof(float); const uint32_t tmpshv4 = wg_size * 4 * acctype; - const uint32_t Qf = Br * (D / 4 + 2) * f16vec4; + const uint32_t Qf = Br * (hsk / 4 + 2) * f16vec4; - const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br; + const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br; const uint32_t sfsh = Bc * sfshstride * acctype; - const uint32_t kshstride = D / 4 + 2; + const uint32_t kshstride = hsk / 4 + 2; const uint32_t ksh = Bc * kshstride * f16vec4; const uint32_t slope = Br * sizeof(float); @@ -6027,7 +6084,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co const uint32_t total_size = tmpsh + tmpshv4 + Qf + sfsh + ksh + slope; const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; - VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(D=" << D << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported); + VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported); return supported; } @@ -6051,11 +6108,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const uint32_t nem1 = mask ? mask->ne[1] : 0; const uint32_t nem2 = mask ? mask->ne[2] : 0; - const uint32_t D = neq0; + const uint32_t HSK = nek0; + const uint32_t HSV = nev0; uint32_t N = neq1; const uint32_t KV = nek1; - GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne0 == HSV); GGML_ASSERT(ne2 == N); // input tensor rows must be contiguous @@ -6063,12 +6121,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx GGML_ASSERT(nbk0 == ggml_type_size(k->type)); GGML_ASSERT(nbv0 == ggml_type_size(v->type)); - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev0 == D); + GGML_ASSERT(neq0 == HSK); GGML_ASSERT(neq1 == N); - GGML_ASSERT(nev0 == D); GGML_ASSERT(nev1 == nek1); @@ -6089,7 +6144,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) || (dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc); - const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, D, dst->op_params[3] == GGML_PREC_F32); + const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, HSK, HSV, dst->op_params[3] == GGML_PREC_F32); if (!coopmat_shape_supported || !coopmat_shmem_supported) { path = FA_SCALAR; @@ -6142,47 +6197,25 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx path = FA_SCALAR; } + // with large hsk/hsv, scalar path may need to use small_rows to fit in shared memory + if (path == FA_SCALAR && + !ggml_vk_flash_attn_scalar_shmem_support(ctx->device, HSK, HSV)) { + small_rows = true; + } + bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32; + FaHeadSizes head_sizes = fa_get_head_sizes(k->ne[0], v->ne[0]); + switch (path) { case FA_SCALAR: - switch (D) { - case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break; - case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break; - case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break; - case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break; - case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break; - case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break; - default: - GGML_ASSERT(!"unsupported D value"); - return; - } + pipelines = &ctx->device->pipeline_flash_attn_f32_f16[k->type][head_sizes][f32acc][small_rows][0]; break; case FA_COOPMAT1: - switch (D) { - case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm1[k->type][f32acc][small_rows][0]; break; - case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm1[k->type][f32acc][small_rows][0]; break; - case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm1[k->type][f32acc][small_rows][0]; break; - case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm1[k->type][f32acc][small_rows][0]; break; - case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm1[k->type][f32acc][small_rows][0]; break; - case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm1[k->type][f32acc][small_rows][0]; break; - default: - GGML_ASSERT(!"unsupported D value"); - return; - } + pipelines = &ctx->device->pipeline_flash_attn_f32_f16_cm1[k->type][head_sizes][f32acc][small_rows][0]; break; case FA_COOPMAT2: - switch (D) { - case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm2[k->type][f32acc][small_rows][0]; break; - case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm2[k->type][f32acc][small_rows][0]; break; - case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm2[k->type][f32acc][small_rows][0]; break; - case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm2[k->type][f32acc][small_rows][0]; break; - case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm2[k->type][f32acc][small_rows][0]; break; - case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm2[k->type][f32acc][small_rows][0]; break; - default: - GGML_ASSERT(!"unsupported D value"); - return; - } + pipelines = &ctx->device->pipeline_flash_attn_f32_f16_cm2[k->type][head_sizes][f32acc][small_rows][0]; break; default: GGML_ASSERT(0); @@ -6212,7 +6245,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // Try to use split_k when KV is large enough to be worth the overhead if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) { // Try to run two workgroups per SM. - split_k = ctx->device->shader_core_count * 2 / (workgroups_y * workgroups_z); + split_k = shader_core_count * 2 / (workgroups_y * workgroups_z); if (split_k > 1) { // Try to evenly split KV into split_k chunks, but it needs to be a multiple // of "align", so recompute split_k based on that. @@ -6224,7 +6257,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1) // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows. - const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0; + const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0; if (split_k_size > ctx->device->max_memory_allocation_size) { GGML_ABORT("Requested preallocation size is too large"); } @@ -6342,7 +6375,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); ggml_vk_sync_buffers(subctx); - const std::array pc2 = { D, (uint32_t)ne1, (uint32_t)ne3, split_k }; + const std::array pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, { vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, @@ -10241,19 +10274,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); bool coopmat2 = device->coopmat2; - switch (op->src[0]->ne[0]) { - case 64: - case 80: - case 96: - case 112: - case 128: - case 256: - break; - default: - return false; - } - if (op->src[1]->ne[0] != op->src[2]->ne[0]) { - // different head sizes of K and V are not supported yet + FaHeadSizes head_sizes = fa_get_head_sizes(op->src[1]->ne[0], op->src[2]->ne[0]); + if (head_sizes == FA_HEAD_SIZE_UNSUPPORTED) { return false; } if (op->src[0]->type != GGML_TYPE_F32) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 6f80101d1c4..788a5e065d1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -11,7 +11,8 @@ #include "types.comp" #include "flash_attn_base.comp" -const uint32_t D_per_thread = D / D_split; +const uint32_t HSK_per_thread = HSK / D_split; +const uint32_t HSV_per_thread = HSV / D_split; const uint32_t cols_per_iter = WorkGroupSize / D_split; const uint32_t cols_per_thread = Bc / cols_per_iter; @@ -29,7 +30,7 @@ layout (binding = 3) readonly buffer M {float16_t data_m[];}; // Rows index by Q's dimension 2, and the first N rows are valid. D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) { - uint32_t offset = (iq2 + r) * D + c; + uint32_t offset = (iq2 + r) * HSV + c; data_o[o_offset + offset] = D_TYPE(elem); return elem; } @@ -38,7 +39,7 @@ shared FLOAT_TYPE tmpsh[WorkGroupSize]; shared vec4 tmpshv4[WorkGroupSize]; shared float masksh[Bc][Br]; -shared vec4 Qf[Br][D / 4]; +shared vec4 Qf[Br][HSK / 4]; void main() { #ifdef NEEDS_INIT_IQ_SHMEM @@ -53,18 +54,18 @@ void main() { uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4; - [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) { - uint32_t d = (idx + tid) % (D / 4); - uint32_t r = (idx + tid) / (D / 4); - if (r < Br && d < D / 4 && + [[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) { + uint32_t d = (idx + tid) % (HSK / 4); + uint32_t r = (idx + tid) / (HSK / 4); + if (r < Br && d < HSK / 4 && i * Br + r < N) { Qf[r][d] = vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d]) * p.scale; } } barrier(); - vec4 Of[Br][D_per_thread / 4]; - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + vec4 Of[Br][HSV_per_thread / 4]; + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { Of[r][d] = vec4(0.0); } @@ -116,7 +117,7 @@ void main() { [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSK_per_thread / 4; ++d) { #if BLOCK_SIZE > 1 uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid); uint ib = coord / BLOCK_SIZE; @@ -195,14 +196,14 @@ void main() { Lf[r] = eMf[r]*Lf[r] + rowsumf[r]; } - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { Of[r][d] = eMf[r] * Of[r][d]; } } [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { #if BLOCK_SIZE > 1 uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid); uint ib = coord / BLOCK_SIZE; @@ -259,7 +260,7 @@ void main() { Lf[r] = tmpsh[d_tid]; barrier(); - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { Of[r][d] = eMf * Of[r][d]; tmpshv4[tid] = Of[r][d]; @@ -281,11 +282,11 @@ void main() { // If there is split_k, then the split_k resolve shader does the final // division by L. Store the intermediate O value and per-row m and L values. if (p.k_num > 1) { - uint32_t o_offset = D * p.ne1 * (split_k_index + iq3 * p.k_num); + uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num); [[unroll]] for (uint32_t r = 0; r < Br; ++r) { if (r < N) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) { perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N); } @@ -293,7 +294,7 @@ void main() { } } - o_offset = D * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; + o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; [[unroll]] for (uint32_t r = 0; r < Br; ++r) { if (r < N) { perElemOpStoreCol0(r, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N); @@ -309,18 +310,18 @@ void main() { Lfrcp[r] = 1.0 / Lf[r]; } - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { Of[r][d] *= Lfrcp[r]; } } - uint32_t o_offset = iq3*p.ne2*p.ne1*D; + uint32_t o_offset = iq3*p.ne2*p.ne1*HSV; if (p.gqa_ratio > 1) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { if (r < N) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) { perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N); } @@ -330,9 +331,9 @@ void main() { } else { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { if (i * Br + r < N) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) { - data_o[o_offset + iq2 * D + (i * Br + r) * p.ne1 * D + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]); + data_o[o_offset + iq2 * HSV + (i * Br + r) * p.ne1 * HSV + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]); } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp index 67b935cf57e..6609f0bad3d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -4,10 +4,10 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint32_t WorkGroupSize = 128; layout (constant_id = 1) const uint32_t Br = 1; layout (constant_id = 2) const uint32_t Bc = 32; -layout (constant_id = 3) const uint32_t D = 32; -layout (constant_id = 4) const uint32_t Clamp = 0; -layout (constant_id = 5) const uint32_t D_split = 16; - +layout (constant_id = 3) const uint32_t HSK = 32; +layout (constant_id = 4) const uint32_t HSV = 32; +layout (constant_id = 5) const uint32_t Clamp = 0; +layout (constant_id = 6) const uint32_t D_split = 16; layout (push_constant) uniform parameter { uint32_t N; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 26fe50c7a81..e74e2fa9346 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -13,7 +13,9 @@ #include "types.comp" #include "flash_attn_base.comp" -const uint32_t D_per_thread = D / D_split; +const uint32_t HSK_per_thread = HSK / D_split; +const uint32_t HSV_per_thread = HSV / D_split; + const uint32_t row_split = 4; const uint32_t rows_per_thread = Br / row_split; const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split / row_split; @@ -32,7 +34,7 @@ layout (binding = 3) readonly buffer M {float16_t data_m[];}; // Rows index by Q's dimension 2, and the first N rows are valid. D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) { - uint32_t offset = (iq2 + r) * D + c; + uint32_t offset = (iq2 + r) * HSV + c; data_o[o_offset + offset] = D_TYPE(elem); return elem; } @@ -44,14 +46,14 @@ const uint32_t MatBc = 16; shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x]; shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x]; -const uint32_t qstride = D / 4 + 2; // in units of f16vec4 +const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4 shared f16vec4 Qf[Br * qstride]; -// Avoid padding for D==256 to make it fit in 48KB shmem. -const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br; +// Avoid padding for hsk==256 to make it fit in 48KB shmem. +const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br; shared ACC_TYPE sfsh[Bc * sfshstride]; -const uint32_t kshstride = D / 4 + 2; // in units of f16vec4 +const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4 shared f16vec4 ksh[Bc * kshstride]; shared float slope[Br]; @@ -74,18 +76,18 @@ void main() { uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4; - [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) { - uint32_t d = (idx + tid) % (D / 4); - uint32_t r = (idx + tid) / (D / 4); - if (r < Br && d < D / 4 && + [[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) { + uint32_t d = (idx + tid) % (HSK / 4); + uint32_t r = (idx + tid) / (HSK / 4); + if (r < Br && d < HSK / 4 && i * Br + r < N) { Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale); } } barrier(); - ACC_TYPEV4 Of[rows_per_thread][D_per_thread / 4]; - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + ACC_TYPEV4 Of[rows_per_thread][HSV_per_thread / 4]; + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { Of[r][d] = ACC_TYPEV4(0.0); } @@ -131,10 +133,10 @@ void main() { [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { - [[unroll]] for (uint32_t idx = 0; idx < Bc * D / 4; idx += gl_WorkGroupSize.x) { - uint32_t d = (idx + tid) % (D / 4); - uint32_t c = (idx + tid) / (D / 4); - if (c < Bc && d < D / 4) { + [[unroll]] for (uint32_t idx = 0; idx < Bc * HSK / 4; idx += gl_WorkGroupSize.x) { + uint32_t d = (idx + tid) % (HSK / 4); + uint32_t c = (idx + tid) / (HSK / 4); + if (c < Bc && d < HSK / 4) { #if BLOCK_SIZE > 1 uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d; uint ib = coord / BLOCK_SIZE; @@ -149,14 +151,14 @@ void main() { } barrier(); - // K * Q^T -> S^T: Bc x D * D x Br -> Bc x Br - // Bc split across workgroup (four subgroups), loop over D in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16 + // K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br + // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16 // This is written transposed in order to allow for N being 8 if implementations need it coopmat SfMat = coopmat(0); coopmat KMat; coopmat QMat; - for (uint32_t d = 0; d < D / 16; ++d) { + for (uint32_t d = 0; d < HSK / 16; ++d) { coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor); uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4; @@ -206,7 +208,7 @@ void main() { eMf[r] = exp(Moldf - Mf[r]); } - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { Of[r][d] = float16_t(eMf[r]) * Of[r][d]; } @@ -221,7 +223,7 @@ void main() { Pf[r] = exp(sfsh[tile_row(r) + (c * cols_per_iter + col_tid) * sfshstride] - Mf[r]); Lf[r] += Pf[r]; } - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { #if BLOCK_SIZE > 1 uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid); uint ib = coord / BLOCK_SIZE; @@ -284,7 +286,7 @@ void main() { } [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { Of[r][d] = float16_t(eMf[r]) * Of[r][d]; tmpshv4[tid] = Of[r][d]; @@ -304,11 +306,11 @@ void main() { // If there is split_k, then the split_k resolve shader does the final // division by L. Store the intermediate O value and per-row m and L values. if (p.k_num > 1) { - uint32_t o_offset = D * p.ne1 * (split_k_index + iq3 * p.k_num); + uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num); [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { if (tile_row(r) < N) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) { perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N); } @@ -316,7 +318,7 @@ void main() { } } - o_offset = D * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; + o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { if (tile_row(r) < N) { perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N); @@ -332,18 +334,18 @@ void main() { Lfrcp[r] = 1.0 / Lf[r]; } - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { Of[r][d] *= float16_t(Lfrcp[r]); } } - uint32_t o_offset = iq3*p.ne2*p.ne1*D; + uint32_t o_offset = iq3*p.ne2*p.ne1*HSV; if (p.gqa_ratio > 1) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { if (tile_row(r) < N) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) { perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N); } @@ -353,9 +355,9 @@ void main() { } else { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { if (i * Br + tile_row(r) < N) { - [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) { + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) { - data_o[o_offset + iq2 * D + (i * Br + tile_row(r)) * p.ne1 * D + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]); + data_o[o_offset + iq2 * HSV + (i * Br + tile_row(r)) * p.ne1 * HSV + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]); } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index cf47bd53e28..8792d5195e4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -61,8 +61,8 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele // Rows index by Q's dimension 2, and the first N rows are valid. D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) { - if (r < N && c < D) { - uint32_t offset = (iq2 + r) * D + c; + if (r < N && c < HSV) { + uint32_t offset = (iq2 + r) * HSV + c; data_o[o_offset + offset] = D_TYPE(elem); } return elem; @@ -86,9 +86,9 @@ void main() { tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE); #endif - tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D); - tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); - tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); + tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, HSK); + tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, HSK); + tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, HSV); // hint to the compiler that strides are aligned for the aligned variant of the shader if (Clamp != gl_CooperativeMatrixClampModeConstantNV) @@ -104,16 +104,16 @@ void main() { tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1); tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1); - coopmat Q; - coopmat Qf16; + coopmat Q; + coopmat Qf16; uint32_t q_offset = iq2*p.nb02+iq3*p.nb03; - coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D)); + coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK)); - Qf16 = coopmat(Q); + Qf16 = coopmat(Q); Qf16 *= float16_t(p.scale); - coopmat O = coopmat(0); + coopmat O = coopmat(0); coopmat L, M; @@ -140,10 +140,10 @@ void main() { coopmat S = coopmat(0); - coopmat K_T; + coopmat K_T; uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13; - coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC); + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC); S = coopMatMulAdd(Qf16, K_T, S); if (p.logit_softcap != 0.0f) { @@ -208,42 +208,42 @@ void main() { rowsum = coopmat(0.0); rowsum = coopMatMulAdd(P_A, One, rowsum); - coopmat V; + coopmat V; uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23; - coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC); + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC); L = eM*L + rowsum; // This is the "diagonal" matrix in the paper, but since we do componentwise // multiply rather than matrix multiply it has the diagonal element smeared // across the row - coopmat eMdiag; + coopmat eMdiag; // resize eM by using smear/reduce coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); // multiply with fp16 accumulation, then add to O. - coopmat PV = coopmat(0); + coopmat PV = coopmat(0); PV = coopMatMulAdd(P_A, V, PV); - O = eMdiag * O + coopmat(PV); + O = eMdiag * O + coopmat(PV); } // If there is split_k, then the split_k resolve shader does the final // division by L. Store the intermediate O value and per-row m and L values. if (p.k_num > 1) { - coopmat O_D = coopmat(O); + coopmat O_D = coopmat(O); - uint32_t o_offset = D * p.ne1 * (split_k_index + iq3 * p.k_num); + uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num); coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); - o_offset = D * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; + o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2; coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N); coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N); return; } - coopmat Ldiag; + coopmat Ldiag; // resize L by using smear/reduce coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce); @@ -255,18 +255,18 @@ void main() { O = Ldiag*O; - uint32_t o_offset = iq3*p.ne2*p.ne1*D; + uint32_t o_offset = iq3*p.ne2*p.ne1*HSV; - coopmat O_D = coopmat(O); + coopmat O_D = coopmat(O); if (p.gqa_ratio > 1) { coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); } else { tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); - tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, HSV); // permute dimensions tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); - coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, D), tensorViewPermute); + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute); } } From 7cec4cc83a11500493a5e0d19415e2817d7c950b Mon Sep 17 00:00:00 2001 From: lhez Date: Thu, 3 Jul 2025 11:22:24 -0700 Subject: [PATCH 0071/1348] opencl : broadcast for soft_max (llama/14510) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 51 +++++++++++++------ ggml/src/ggml-opencl/kernels/softmax_4_f16.cl | 35 +++++++++---- ggml/src/ggml-opencl/kernels/softmax_4_f32.cl | 35 +++++++++---- ggml/src/ggml-opencl/kernels/softmax_f16.cl | 35 +++++++++---- ggml/src/ggml-opencl/kernels/softmax_f32.cl | 35 +++++++++---- 5 files changed, 132 insertions(+), 59 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 9436e6ea9a0..2450100b43c 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -5763,19 +5763,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; - const int ne00 = src0 ? src0->ne[0] : 0; - const int ne01 = src0 ? src0->ne[1] : 0; - const int ne02 = src0 ? src0->ne[2] : 0; - const int ne03 = src0 ? src0->ne[3] : 0; + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_long nb01 = src0->nb[1]; + const cl_long nb02 = src0->nb[2]; + const cl_long nb03 = src0->nb[3]; + + const int ne12 = src1 ? src1->ne[2] : 0; + const int ne13 = src1 ? src1->ne[3] : 0; + + const cl_long nb11 = src1 ? src1->nb[1] : 0; + const cl_long nb12 = src1 ? src1->nb[2] : 0; + const cl_long nb13 = src1 ? src1->nb[3] : 0; + + const cl_long nb1 = dst->nb[1]; + const cl_long nb2 = dst->nb[2]; + const cl_long nb3 = dst->nb[3]; float scale, max_bias; memcpy(&scale, dst->op_params + 0, sizeof(float)); memcpy(&max_bias, dst->op_params + 1, sizeof(float)); - const int nrows_x = ggml_nrows(src0); - const int nrows_y = src0->ne[1]; - - const int n_head = nrows_x/nrows_y; + const int n_head = src0->ne[2]; const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); @@ -5820,13 +5832,22 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2)); size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; diff --git a/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl b/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl index 62c05369a87..a6d8ede6701 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl @@ -22,32 +22,45 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_soft_max_4_f16( - global float * src0, + global char * src0, ulong offset0, - global half * src1, + global char * src1, ulong offset1, - global float * dst, + global char * dst, ulong offsetd, int ne00, - int ne01, - int ne02, + ulong nb01, + ulong nb02, + ulong nb03, + int ne12, + int ne13, + ulong nb11, + ulong nb12, + ulong nb13, + ulong nb1, + ulong nb2, + ulong nb3, float scale, float max_bias, float m0, float m1, int n_head_log2 ) { - src0 = (global float *)((global char *)src0 + offset0); - src1 = (global half *)((global char *)src1 + offset1); - dst = (global float *)((global char *)dst + offsetd); + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; int i03 = get_group_id(2); int i02 = get_group_id(1); int i01 = get_group_id(0); - global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - global half4 * pmask = (global char *)src1 != (global char *)src0 ? (global half4 *)(src1 + i01*ne00) : 0; - global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + int i13 = i03%ne13; + int i12 = i02%ne12; + int i11 = i01; + + global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); + global half4 * pmask = src1 != src0 ? (global half4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; diff --git a/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl b/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl index d562774eaba..35b5573b46a 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl @@ -22,32 +22,45 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_soft_max_4( - global float * src0, + global char * src0, ulong offset0, - global float * src1, + global char * src1, ulong offset1, - global float * dst, + global char * dst, ulong offsetd, int ne00, - int ne01, - int ne02, + ulong nb01, + ulong nb02, + ulong nb03, + int ne12, + int ne13, + ulong nb11, + ulong nb12, + ulong nb13, + ulong nb1, + ulong nb2, + ulong nb3, float scale, float max_bias, float m0, float m1, int n_head_log2 ) { - src0 = (global float*)((global char*)src0 + offset0); - src1 = (global float*)((global char*)src1 + offset1); - dst = (global float*)((global char*)dst + offsetd); + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; int i03 = get_group_id(2); int i02 = get_group_id(1); int i01 = get_group_id(0); - global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0; - global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + int i13 = i03%ne13; + int i12 = i02%ne12; + int i11 = i01; + + global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); + global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; diff --git a/ggml/src/ggml-opencl/kernels/softmax_f16.cl b/ggml/src/ggml-opencl/kernels/softmax_f16.cl index d38d099671e..9d292b57465 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_f16.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_f16.cl @@ -22,32 +22,45 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_soft_max_f16( - global float * src0, + global char * src0, ulong offset0, - global half * src1, + global char * src1, ulong offset1, - global float * dst, + global char * dst, ulong offsetd, int ne00, - int ne01, - int ne02, + ulong nb01, + ulong nb02, + ulong nb03, + int ne12, + int ne13, + ulong nb11, + ulong nb12, + ulong nb13, + ulong nb1, + ulong nb2, + ulong nb3, float scale, float max_bias, float m0, float m1, int n_head_log2 ) { - src0 = (global float *)((global char *)src0 + offset0); - src1 = (global half *)((global char *)src1 + offset1); - dst = (global float *)((global char *)dst + offsetd); + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; int i03 = get_group_id(2); int i02 = get_group_id(1); int i01 = get_group_id(0); - global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - global half * pmask = (global char *)src1 != (global char *)src0 ? src1 + i01*ne00 : 0; - global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + int i13 = i03%ne13; + int i12 = i02%ne12; + int i11 = i01; + + global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); + global half * pmask = src1 != src0 ? (global half *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; diff --git a/ggml/src/ggml-opencl/kernels/softmax_f32.cl b/ggml/src/ggml-opencl/kernels/softmax_f32.cl index 001b587abe3..7c53dfbe5a2 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_f32.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_f32.cl @@ -22,32 +22,45 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_soft_max( - global float * src0, + global char * src0, ulong offset0, - global float * src1, + global char * src1, ulong offset1, - global float * dst, + global char * dst, ulong offsetd, int ne00, - int ne01, - int ne02, + ulong nb01, + ulong nb02, + ulong nb03, + int ne12, + int ne13, + ulong nb11, + ulong nb12, + ulong nb13, + ulong nb1, + ulong nb2, + ulong nb3, float scale, float max_bias, float m0, float m1, int n_head_log2 ) { - src0 = (global float*)((global char*)src0 + offset0); - src1 = (global float*)((global char*)src1 + offset1); - dst = (global float*)((global char*)dst + offsetd); + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; int i03 = get_group_id(2); int i02 = get_group_id(1); int i01 = get_group_id(0); - global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0; - global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + int i13 = i03%ne13; + int i12 = i02%ne12; + int i11 = i01; + + global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); + global float * pmask = src1 != src0 ? (global float *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; From e8138c51d2c85ead8263a36a783454dd3ae0fe15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 3 Jul 2025 23:07:22 +0200 Subject: [PATCH 0072/1348] ggml : implement GEGLU_ERF and GEGLU_QUICK ops (llama/14445) --- ggml/include/ggml.h | 28 ++ ggml/src/ggml-cpu/ggml-cpu.c | 2 + ggml/src/ggml-cpu/ops.cpp | 294 ++++++++++++++++++ ggml/src/ggml-cpu/vec.h | 40 +++ ggml/src/ggml-cuda/ggml-cuda.cu | 8 + ggml/src/ggml-cuda/unary.cu | 8 + ggml/src/ggml-cuda/unary.cuh | 4 + ggml/src/ggml-metal/ggml-metal.m | 12 + ggml/src/ggml-metal/ggml-metal.metal | 44 +++ ggml/src/ggml-opencl/ggml-opencl.cpp | 36 ++- ggml/src/ggml-opencl/kernels/glu.cl | 136 ++++++++ ggml/src/ggml-sycl/element_wise.cpp | 50 +++ ggml/src/ggml-sycl/element_wise.hpp | 2 + ggml/src/ggml-sycl/ggml-sycl.cpp | 8 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 16 + .../ggml-vulkan/vulkan-shaders/geglu_erf.comp | 27 ++ .../vulkan-shaders/geglu_quick.comp | 11 + .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 + ggml/src/ggml.c | 46 ++- 19 files changed, 767 insertions(+), 9 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d25bfcc9f45..74068f2be60 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -557,6 +557,8 @@ extern "C" { GGML_GLU_OP_REGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU, + GGML_GLU_OP_GEGLU_ERF, + GGML_GLU_OP_GEGLU_QUICK, GGML_GLU_OP_COUNT, }; @@ -1144,6 +1146,22 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_geglu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_erf_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_quick_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + // A: n columns, r rows, // B: n columns, r rows, GGML_API struct ggml_tensor * ggml_glu_split( @@ -1167,6 +1185,16 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_geglu_erf_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_geglu_quick_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 11ff228f07a..c5271b77572 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2172,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_GLU_OP_REGLU: case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: { n_tasks = n_threads; } break; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 0fb2c08b5bc..aaeee614ab9 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -3614,6 +3614,292 @@ static void ggml_compute_forward_swiglu( } } +// ggml_compute_forward_geglu_erf + +static void ggml_compute_forward_geglu_erf_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_erf_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_geglu_erf_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_erf_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_geglu_erf( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_geglu_erf_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_geglu_erf_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_geglu_quick + +static void ggml_compute_forward_geglu_quick_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_quick_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_geglu_quick_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_quick_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_geglu_quick( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_geglu_quick_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_geglu_quick_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -8779,6 +9065,14 @@ void ggml_compute_forward_glu( { ggml_compute_forward_swiglu(params, dst); } break; + case GGML_GLU_OP_GEGLU_ERF: + { + ggml_compute_forward_geglu_erf(params, dst); + } break; + case GGML_GLU_OP_GEGLU_QUICK: + { + ggml_compute_forward_geglu_quick(params, dst); + } break; default: { GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index c432c990818..1f5857a23e3 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -959,6 +959,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_ } } +inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) { + for (int i = 0; i < n; ++i) { + float xi = x[i]; + y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i]; + } +} + +inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + for (int i = 0; i < n; ++i) { + float xi = GGML_CPU_FP16_TO_FP32(x[i]); + float gi = GGML_CPU_FP16_TO_FP32(g[i]); + y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi); + } +} + +#ifdef GGML_GELU_QUICK_FP16 +inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i]; + } +} +#else +inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_quick_f32(x[i]) * g[i]; + } +} +#endif + +inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + float v = GGML_CPU_FP16_TO_FP32(g[i]); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v); + } +} + inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE ggml_float sum = 0.0; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1c04bba52e8..af5ad1ed52c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2314,6 +2314,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_GLU_OP_SWIGLU: ggml_cuda_op_swiglu(ctx, dst); break; + case GGML_GLU_OP_GEGLU_ERF: + ggml_cuda_op_geglu_erf(ctx, dst); + break; + case GGML_GLU_OP_GEGLU_QUICK: + ggml_cuda_op_geglu_quick(ctx, dst); + break; default: return false; } @@ -3116,6 +3122,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_GLU_OP_REGLU: case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: return ggml_is_contiguous_1(op->src[0]); default: return false; diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index ba3c0f13762..f9c7b83c40d 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -285,6 +285,14 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary_gated(ctx, dst); } +void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + +void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + /* silu_back */ static __device__ __forceinline__ float op_silu_back(float grad, float x) { diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 9094f1d0bad..289d690e5cf 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -64,3 +64,7 @@ void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 5e5467e88a1..40fc315e82f 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -530,6 +530,8 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_REGLU, GGML_METAL_KERNEL_TYPE_GEGLU, GGML_METAL_KERNEL_TYPE_SWIGLU, + GGML_METAL_KERNEL_TYPE_GEGLU_ERF, + GGML_METAL_KERNEL_TYPE_GEGLU_QUICK, GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_MEAN, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, @@ -1510,6 +1512,8 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REGLU, reglu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU, geglu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU, swiglu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_ERF, geglu_erf, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_QUICK, geglu_quick, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); @@ -1693,6 +1697,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_GLU_OP_REGLU: case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; default: return false; @@ -2456,6 +2462,12 @@ static bool ggml_metal_encode_node( case GGML_GLU_OP_SWIGLU: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SWIGLU].pipeline; break; + case GGML_GLU_OP_GEGLU_ERF: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU_ERF].pipeline; + break; + case GGML_GLU_OP_GEGLU_QUICK: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU_QUICK].pipeline; + break; default: GGML_ABORT("fatal error"); } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ebde005f33e..dc7a0af2769 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1258,6 +1258,50 @@ kernel void kernel_swiglu( } } +kernel void kernel_geglu_erf( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu_erf = 0.5f*x0*(1.0f+erf_approx(x0*SQRT_2_INV)); + + dst_row[i0] = gelu_erf*x1; + } +} + +kernel void kernel_geglu_quick( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0))); + + dst_row[i0] = gelu_quick*x1; + } +} + template kernel void kernel_sum_rows( constant ggml_metal_kargs_sum_rows & args, diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 2450100b43c..970dd3f67f5 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -402,8 +402,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_relu; cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16; cl_kernel kernel_clamp; - cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, - kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16; + cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick, + kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16; cl_kernel kernel_norm; cl_kernel kernel_rms_norm; cl_kernel kernel_group_norm; @@ -753,12 +753,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve backend_ctx->program_glu = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); - CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err)); - CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err)); - CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err)); - CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err)); - CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err)); - CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err)); + CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err)); + CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err)); GGML_LOG_CONT("."); } @@ -2277,6 +2281,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_REGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); default: return false; @@ -6254,6 +6260,20 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_swiglu_f16; } break; + case GGML_GLU_OP_GEGLU_ERF: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_geglu_erf; + } else { + kernel = backend_ctx->kernel_geglu_erf_f16; + } + break; + case GGML_GLU_OP_GEGLU_QUICK: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_geglu_quick; + } else { + kernel = backend_ctx->kernel_geglu_quick_f16; + } + break; default: GGML_ABORT("Unsupported glu op"); } diff --git a/ggml/src/ggml-opencl/kernels/glu.cl b/ggml/src/ggml-opencl/kernels/glu.cl index ba861d8b18f..7cca16e6a9e 100644 --- a/ggml/src/ggml-opencl/kernels/glu.cl +++ b/ggml/src/ggml-opencl/kernels/glu.cl @@ -1,7 +1,9 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define GELU_COEF_A 0.044715f +#define GELU_QUICK_COEF -1.702f #define SQRT_2_OVER_PI 0.79788456080286535587989211986876f +#define SQRT_2_INV 0.70710678118654752440084436210484f //------------------------------------------------------------------------------ // geglu @@ -199,3 +201,137 @@ kernel void kernel_swiglu_f16( dst_row[i0] = silu*x1; } } + +//------------------------------------------------------------------------------ +// geglu_erf +//------------------------------------------------------------------------------ +kernel void kernel_geglu_erf( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu_erf = 0.5f*x0*(1.0f + erf(x0*SQRT_2_INV)); + + dst_row[i0] = gelu_erf*x1; + } +} + +kernel void kernel_geglu_erf_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + const half gelu_erf = 0.5f*x0*(1.0f + erf(x0*SQRT_2_INV)); + + dst_row[i0] = gelu_erf*x1; + } +} + +//------------------------------------------------------------------------------ +// geglu_quick +//------------------------------------------------------------------------------ +kernel void kernel_geglu_quick( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu_quick = x0*(1.0f/(1.0f + exp(GELU_QUICK_COEF*x0))); + + dst_row[i0] = gelu_quick*x1; + } +} + +kernel void kernel_geglu_quick_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + const half gelu_quick = x0*(1.0f/(1.0f + exp(GELU_QUICK_COEF*x0))); + + dst_row[i0] = gelu_quick*x1; + } +} diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index c7788bdb6bf..0363b06a3ec 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -383,6 +383,24 @@ static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint6 } } +template +static void gated_op_fused_geglu_erf(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu_erf(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_geglu_quick(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu_quick(x[j0]) * g[j1]; + } +} + namespace ggml_sycl_detail { static void acc_f32_sycl(const float *x, const float *y, float *dst, const int n_elements, const int ne10, const int ne11, @@ -978,6 +996,28 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten }); } +static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); @@ -1118,3 +1158,13 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_swiglu(ctx, dst); } + +void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_geglu_erf(ctx, dst); +} + +void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_geglu_quick(ctx, dst); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 86068b10129..50749e87d78 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -80,5 +80,7 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b063a698ded..21c81e99a19 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3687,6 +3687,12 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_GLU_OP_SWIGLU: ggml_sycl_swiglu(ctx, dst); break; + case GGML_GLU_OP_GEGLU_ERF: + ggml_sycl_geglu_erf(ctx, dst); + break; + case GGML_GLU_OP_GEGLU_QUICK: + ggml_sycl_geglu_quick(ctx, dst); + break; default: return false; } @@ -4232,6 +4238,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_GLU_OP_REGLU: case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: return ggml_is_contiguous_1(op->src[0]); default: return false; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c0032cba218..22a34a43356 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -456,6 +456,8 @@ struct vk_device_struct { vk_pipeline pipeline_geglu[2]; vk_pipeline pipeline_reglu[2]; vk_pipeline pipeline_swiglu[2]; + vk_pipeline pipeline_geglu_erf[2]; + vk_pipeline pipeline_geglu_quick[2]; vk_pipeline pipeline_leaky_relu_f32; vk_pipeline pipeline_silu_back_f32; @@ -2821,6 +2823,8 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_GLU(geglu) CREATE_GLU(reglu) CREATE_GLU(swiglu) + CREATE_GLU(geglu_erf) + CREATE_GLU(geglu_quick) #undef CREATE_GLU ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -6575,6 +6579,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16]; case GGML_GLU_OP_SWIGLU: return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_GEGLU_ERF: + return ctx->device->pipeline_geglu_erf[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_GEGLU_QUICK: + return ctx->device->pipeline_geglu_quick[dst->type == GGML_TYPE_F16]; default: break; } @@ -8919,6 +8927,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_REGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: break; default: return false; @@ -9166,6 +9176,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_REGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun); break; default: @@ -9384,6 +9396,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_REGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: buf = tensor->buffer; break; default: @@ -10194,6 +10208,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_REGLU: case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp new file mode 100644 index 00000000000..cbd4cb36bff --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp @@ -0,0 +1,27 @@ +#version 450 + +#include "glu_head.comp" + +// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation +// ref: https://www.johndcook.com/blog/python_erf/ +const float p_erf = 0.3275911f; +const float a1_erf = 0.254829592f; +const float a2_erf = -0.284496736f; +const float a3_erf = 1.421413741f; +const float a4_erf = -1.453152027f; +const float a5_erf = 1.061405429f; + +const float SQRT_2_INV = 0.70710678118654752440084436210484f; + +float op(float a, float b) { + const float a_div_sqr2 = a * SQRT_2_INV; + const float sign_x = sign(a_div_sqr2); + const float x = abs(a_div_sqr2); + const float t = 1.0f / (1.0f + p_erf * x); + const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + const float erf_approx = sign_x * y; + + return 0.5f * a * (1.0f + erf_approx) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp b/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp new file mode 100644 index 00000000000..3a2a6897bfe --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp @@ -0,0 +1,11 @@ +#version 450 + +#include "glu_head.comp" + +const float GELU_QUICK_COEF = -1.702f; + +float op(float a, float b) { + return a * (1.0f / (1.0f + exp(GELU_QUICK_COEF * a))) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 297a2a77119..2698522ed71 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -593,6 +593,10 @@ void process_shaders() { string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("geglu_erf_f16", "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("geglu_erf_f32", "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 68768842904..e2d9d616a59 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1132,9 +1132,11 @@ static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = { "REGLU", "GEGLU", "SWIGLU", + "GEGLU_ERF", + "GEGLU_QUICK", }; -static_assert(GGML_GLU_OP_COUNT == 3, "GGML_GLU_OP_COUNT != 3"); +static_assert(GGML_GLU_OP_COUNT == 5, "GGML_GLU_OP_COUNT != 5"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -2760,6 +2762,48 @@ struct ggml_tensor * ggml_swiglu_split( return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false); } +// ggml_geglu_erf + +struct ggml_tensor * ggml_geglu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, false); +} + +struct ggml_tensor * ggml_geglu_erf_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, true); +} + +struct ggml_tensor * ggml_geglu_erf_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_ERF, false); +} + +// ggml_geglu_quick + +struct ggml_tensor * ggml_geglu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, false); +} + +struct ggml_tensor * ggml_geglu_quick_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, true); +} + +struct ggml_tensor * ggml_geglu_quick_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false); +} + // ggml_norm static struct ggml_tensor * ggml_norm_impl( From af304ef080542469528ff18e07cf5cf6471edb6b Mon Sep 17 00:00:00 2001 From: luyhcsu <110711054+luyhcsu@users.noreply.github.com> Date: Fri, 4 Jul 2025 11:50:07 +0800 Subject: [PATCH 0073/1348] CANN: Replace aclrtMemsetSync with aclnnInplaceZero operator (llama/14002) Co-authored-by: luyuhong --- ggml/src/ggml-cann/aclnn_ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 69483de8f36..4d5c2c18252 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, nb[i] = nb[i - 1] * ne[i - 1]; } - ggml_cann_async_memset(ctx, buffer, n_bytes, 0); aclTensor* zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero); return zero; + GGML_UNUSED(n_bytes); } /** From 10d0d28f7c79d57c398ed786a2a6a84b868c3622 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 4 Jul 2025 19:19:09 +0300 Subject: [PATCH 0074/1348] metal : disable fast math in all quantize kernels (llama/14528) ggml-ci --- ggml/src/ggml-metal/ggml-metal.metal | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index dc7a0af2769..22240bab472 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r } void quantize_q4_0(device const float * src, device block_q4_0 & dst) { +#pragma METAL fp math_mode(safe) float amax = 0.0f; // absolute max float max = 0.0f; @@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) { } void quantize_q5_0(device const float * src, device block_q5_0 & dst) { +#pragma METAL fp math_mode(safe) float amax = 0.0f; // absolute max float max = 0.0f; @@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re } void quantize_q8_0(device const float * src, device block_q8_0 & dst) { +#pragma METAL fp math_mode(safe) float amax = 0.0f; // absolute max for (int j = 0; j < QK8_0; j++) { From df5af1dc75753644b63f7f22d5c230c23d8c0593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 5 Jul 2025 08:24:56 +0200 Subject: [PATCH 0075/1348] opencl: add GELU_ERF (llama/14476) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 48 ++++++++++++++++++++++++++++ ggml/src/ggml-opencl/kernels/gelu.cl | 27 ++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 970dd3f67f5..a9fc0390387 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -398,6 +398,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_scale; cl_kernel kernel_silu, kernel_silu_4; cl_kernel kernel_gelu, kernel_gelu_4; + cl_kernel kernel_gelu_erf, kernel_gelu_erf_4; cl_kernel kernel_gelu_quick, kernel_gelu_quick_4; cl_kernel kernel_relu; cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16; @@ -736,6 +737,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err)); CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err)); + CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err)); + CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err)); CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err)); CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err)); GGML_LOG_CONT("."); @@ -2266,6 +2269,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_UNARY_OP_SIGMOID: @@ -3870,6 +3874,44 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + UNUSED(src1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + cl_kernel kernel; + + int n = ggml_nelements(dst); + + if (n % 4 == 0) { + kernel = backend_ctx->kernel_gelu_erf_4; + n /= 4; + } else { + kernel = backend_ctx->kernel_gelu_erf; + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + + size_t global_work_size[] = {(size_t)n, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -6388,6 +6430,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_gelu; break; + case GGML_UNARY_OP_GELU_ERF: + if (!any_on_device) { + return false; + } + func = ggml_cl_gelu_erf; + break; case GGML_UNARY_OP_GELU_QUICK: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/gelu.cl b/ggml/src/ggml-opencl/kernels/gelu.cl index 71c310cc9f9..1ab426c7744 100644 --- a/ggml/src/ggml-opencl/kernels/gelu.cl +++ b/ggml/src/ggml-opencl/kernels/gelu.cl @@ -6,6 +6,7 @@ #define GELU_COEF_A 0.044715f #define GELU_QUICK_COEF -1.702f #define SQRT_2_OVER_PI 0.79788456080286535587989211986876f +#define SQRT_2_INV 0.70710678118654752440084436210484f kernel void kernel_gelu( global float * src0, @@ -35,6 +36,32 @@ kernel void kernel_gelu_4( dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } +kernel void kernel_gelu_erf( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd +) { + src0 = (global float*)((global char*)src0 + offset0); + dst = (global float*)((global char*)dst + offsetd); + + float x = src0[get_global_id(0)]; + dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV)); +} + +kernel void kernel_gelu_erf_4( + global float4 * src0, + ulong offset0, + global float4 * dst, + ulong offsetd +) { + src0 = (global float4*)((global char*)src0 + offset0); + dst = (global float4*)((global char*)dst + offsetd); + + float4 x = src0[get_global_id(0)]; + dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV)); +} + kernel void kernel_gelu_quick( global float * src0, ulong offset0, From 3aebb8d5d3c7e67b21c31577d111213a1d116c99 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 5 Jul 2025 02:26:04 -0500 Subject: [PATCH 0076/1348] vulkan: Handle updated FA dim2/3 definition (llama/14518) * vulkan: Handle updated FA dim2/3 definition Pack mask boolean and n_head_log2 into a single dword to keep the push constant block under the 128B limit. * handle null mask for gqa * allow gqa with dim3>1 --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 ++++++++----------- .../vulkan-shaders/flash_attn.comp | 6 +++--- .../vulkan-shaders/flash_attn_base.comp | 13 +++++++++---- .../vulkan-shaders/flash_attn_cm1.comp | 6 +++--- .../vulkan-shaders/flash_attn_cm2.comp | 6 +++--- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 22a34a43356..e8df00d4183 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -636,6 +636,7 @@ struct vk_flash_attn_push_constants { uint32_t nev3; uint32_t nem1; uint32_t nem2; + uint32_t nem3; uint32_t nb01; uint32_t nb02; @@ -651,8 +652,7 @@ struct vk_flash_attn_push_constants { float max_bias; float logit_softcap; - uint32_t mask; - uint32_t n_head_log2; + uint32_t mask_n_head_log2; float m0; float m1; @@ -6111,6 +6111,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const uint32_t nem1 = mask ? mask->ne[1] : 0; const uint32_t nem2 = mask ? mask->ne[2] : 0; + const uint32_t nem3 = mask ? mask->ne[3] : 0; const uint32_t HSK = nek0; const uint32_t HSV = nev0; @@ -6178,7 +6179,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa && - qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) { + qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) { // grouped query attention - make the N dimension equal to gqa_ratio, reduce // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 // and change addressing calculations to index Q's dimension 2. @@ -6348,17 +6349,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } } + uint32_t mask_n_head_log2 = ((mask != nullptr) << 16) | n_head_log2; + const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, - nem1, nem2, + nem1, nem2, nem3, q_stride, (uint32_t)nbq2, (uint32_t)nbq3, k_stride, (uint32_t)nbk2, (uint32_t)nbk3, v_stride, (uint32_t)nbv2, (uint32_t)nbv3, scale, max_bias, logit_softcap, - mask != nullptr, n_head_log2, m0, m1, + mask_n_head_log2, m0, m1, gqa_ratio, split_kv, split_k }; ggml_vk_sync_buffers(subctx); @@ -10303,12 +10306,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { return false; } - // TODO: support broadcast - // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14449, but - // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505 - if (op->src[0]->ne[3] != 1 || (op->src[3] && op->src[3]->ne[2] != 1)) { - return false; - } // It's straightforward to support different K/V dequant, but would // significantly increase the number of pipelines if (op->src[1]->type != op->src[2]->type) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 788a5e065d1..45c6e7736ac 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -101,8 +101,8 @@ void main() { uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2; #endif uint32_t m_offset = 0; - if (p.nem2 != 1) { - m_offset = (iq3 % p.nem2) * p.nem1 * KV; + if (p.nem2 != 1 || p.nem3 != 1) { + m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV; } [[dont_unroll]] @@ -149,7 +149,7 @@ void main() { } } - if (p.mask != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) % Bc; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp index 6609f0bad3d..7defe72b403 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -25,6 +25,7 @@ layout (push_constant) uniform parameter { uint32_t nev3; uint32_t nem1; uint32_t nem2; + uint32_t nem3; uint32_t nb01; uint32_t nb02; @@ -40,8 +41,7 @@ layout (push_constant) uniform parameter { float max_bias; float logit_softcap; - uint32_t mask; - uint32_t n_head_log2; + uint32_t mask_n_head_log2; float m0; float m1; @@ -50,6 +50,9 @@ layout (push_constant) uniform parameter { uint32_t k_num; } p; +#define MASK_ENABLE_BIT (1<<16) +#define N_LOG2_MASK 0xFFFF + layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; #if defined(A_TYPE_PACKED16) @@ -100,8 +103,10 @@ ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const i { const uint32_t h = iq2 + (r % p.gqa_ratio); - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + uint32_t n_head_log2 = p.mask_n_head_log2 & N_LOG2_MASK; + + const ACC_TYPE base = ACC_TYPE(h < n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1); return ACC_TYPE(pow(base, ACC_TYPE(exph))); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index e74e2fa9346..486735fe8b0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -126,8 +126,8 @@ void main() { uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2; #endif uint32_t m_offset = 0; - if (p.nem2 != 1) { - m_offset = (iq3 % p.nem2) * p.nem1 * KV; + if (p.nem2 != 1 || p.nem3 != 1) { + m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV; } [[dont_unroll]] @@ -182,7 +182,7 @@ void main() { barrier(); } - if (p.mask != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) % Bc; uint32_t r = (idx + tid) / Bc; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 8792d5195e4..274f48fcabd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -131,8 +131,8 @@ void main() { } uint32_t m_offset = 0; - if (p.nem2 != 1) { - m_offset = (iq3 % p.nem2) * p.nem1 * KV * 2 /*sizeof(float16_t)*/; + if (p.nem2 != 1 || p.nem3 != 1) { + m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/; } [[dont_unroll]] @@ -153,7 +153,7 @@ void main() { } } - if (p.mask != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp); tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1); From 9d0c408260f597fddb1817fc0f512750606b75fe Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 6 Jul 2025 03:08:16 -0500 Subject: [PATCH 0077/1348] vulkan: fix rms_norm+mul fusion (llama/14545) The fused operation was grabbing the epsilon value from the wrong place. Add an env var to disable fusion. Add some missing checks for supported shapes/types. Handle fused rms_norm+mul in check_results. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 106 ++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e8df00d4183..7b8faf17879 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -501,6 +501,8 @@ struct vk_device_struct { ggml_backend_buffer_type buffer_type; + bool disable_fusion; + #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; #endif @@ -1091,8 +1093,8 @@ static size_t vk_skip_checks; static size_t vk_output_tensor; static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_tensor * tensor); -static void ggml_vk_check_results_1(ggml_tensor * tensor); +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); #endif typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); @@ -3507,6 +3509,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->idx = idx; + device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr; + return device; } @@ -7654,8 +7658,7 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { - float * op_params = (float *)dst->op_params; +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -8885,7 +8888,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready); +static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready); // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. @@ -9146,9 +9149,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr // fused rms_norm + mul ggml_tensor *mul = cgraph->nodes[node_idx + 1]; ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0]; - ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, dryrun); + ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params, dryrun); } else { - ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, dryrun); + ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params, dryrun); } break; case GGML_OP_RMS_NORM_BACK: @@ -9308,7 +9311,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr ctx->compute_ctx.reset(); - bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready); + bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, false, almost_ready); if (!ok) { if (node->op == GGML_OP_UNARY) { std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; @@ -9323,7 +9326,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return true; } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) { +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) { + GGML_UNUSED(cgraph); ggml_backend_buffer * buf = nullptr; switch (tensor->op) { @@ -9433,7 +9437,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(tensor); + ggml_vk_check_results_0(ctx, cgraph, tensor_idx); use_fence = true; #endif @@ -9453,7 +9457,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * ggml_vk_wait_for_fence(ctx); } #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_1(tensor); + ggml_vk_check_results_1(ctx, cgraph, tensor_idx); #endif } @@ -9900,6 +9904,37 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } +static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } + + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) { + // additional constraints specific to this fusion + const ggml_tensor *rms_norm = cgraph->nodes[node_idx]; + const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + + GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); + // rms_norm only supports f32 + if (mul->src[0]->type != GGML_TYPE_F32 || + mul->src[1]->type != GGML_TYPE_F32 || + mul->type != GGML_TYPE_F32) { + return false; + } + // if rms_norm is the B operand, then we don't handle broadcast + if (rms_norm == mul->src[1] && + mul->src[0]->ne[1] != rms_norm->ne[1]) { + return false; + } + // rms_norm shader assumes contiguous rows + if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { + return false; + } + } + return true; +} + static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -9913,7 +9948,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 1; } ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); @@ -9983,7 +10018,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } - if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 1; } @@ -10760,11 +10795,21 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_tensor * tensor) { +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { + ggml_tensor * tensor = cgraph->nodes[tensor_idx]; if (tensor->op == GGML_OP_TRANSPOSE) { return; } + bool fused_rms_norm_mul = false; + int rms_norm_idx = -1; + if (ctx->num_additional_fused_ops == 1 && + tensor->op == GGML_OP_RMS_NORM && + cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) { + fused_rms_norm_mul = true; + tensor = cgraph->nodes[tensor_idx + 1]; + } + check_counter++; if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { return; @@ -10792,6 +10837,15 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { for (int i = 0; i < 6; i++) { ggml_tensor * srci = tensor->src[i]; + if (fused_rms_norm_mul) { + rms_norm_idx = tensor->src[0]->op == GGML_OP_RMS_NORM ? 0 : 1; + ggml_tensor *rms_norm = tensor->src[rms_norm_idx]; + switch (i) { + case 0: srci = rms_norm->src[0]; break; + case 1: srci = tensor->src[1 - rms_norm_idx]; break; + default: continue; + } + } if (srci == nullptr) { continue; } @@ -10849,7 +10903,12 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_SUB) { tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_MUL) { - tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + if (fused_rms_norm_mul) { + tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params); + tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]); + } else { + tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + } } else if (tensor->op == GGML_OP_DIV) { tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_CONCAT) { @@ -11040,10 +11099,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { GGML_ABORT("fatal error"); } - ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); - ggml_build_forward_expand(cgraph, tensor_clone); + ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx); + ggml_build_forward_expand(cgraph_cpu, tensor_clone); - ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); + ggml_graph_compute_with_ctx(ggml_ctx, cgraph_cpu, 8); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { ggml_vk_print_tensor(tensor_clone, "tensor_clone"); @@ -11066,10 +11125,19 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")"); } -static void ggml_vk_check_results_1(ggml_tensor * tensor) { +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { + ggml_tensor * tensor = cgraph->nodes[tensor_idx]; if (tensor->op == GGML_OP_TRANSPOSE) { return; } + bool fused_rms_norm_mul = false; + if (ctx->num_additional_fused_ops == 1 && + tensor->op == GGML_OP_RMS_NORM && + cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) { + fused_rms_norm_mul = true; + tensor = cgraph->nodes[tensor_idx + 1]; + } + if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { return; } From e753b9a95274ef7bd5bc6d8709a8f560a3de9f8a Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 6 Jul 2025 10:29:36 +0000 Subject: [PATCH 0078/1348] vulkan: increase LOAD_VEC_A to 8 (IQ1/IQ2) or 4 (IQ3) (llama/14485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit taken from remyoudompheng's PR https://github.com/ggml-org/llama.cpp/pull/12260 Co-authored-by: Rémy Oudompheng --- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 152 ++++++++++-------- .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 +- 2 files changed, 83 insertions(+), 73 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 26163b167c7..888ce79f6ec 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -500,10 +500,9 @@ void main() { const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx % 128) / 4; - const int i8 = 2 * int(idx % 4); + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 32; const float d = float(data_a[ib].d); const uint qh = data_a[ib].qh[ib32]; @@ -512,22 +511,16 @@ void main() { const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]); - const ivec2 gvec = ivec2( - bitfieldExtract(grid, 2 * (i8), 2), - bitfieldExtract(grid, 2 * (i8 + 1), 2) - ); - const vec2 v = dl * (vec2(gvec) + delta); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + [[unroll]] for (int k = 0; k < 8; ++k) { + buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); + } #elif defined(DATA_A_IQ1_M) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib8 = (idx % 128) / 4; + const uint ib = idx / 32; // 8 values per idx + const uint ib8 = idx % 32; const uint ib16 = ib8 / 2; - const int i8 = 2 * int(idx % 4); const uint16_t[4] scales = data_a[ib].scales; const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; @@ -538,21 +531,17 @@ void main() { const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1); const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA; const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); - const ivec2 gvec = ivec2( - bitfieldExtract(grid, 2 * (i8), 2), - bitfieldExtract(grid, 2 * (i8 + 1), 2) - ); - const vec2 v = dl * (vec2(gvec) + delta); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + [[unroll]] for (int k = 0; k < 8; ++k) { + buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); + } #elif defined(DATA_A_IQ2_XXS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx / 4) % 4; + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 4; const float d = float(data_a[ib].d); const uint qs = data_a[ib].qs[8 * ib32 + ib8]; @@ -562,63 +551,81 @@ void main() { data_a[ib].qs[8*ib32 + 6], data_a[ib].qs[8*ib32 + 7] )); - const float db = d * 0.25 * (0.5 + (signs >> 28)); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28))); const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq2xxs_grid[qs][(idx % 4) / 2] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + const uint sign = sign7 | (bitCount(sign7) << 7); + const uvec2 grid = iq2xxs_grid[qs]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); + + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); #elif defined(DATA_A_IQ2_XS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx / 4) % 4; // 0..3 + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 4; // 0..3 const float d = float(data_a[ib].d); const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; - const float db = d * 0.25 * (0.5 + scale); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); const uint qs = data_a[ib].qs[4 * ib32 + ib8]; const uint sign7 = qs >> 9; - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq2xs_grid[qs & 511][(idx % 4) / 2] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + const uint sign = sign7 | (bitCount(sign7) << 7); + const uvec2 grid = iq2xs_grid[qs & 511]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); + + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); #elif defined(DATA_A_IQ2_S) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib8 = (idx % 128) / 4; // 0..31 - const uint ib32 = ib8 / 4; // 0..7 + const uint ib = idx / 32; // 8 values per idx + const uint ib8 = idx % 32; // 0..31 + const uint ib32 = ib8 / 4; // 0..7 const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; const uint qs = data_a[ib].qs[ib8]; const uint qh = data_a[ib].qh[ib32]; const uint qhshift = 2 * (ib8 % 4); - const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4)); + const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8]; const float d = float(data_a[ib].d); - const float db = d * 0.25 * (0.5 + scale); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1]; - const vec2 v = db * vec2(sign01) * vec2(unpack8(uint32_t(grid)).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); + const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); + + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); #elif defined(DATA_A_IQ3_XXS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint iqs = (idx % 128) / 2; // 0..63 + const uint ib = idx / 64; // 4 values per idx + const uint iqs = idx % 64; // 0..63 const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values const float d = float(data_a[ib].d); @@ -631,33 +638,36 @@ void main() { )); const float db = d * 0.5 * (0.5 + (signs >> 28)); const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2)); + const uint grid = iq3xxs_grid[qs]; + const vec4 v = db * vec4(unpack8(grid)); + + buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); #elif defined(DATA_A_IQ3_S) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint iqs = (idx % 128) / 2; // 0..63 + const uint ib = idx / 64; // 4 values per idx + const uint iqs = idx % 64; // 0..63 const uint iqh = iqs / 8; const float d = float(data_a[ib].d); const uint qs = data_a[ib].qs[iqs]; const uint qh = data_a[ib].qh[iqh]; - const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (2 * (idx % 4))); + const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2))); const uint scale = data_a[ib].scales[iqs / 16]; const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); - const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)]; + const vec4 v = db * vec4(unpack8(grid)); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); #elif defined(DATA_A_IQ4_XS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 2698522ed71..30f78fabb3c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -360,9 +360,9 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool for (const auto& tname : type_names) { std::string load_vec_quant = "2"; - if ((tname == "q4_0") || (tname == "q4_1")) + if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) load_vec_quant = "8"; - else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl")) + else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl")) load_vec_quant = "4"; if (tname == "bf16") { From 8e545f466c10bc115fa913d261945b2de489ca10 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 7 Jul 2025 21:45:43 +0800 Subject: [PATCH 0079/1348] CUDA: add bf16 and i32 to getrows (llama/14529) --- ggml/src/ggml-cuda/getrows.cu | 8 ++++++++ ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++ 2 files changed, 10 insertions(+) diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 963e4d03dd7..f77b2629a19 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -168,6 +168,10 @@ static void ggml_cuda_get_rows_switch_src0_type( get_rows_cuda_float((const float *) src0_d, src1_d, dst_d, ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; + case GGML_TYPE_I32: + get_rows_cuda_float((const int32_t *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; case GGML_TYPE_BF16: get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d, ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); @@ -210,6 +214,10 @@ void get_rows_cuda( ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d, ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; + case GGML_TYPE_I32: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (int32_t *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; case GGML_TYPE_F16: ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d, ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index af5ad1ed52c..b6b7960f121 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3200,6 +3200,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g switch (op->src[0]->type) { case GGML_TYPE_F16: case GGML_TYPE_F32: + case GGML_TYPE_BF16: + case GGML_TYPE_I32: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: From 7e9c6bbab2b77c8de919b696231755f52a07c63f Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Tue, 8 Jul 2025 07:58:30 +0800 Subject: [PATCH 0080/1348] musa: fix build warnings (unused variable) (llama/14561) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/common.cuh | 23 +++++++++++++---------- ggml/src/ggml-cuda/fattn-tile-f32.cu | 16 ++++++++-------- ggml/src/ggml-cuda/fattn-vec-f32.cuh | 16 +++++++++------- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 954f74d408f..1a2708ec9df 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -176,17 +176,20 @@ static const char * cu_get_error_str(CUresult err) { #endif #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) -#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \ - do { \ - static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \ - const int id = ggml_cuda_get_device(); \ - if (!shared_memory_limit_raised[id]) { \ - CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \ - shared_memory_limit_raised[id] = true; \ - } \ - } while (0) +# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \ + do { \ + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \ + const int id = ggml_cuda_get_device(); \ + if (!shared_memory_limit_raised[id]) { \ + CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \ + shared_memory_limit_raised[id] = true; \ + } \ + } while (0) #else -#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) do {} while (0) +# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \ + do { \ + GGML_UNUSED(nbytes); \ + } while (0) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 124d5d3e891..908c76dbdd2 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -299,14 +299,14 @@ static __global__ void flash_attn_tile_ext_f32( GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); - GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); - GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); - GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); + GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); + GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); + GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index c22baf41764..b2f1724c955 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -337,13 +337,15 @@ static __global__ void flash_attn_vec_ext_f32( GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); - GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); - GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); - GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); - GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); + GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); + GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); + GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); + GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } From 98e7beac6c8b6258dc116f4c9ae8af04ffc6dfdd Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 8 Jul 2025 10:11:18 +0800 Subject: [PATCH 0081/1348] CUDA: add bilinear interpolation for upscale (llama/14563) --- ggml/include/ggml.h | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 1 - ggml/src/ggml-cuda/upscale.cu | 98 +++++++++++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 8 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 74068f2be60..8b0bec8738d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -495,7 +495,7 @@ extern "C" { GGML_OP_POOL_1D, GGML_OP_POOL_2D, GGML_OP_POOL_2D_BACK, - GGML_OP_UPSCALE, // nearest interpolate + GGML_OP_UPSCALE, GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, GGML_OP_ROLL, diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b6b7960f121..da1e8f8f4e4 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3375,7 +3375,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]); case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 524e9795742..ef48aa5f97b 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -22,17 +22,88 @@ static __global__ void upscale_f32(const float * x, float * dst, dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) ); } +static __global__ void upscale_f32_bilinear(const float * x, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00_src, const int ne01_src, + const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, + const float sf0, const float sf1, const float sf2, const float sf3, + const float pixel_offset) { + const int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + const int i10_dst = index % ne10_dst; + const int i11_dst = (index / ne10_dst) % ne11_dst; + const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + const int i02_src = (int)(i12_dst / sf2); + const int i03_src = (int)(i13_dst / sf3); + + const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; + int y0_src = (int)floorf(y_src_f); + int y1_src = y0_src + 1; + + y0_src = max(0, min(y0_src, ne01_src - 1)); + y1_src = max(0, min(y1_src, ne01_src - 1)); + + float dy = y_src_f - (float)y0_src; + dy = max(0.0f, min(dy, 1.0f)); + + float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset; + int x0_src = (int)floorf(x_src_f); + int x1_src = x0_src + 1; + + x0_src = max(0, min(x0_src, ne00_src - 1)); + x1_src = max(0, min(x1_src, ne00_src - 1)); + + float dx = x_src_f - (float)x0_src; + dx = max(0.0f, min(dx, 1.0f)); + + const float * p_a = (const float *)((const char *)x + (int64_t)x0_src * nb00 + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03); + const float * p_b = (const float *)((const char *)x + (int64_t)x1_src * nb00 + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03); + const float * p_c = (const float *)((const char *)x + (int64_t)x0_src * nb00 + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03); + const float * p_d = (const float *)((const char *)x + (int64_t)x1_src * nb00 + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03); + + const float val_a = *p_a; + const float val_b = *p_b; + const float val_c = *p_c; + const float val_d = *p_d; + + float result = val_a * (1.0f - dx) * (1.0f - dy) + + val_b * dx * (1.0f - dy) + + val_c * (1.0f - dx) * dy + + val_d * dx * dy; + + dst[index] = result; +} + static void upscale_f32_cuda(const float * x, float * dst, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int ne13, const float sf0, const float sf1, const float sf2, const float sf3, cudaStream_t stream) { - int dst_size = ne10 * ne11 * ne12 * ne13; - int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; + const int64_t dst_size = ne10 * ne11 * ne12 * ne13; + const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; upscale_f32<<>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3); } +static void upscale_f32_bilinear_cuda(const float * x, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00_src, const int ne01_src, + const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, + const float sf0, const float sf1, const float sf2, const float sf3, + const float pixel_offset, cudaStream_t stream) { + const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; + + upscale_f32_bilinear<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); +} + void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; @@ -42,10 +113,25 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float sf0 = (float)dst->ne[0]/src0->ne[0]; - const float sf1 = (float)dst->ne[1]/src0->ne[1]; - const float sf2 = (float)dst->ne[2]/src0->ne[2]; + const int mode_flags = dst->op_params[0]; + const ggml_scale_mode mode = (ggml_scale_mode)(mode_flags & 0xFF); + + float sf0 = (float)dst->ne[0]/src0->ne[0]; + float sf1 = (float)dst->ne[1]/src0->ne[1]; + float sf2 = (float)dst->ne[2]/src0->ne[2]; const float sf3 = (float)dst->ne[3]/src0->ne[3]; - upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); + if (mode == GGML_SCALE_MODE_NEAREST) { + upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); + } else if (mode == GGML_SCALE_MODE_BILINEAR) { + float pixel_offset = 0.5f; + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1); + sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1); + pixel_offset = 0.0f; + } + upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + sf0, sf1, sf2, sf3, pixel_offset, stream); + } } From c3942b3db62d77344fe4fb13fd8f00db8e7c314d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 8 Jul 2025 10:15:21 +0300 Subject: [PATCH 0082/1348] cuda : fix rope with partial rotation and non-cont src (llama/14580) * cuda : fix rope non-cont ggml-ci * cont : fix multi-rope + add test ggml-ci * sycl : try fix ggml-ci * cont : fix sycl + clean-up cuda ggml-ci --- ggml/src/ggml-cuda/rope.cu | 48 ++++++++++++++++--------------------- ggml/src/ggml-sycl/rope.cpp | 33 ++++++++++++------------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 18f691b2d31..d058504cd6c 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -50,21 +50,19 @@ static __global__ void rope_norm( const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - if (i0 >= n_dims) { - const int i = row_dst*ne0 + i0; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - - return; - } - const int row_x = row_dst % ne1; const int channel_x = row_dst / ne1; const int idst = row_dst*ne0 + i0; const int ix = channel_x*s2 + row_x*s1 + i0; + if (i0 >= n_dims) { + dst[idst + 0] = x[ix + 0]; + dst[idst + 1] = x[ix + 1]; + + return; + } + const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -94,21 +92,19 @@ static __global__ void rope_neox( const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - if (i0 >= n_dims) { - const int i = row_dst*ne0 + i0; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - - return; - } - const int row_x = row_dst % ne1; const int channel_x = row_dst / ne1; const int idst = row_dst*ne0 + i0/2; const int ix = channel_x*s2 + row_x*s1 + i0/2; + if (i0 >= n_dims) { + dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; + dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; + + return; + } + const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -138,21 +134,19 @@ static __global__ void rope_multi( const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - if (i0 >= n_dims) { - const int i = row_dst*ne0 + i0; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - - return; - } - const int row_x = row_dst % ne1; const int channel_x = row_dst / ne1; const int idst = row_dst*ne0 + i0/2; const int ix = channel_x*s2 + row_x*s1 + i0/2; + if (i0 >= n_dims) { + dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; + dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; + + return; + } + const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3]; const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index e44c6b6ef8f..1b60226dcd5 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -47,18 +47,17 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); - if (i0 >= n_dims) { - const int i = row * ne0 + i0; - *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i); - return; - } - const int row0 = row % ne1; const int channel0 = row / ne1; const int i = row * ne0 + i0; const int i2 = channel0 * s2 + row0 * s1 + i0; + if (i0 >= n_dims) { + *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i2); + return; + } + const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f); const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f; @@ -88,18 +87,17 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); - if (i0 >= n_dims) { - const int i = row * ne0 + i0; - *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i); - return; - } - const int row0 = row % ne1; const int channel0 = row / ne1; const int i = row * ne0 + i0 / 2; const int i2 = channel0 * s2 + row0 * s1 + i0 / 2; + if (i0 >= n_dims) { + *reinterpret_cast *>(dst + i + i0 / 2) = *reinterpret_cast *>(x + i2 + i0 / 2); + return; + } + const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f); const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f; @@ -129,17 +127,16 @@ static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const } const int row_dst = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2); - if (i0 >= n_dims) { - const int i = row_dst*ne0 + i0; - *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i); - return; - } - const int row_x = row_dst % ne1; const int channel_x = row_dst / ne1; const int idst = (row_dst * ne0) + (i0 / 2); const size_t ix = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2); + if (i0 >= n_dims) { + *reinterpret_cast *>(dst + idst + i0 / 2) = *reinterpret_cast *>(x + i0 / 2 + ix); + return; + } + const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3]; const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; From 9750e4c98891d6aa54462f9e72d911992f5025c2 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 8 Jul 2025 08:21:21 -0500 Subject: [PATCH 0083/1348] vulkan : fix rope with partial rotation and non-cont src (llama/14582) --- .../ggml-vulkan/vulkan-shaders/rope_multi.comp | 16 +++++++--------- .../ggml-vulkan/vulkan-shaders/rope_neox.comp | 16 +++++++--------- .../ggml-vulkan/vulkan-shaders/rope_norm.comp | 16 +++++++--------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp index 4f5b1a0ecaf..5808710ccf9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -14,21 +14,19 @@ void main() { const uint row_dst = gl_GlobalInvocationID.x; - if (i0 >= p.n_dims) { - const uint i = row_dst*ne0 + i0; - - data_d[i + 0] = data_a[i + 0]; - data_d[i + 1] = data_a[i + 1]; - - return; - } - const uint row_x = row_dst % ne1; const uint channel_x = row_dst / ne1; const uint idst = row_dst*ne0 + i0/2; const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2; + if (i0 >= p.n_dims) { + data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0]; + data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1]; + + return; + } + const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3]; const int sec_w = p.sections[1] + p.sections[0]; const uint sector = (i0 / 2) % sect_dims; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp index db775c456ca..366a7b1c47c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp @@ -13,21 +13,19 @@ void main() { const uint row_dst = gl_GlobalInvocationID.x; - if (i0 >= p.n_dims) { - const uint i = row_dst*ne0 + i0; - - data_d[i + 0] = data_a[i + 0]; - data_d[i + 1] = data_a[i + 1]; - - return; - } - const uint row_x = row_dst % ne1; const uint channel_x = row_dst / ne1; const uint idst = row_dst*ne0 + i0/2; const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2; + if (i0 >= p.n_dims) { + data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0]; + data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1]; + + return; + } + const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f); const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp index 4ad35e549d7..9643bca96ac 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp @@ -13,21 +13,19 @@ void main() { const uint row_dst = gl_GlobalInvocationID.x; - if (i0 >= p.n_dims) { - const uint i = row_dst*ne0 + i0; - - data_d[i + 0] = data_a[i + 0]; - data_d[i + 1] = data_a[i + 1]; - - return; - } - const uint row_x = row_dst % ne1; const uint channel_x = row_dst / ne1; const uint idst = row_dst*ne0 + i0; const uint ix = channel_x*p.s2 + row_x*p.s1 + i0; + if (i0 >= p.n_dims) { + data_d[idst + 0] = data_a[ix + 0]; + data_d[idst + 1] = data_a[ix + 1]; + + return; + } + const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f); const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f; From fadb3233b6dcf86f6b5e998dbabb84e43d78e0d7 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 8 Jul 2025 13:11:42 -0500 Subject: [PATCH 0084/1348] vulkan: optimize flash attention split_k_reduce (llama/14554) * vulkan: allow FA split_k with smaller KV values * vulkan: spread split_k_reduce work across more threads k_num can get rather large. Use the whole workgroup to reduce the M/L values. Launch a thread for each element in the HSV dimension of the output. Helps a lot for large HSV (like deepseek). --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 8 ++-- .../flash_attn_split_k_reduce.comp | 46 +++++++++++++++---- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 7b8faf17879..2245a655498 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2706,7 +2706,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 4 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1); for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { @@ -6252,13 +6252,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count : 16; // Try to use split_k when KV is large enough to be worth the overhead - if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) { + if (workgroups_x == 1 && shader_core_count > 0) { // Try to run two workgroups per SM. split_k = shader_core_count * 2 / (workgroups_y * workgroups_z); if (split_k > 1) { // Try to evenly split KV into split_k chunks, but it needs to be a multiple // of "align", so recompute split_k based on that. - split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align); + split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), pipelines[1]->align); split_k = CEIL_DIV(KV, split_kv); workgroups_x = split_k; } @@ -6392,7 +6392,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 }); + pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 }); } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp index 599cef072e9..0a17a9df23f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp @@ -2,9 +2,9 @@ #extension GL_EXT_control_flow_attributes : enable -#define BLOCK_SIZE 32 +layout(constant_id = 0) const uint BLOCK_SIZE = 32; -layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {float data_a[];}; layout (binding = 1) writeonly buffer D {float data_d[];}; @@ -16,6 +16,8 @@ layout (push_constant) uniform parameter { uint k_num; } p; +shared float tmpsh[BLOCK_SIZE]; + void main() { // Each workgroup handles a row const uint n = gl_WorkGroupID.x; @@ -32,23 +34,51 @@ void main() { // Compute the max m value for the row float m_max = -1.0/0.0; - [[unroll]] for (uint k = 0; k < k_num; ++k) { - float m = data_a[m_offset + k * lm_stride]; + for (uint k = 0; k + tid < k_num; k += BLOCK_SIZE) { + float m = data_a[m_offset + (k + tid) * lm_stride]; m_max = max(m_max, m); } + // reduce across the workgroup + tmpsh[tid] = m_max; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { + if (tid < s) { + m_max = max(m_max, tmpsh[tid + s]); + tmpsh[tid] = m_max; + } + barrier(); + } + m_max = tmpsh[0]; + + barrier(); + // Compute L based on m_max float L = 0; - [[unroll]] for (uint k = 0; k < k_num; ++k) { - float l = data_a[l_offset + k * lm_stride]; - float m = data_a[m_offset + k * lm_stride]; + for (uint k = 0; k + tid < k_num; k += BLOCK_SIZE) { + float l = data_a[l_offset + (k + tid) * lm_stride]; + float m = data_a[m_offset + (k + tid) * lm_stride]; L += exp(m - m_max) * l; } + // reduce across the workgroup + tmpsh[tid] = L; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { + if (tid < s) { + L += tmpsh[tid + s]; + tmpsh[tid] = L; + } + barrier(); + } + L = tmpsh[0]; + L = 1.0 / L; + // D dimension is split across workgroups in the y dimension + uint d = tid + gl_WorkGroupID.y * BLOCK_SIZE; // Scale and sum the O contributions based on m_max and store the result to memory - for (uint d = tid; d < D; d += BLOCK_SIZE) { + if (d < D) { float O = 0.0; [[unroll]] for (uint k = 0; k < k_num; ++k) { uint o_offset = D * N * (k + iq3 * k_num) + D * n + d; From 48b18f9eb811a16b7b3b651eeb96b43ebe882c86 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 9 Jul 2025 20:33:53 +0800 Subject: [PATCH 0085/1348] ggml : prevent integer overflow in gguf tensor size calculation (llama/14595) --- ggml/src/gguf.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 5ffd12b8b27..53504399c57 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -631,7 +631,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par gguf_free(ctx); return nullptr; } - ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment); + size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment); + if (SIZE_MAX - ctx->size < padded_size) { + GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n", + __func__, ti.t.name, ctx->size, padded_size); + gguf_free(ctx); + return nullptr; + } + ctx->size += padded_size; } } From 2021870fb8aef022e6f741ba3b4884c091030dda Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 9 Jul 2025 18:16:12 +0200 Subject: [PATCH 0086/1348] ggml : add ggml_scale_bias (llama/14417) * ggml : add ggml_scale_bias * ggml_vec_mad1_f32 * add more simd * add CUDA * sycl * vulkan * cann (placeholder) * opencl * will this fix cpu? * fix cuda * suggestions from coderabbit * fix cann compile error * vDSP_vsmsa * rm __ARM_FEATURE_SVE * use memcpy for op params * make code looks more consistent * use scalar for __ARM_FEATURE_SVE * add x param to ggml_vec_mad1_f32 --- ggml/include/ggml.h | 13 +++++++ ggml/src/ggml-cann/ggml-cann.cpp | 5 ++- ggml/src/ggml-cpu/ops.cpp | 28 +++++++++---- ggml/src/ggml-cpu/vec.h | 39 +++++++++++++++++++ ggml/src/ggml-cuda/scale.cu | 14 ++++--- ggml/src/ggml-metal/ggml-metal.m | 5 ++- ggml/src/ggml-metal/ggml-metal.metal | 6 ++- ggml/src/ggml-opencl/ggml-opencl.cpp | 5 ++- ggml/src/ggml-opencl/kernels/scale.cl | 5 ++- ggml/src/ggml-sycl/ggml-sycl.cpp | 14 ++++--- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 +- .../src/ggml-vulkan/vulkan-shaders/scale.comp | 2 +- ggml/src/ggml.c | 28 ++++++++++--- 13 files changed, 132 insertions(+), 34 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8b0bec8738d..13044602fd8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1294,6 +1294,19 @@ extern "C" { struct ggml_tensor * a, float s); + // x = s * a + b + GGML_API struct ggml_tensor * ggml_scale_bias( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s, + float b); + + GGML_API struct ggml_tensor * ggml_scale_bias_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s, + float b); + // b -> view(a,offset,nb1,nb2,3), return modified a GGML_API struct ggml_tensor * ggml_set( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index eae575cc040..ccb17eb072e 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2188,7 +2188,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_RMS_NORM: - case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: case GGML_OP_CLAMP: @@ -2210,6 +2209,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_PAD_REFLECT_1D: case GGML_OP_COUNT_EQUAL: return true; + case GGML_OP_SCALE: + float bias; + memcpy(&bias, (float*)op->op_params + 1, sizeof(float)); + return bias == 0.0f; // TODO: support bias != 0.0f case GGML_OP_SOFT_MAX: // TODO: support broadcast // ref: https://github.com/ggml-org/llama.cpp/pull/14435 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index aaeee614ab9..fd77e9a6aba 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -4643,9 +4643,11 @@ static void ggml_compute_forward_scale_f32( GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - // scale factor - float v; - memcpy(&v, dst->op_params, sizeof(float)); + float s; // scale factor + float b; // bias + + memcpy(&s, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&b, (float *) dst->op_params + 1, sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -4664,12 +4666,22 @@ static void ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { - if (dst->data != src0->data) { - // src0 is same shape as dst => same indices - memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + if (b == 0.0f) { + for (int i1 = ir0; i1 < ir1; i1++) { + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s); + } + } else { + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_mad1_f32(nc, + (float *) ((char *) dst->data + i1*nb1), + (float *) ((char *) src0->data + i1*nb1), + s, b); } - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } } diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 1f5857a23e3..d18783a00a1 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int #endif } +inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) { +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmsa(x, 1, &s, &b, y, 1, n); +#elif defined(GGML_SIMD) + #if defined(__ARM_FEATURE_SVE) + // scalar ; TODO: Write SVE code + for (int i = 0; i < n; ++i) { + y[i] = x[i]*s + b; + } + #else + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vs = GGML_F32_VEC_SET1(s); + GGML_F32_VEC vb = GGML_F32_VEC_SET1(b); + + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = x[i]*s + b; + } + #endif +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] = x[i]*s + b; + } +#endif +} + //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_USE_ACCELERATE) diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 1405e066e86..2ee9e588992 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -1,18 +1,18 @@ #include "scale.cuh" -static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { +static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { return; } - dst[i] = scale * x[i]; + dst[i] = scale * x[i] + bias; } -static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { +static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; - scale_f32<<>>(x, dst, scale, k); + scale_f32<<>>(x, dst, scale, bias, k); } void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -25,7 +25,9 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT( dst->type == GGML_TYPE_F32); float scale; - memcpy(&scale, dst->op_params, sizeof(float)); + float bias; + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&bias, (float *) dst->op_params + 1, sizeof(float)); - scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); + scale_f32_cuda(src0_d, dst_d, scale, bias, ggml_nelements(src0), stream); } diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 40fc315e82f..83a0739809a 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -2256,7 +2256,9 @@ static bool ggml_metal_encode_node( GGML_ASSERT(ggml_is_contiguous(src0)); float scale; - memcpy(&scale, dst->op_params, sizeof(scale)); + float bias; + memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&bias, ((const int32_t *) dst->op_params) + 1, sizeof(float)); int64_t n = ggml_nelements(dst); @@ -2273,6 +2275,7 @@ static bool ggml_metal_encode_node( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + [encoder setBytes:&bias length:sizeof(bias) atIndex:3]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 22240bab472..239ec31fbcb 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1014,16 +1014,18 @@ kernel void kernel_scale( device const float * src0, device float * dst, constant float & scale, + constant float & bias, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * scale; + dst[tpig] = src0[tpig] * scale + bias; } kernel void kernel_scale_4( device const float4 * src0, device float4 * dst, constant float & scale, + constant float & bias, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * scale; + dst[tpig] = src0[tpig] * scale + bias; } kernel void kernel_clamp( diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index a9fc0390387..43d8e5c72c9 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -5587,7 +5587,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; float scale; - memcpy(&scale, dst->op_params, sizeof(scale)); + float bias; + memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float)); ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5602,6 +5604,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias)); int n = ggml_nelements(dst)/4; diff --git a/ggml/src/ggml-opencl/kernels/scale.cl b/ggml/src/ggml-opencl/kernels/scale.cl index 8cfd518fa5a..aeca8a456e4 100644 --- a/ggml/src/ggml-opencl/kernels/scale.cl +++ b/ggml/src/ggml-opencl/kernels/scale.cl @@ -8,9 +8,10 @@ kernel void kernel_scale( ulong offset0, global float4 * dst, ulong offsetd, - float scale + float scale, + float bias ) { src0 = (global float4*)((global char*)src0 + offset0); dst = (global float4*)((global char*)dst + offsetd); - dst[get_global_id(0)] = src0[get_global_id(0)] * scale; + dst[get_global_id(0)] = src0[get_global_id(0)] * scale + bias; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 21c81e99a19..cd15bbdb29f 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1695,7 +1695,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; } -static void scale_f32(const float * x, float * dst, const float scale, const int k, +static void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -1704,7 +1704,7 @@ static void scale_f32(const float * x, float * dst, const float scale, const int return; } - dst[i] = scale * x[i]; + dst[i] = scale * x[i] + bias; } @@ -1842,7 +1842,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( -static void scale_f32_sycl(const float *x, float *dst, const float scale, +static void scale_f32_sycl(const float *x, float *dst, const float scale, const float bias, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; stream->parallel_for( @@ -1850,7 +1850,7 @@ static void scale_f32_sycl(const float *x, float *dst, const float scale, sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - scale_f32(x, dst, scale, k, item_ct1); + scale_f32(x, dst, scale, bias, k, item_ct1); }); } @@ -2319,9 +2319,11 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * ds float * dst_dd = static_cast(dst->data); float scale; - memcpy(&scale, dst->op_params, sizeof(float)); + float bias; + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&bias, (float *) dst->op_params + 1, sizeof(float)); - scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream); + scale_f32_sycl(src0_dd, dst_dd, scale, bias, ggml_nelements(dst->src[0]), main_stream); /* DPCT1010:87: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2245a655498..c36e1a6d3bf 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -7508,7 +7508,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - op_params[0], 0.0f, + op_params[0], op_params[1], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp index 4663428dee0..f10b0a02b50 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp @@ -18,7 +18,7 @@ void main() { continue; } - data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1)); + data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1) + FLOAT_TYPE(p.param2)); idx += num_threads; } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e2d9d616a59..c8d1f144d03 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3061,12 +3061,14 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_context * ctx, struct ggml_tensor * a, float s, + float b, bool inplace) { GGML_ASSERT(ggml_is_padded_1d(a)); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_set_op_params(result, &s, sizeof(s)); + float params[2] = { s, b }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_SCALE; result->src[0] = a; @@ -3078,14 +3080,30 @@ struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, float s) { - return ggml_scale_impl(ctx, a, s, false); + return ggml_scale_impl(ctx, a, s, 0.0, false); } struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, struct ggml_tensor * a, float s) { - return ggml_scale_impl(ctx, a, s, true); + return ggml_scale_impl(ctx, a, s, 0.0, true); +} + +struct ggml_tensor * ggml_scale_bias( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s, + float b) { + return ggml_scale_impl(ctx, a, s, b, false); +} + +struct ggml_tensor * ggml_scale_bias_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s, + float b) { + return ggml_scale_impl(ctx, a, s, b, true); } // ggml_set @@ -5769,7 +5787,7 @@ static void ggml_compute_backward( } break; case GGML_OP_MEAN: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); + ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], 0.0, false)); } } break; case GGML_OP_REPEAT: { @@ -5846,7 +5864,7 @@ static void ggml_compute_backward( if (src0_needs_grads) { float s; memcpy(&s, tensor->op_params, sizeof(float)); - ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false)); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, 0.0, false)); } } break; case GGML_OP_SET: { From fb885fa48b5404986664c7967f320a41d3d82023 Mon Sep 17 00:00:00 2001 From: compilade Date: Wed, 9 Jul 2025 23:54:38 -0400 Subject: [PATCH 0087/1348] cuda : support Falcon-H1 state size for SSM_SCAN (llama/14602) --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- ggml/src/ggml-cuda/ssm-scan.cu | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index da1e8f8f4e4..72406f0af36 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3335,8 +3335,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SSM_SCAN: { if (op->src[3]->ne[0] == 1) { // Mamba2 - // (kernel only supports d_state == 128 && d_head % 16 == 0) - return op->src[0]->ne[0] == 128 && op->src[0]->ne[1] % 16 == 0; + // (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0) + return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0; } else { // Mamba // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1) diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index dc3b1a9a8cb..c9184398b42 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -201,11 +201,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim, const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq, cudaStream_t stream) { - const int threads = 128; // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition! if (src3_nb1 == sizeof(float)) { // Mamba-2 if (d_state == 128) { + const int threads = 128; GGML_ASSERT(d_state % threads == 0); // NOTE: can be any power of two between 4 and 64 const int splitH = 16; @@ -215,10 +215,21 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); + } else if (d_state == 256) { // Falcon-H1 + const int threads = 256; + // NOTE: can be any power of two between 8 and 64 + const int splitH = 16; + GGML_ASSERT(head_dim % splitH == 0); + const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1); + ssm_scan_f32_group<16, 256><<>>( + src0, src1, src2, src3, src4, src5, src6, dst, + src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, + src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); } else { - GGML_ABORT("doesn't support d_state!=128."); + GGML_ABORT("doesn't support d_state!=(128 or 256)."); } } else { + const int threads = 128; // Mamba-1 GGML_ASSERT(n_head % threads == 0); GGML_ASSERT(head_dim == 1); From 3c21cde5400f0f59f46aaf7955fba36b6aa6bb82 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 10 Jul 2025 13:59:38 +0530 Subject: [PATCH 0088/1348] SYCL: Initial set_rows kernel implementation (llama/14562) * SYCL: Initial set_rows kernel implementation * Revert max_threads to 256 * Refactor set_rows and address review comments * Deduplicate conversion function * Remove guard before kernel launch and refactor * Fix and add back SFINAE --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 6 +- ggml/src/ggml-sycl/set_rows.cpp | 131 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/set_rows.hpp | 8 ++ 4 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-sycl/set_rows.cpp create mode 100644 ggml/src/ggml-sycl/set_rows.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index f78a36ddf8f..f839a42bc90 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -30,6 +30,7 @@ #include "outprod.hpp" #include "quants.hpp" #include "rope.hpp" +#include "set_rows.hpp" #include "softmax.hpp" #include "tsembd.hpp" #include "wkv.hpp" diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index cd15bbdb29f..65b26fd0276 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -41,6 +41,7 @@ #include "ggml-sycl/element_wise.hpp" #include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" +#include "ggml-sycl/set_rows.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/getrows.hpp" #include "ggml.h" @@ -3605,6 +3606,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_GET_ROWS: ggml_sycl_get_rows(ctx, dst); break; + case GGML_OP_SET_ROWS: + ggml_sycl_op_set_rows(ctx, dst); + break; case GGML_OP_DUP: ggml_sycl_dup(ctx, dst); break; @@ -4299,7 +4303,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g { // TODO: add support // ref: https://github.com/ggml-org/llama.cpp/pull/14274 - return false; + return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64)); } break; case GGML_OP_CPY: { diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp new file mode 100644 index 00000000000..4a76a63d354 --- /dev/null +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -0,0 +1,131 @@ +#include "set_rows.hpp" + +namespace utils { +template +static constexpr bool is_arithmetic_v() { + return std::is_arithmetic_v || std::is_same_v || std::is_same_v; +} +} +template +static inline std::enable_if_t() && utils::is_arithmetic_v(), void> +convert (const char* src, char* dst) { + auto src_val = *reinterpret_cast(src); + auto dst_val = sycl::vec(src_val).template convert()[0]; + *reinterpret_cast(dst) = dst_val;; +} + +template +static void k_set_rows( + const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst, + const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12, + const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + const size_t src_type_size, const size_t dst_type_size, + const sycl::nd_item<3> & item_ct1) { + + const int i03 = item_ct1.get_group(0); + const int i02 = item_ct1.get_group(1); + const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index + + if (i01 >= ne01) { + return; + } + + const int i12 = i03 % ne12; + const int i11 = i02 % ne11; + const int i10 = i01; + + const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12})); + + const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03}); + char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3; + + for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) { + const char * src_elem = src0_row + col * src_type_size; + char * dst_elem = dst_row_ptr + col * dst_type_size; + convert(src_elem, dst_elem); + } +} + +template +static void set_rows_sycl( + const char * src0_d, const int64_t * src1_d, char * dst_d, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + const size_t src_type_size, const size_t dst_type_size, + queue_ptr stream) { + + constexpr int max_threads_per_row = 64; // KEEPING 64 for now + const int threads_per_row = std::min((int)ne00, max_threads_per_row); + + constexpr int max_threads_per_block = 64; + const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row); + + const sycl::range<3> block_size(1, rows_per_block, threads_per_row); + const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block); + + sycl_parallel_for( + stream, + sycl::nd_range<3>(grid_size * block_size, block_size), + [=](sycl::nd_item<3> item_ct1) { + k_set_rows( + src0_d, src1_d, dst_d, + ne00, ne01, ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + src_type_size, dst_type_size, + item_ct1 + ); + } + ); +} + + +void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t * src1_dd = static_cast(src1->data); + + dpct::queue_ptr stream = ctx.stream(); + switch (dst->type) { + case GGML_TYPE_F32: + set_rows_sycl( + (const char *)src0->data, src1_dd, (char *)dst->data, + ne00, ne01, ne02, ne03, + ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + sizeof(float), sizeof(float), + stream + ); + break; + case GGML_TYPE_F16: + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + set_rows_sycl( + (const char *)src0->data, src1_dd, (char *)dst->data, + ne00, ne01, ne02, ne03, + ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + sizeof(float), sizeof(sycl::half), + stream + ); + break; + default: + GGML_ABORT("Unsupported tensor type!"); + break; + } +} diff --git a/ggml/src/ggml-sycl/set_rows.hpp b/ggml/src/ggml-sycl/set_rows.hpp new file mode 100644 index 00000000000..27fcc8f9017 --- /dev/null +++ b/ggml/src/ggml-sycl/set_rows.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_SET_ROWS_HPP +#define GGML_SYCL_SET_ROWS_HPP + +#include "common.hpp" + +void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_SET_ROWS_HPP From 6f113cbcaad5f6db9e4d29f0c683d13af6ac5c35 Mon Sep 17 00:00:00 2001 From: lhez Date: Thu, 10 Jul 2025 11:48:52 -0700 Subject: [PATCH 0089/1348] opencl: add `set_rows` for `f16` and `f32` (llama/14547) * opencl: add `set_rows` for `f16` and `f32` * opencl: better choose workgroup size for `set_rows` --- ggml/src/ggml-opencl/CMakeLists.txt | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 153 ++++++++++++++++++++++- ggml/src/ggml-opencl/kernels/set_rows.cl | 95 ++++++++++++++ 3 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/set_rows.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 45a48833480..03e77650d7e 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -88,6 +88,7 @@ set(GGML_OPENCL_KERNELS rms_norm rope scale + set_rows sigmoid silu softmax_4_f32 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 43d8e5c72c9..91b66c3bd74 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -351,6 +351,7 @@ struct ggml_backend_opencl_context { cl_program program_gemv_noshuffle_general; cl_program program_gemv_noshuffle; cl_program program_get_rows; + cl_program program_set_rows; cl_program program_glu; cl_program program_im2col_f16; cl_program program_im2col_f32; @@ -412,6 +413,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_soft_max, kernel_soft_max_4; cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16; cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0; + cl_kernel kernel_set_rows_f32, kernel_set_rows_f16; cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16; cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32; @@ -529,6 +531,16 @@ struct ggml_backend_opencl_context { fclose(ftrace); } + size_t get_kernel_workgroup_size(cl_kernel kernel) const { + size_t workgroup_size = 0; + size_t ret_size = 0; + CL_CHECK( + clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &workgroup_size, &ret_size)); + GGML_ASSERT(sizeof(size_t) == ret_size); + return workgroup_size; + } + void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { #ifdef GGML_OPENCL_PROFILING cl_event evt; @@ -1431,6 +1443,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve } } + // set_rows + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "set_rows.cl.h" + }; +#else + const std::string kernel_src = read_file("set_rows.cl"); +#endif + backend_ctx->program_set_rows = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err)); + GGML_LOG_CONT("."); + } + // mul_mv_id_q4_0_f32_8x_flat { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2233,8 +2262,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te { // TODO: add support // ref: https://github.com/ggml-org/llama.cpp/pull/14274 - return false; - } break; + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + switch (op->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return true; + default: + return false; + } + } case GGML_OP_CPY: case GGML_OP_DUP: case GGML_OP_CONT: @@ -3374,6 +3412,111 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + // ne0 = ne00 + // ne2 = ne02 + // ne3 = ne03 + + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + + const cl_ulong nb10 = src1->nb[0]; + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + + const int ne0 = dst->ne[0]; + + const cl_ulong nb1 = dst->nb[1]; + const cl_ulong nb2 = dst->nb[2]; + const cl_ulong nb3 = dst->nb[3]; + + const int nblk0 = ne0/ggml_blck_size(dst->type); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + cl_kernel kernel; + + switch (dst->type) { + case GGML_TYPE_F32: + kernel = backend_ctx->kernel_set_rows_f32; + break; + case GGML_TYPE_F16: + kernel = backend_ctx->kernel_set_rows_f16; + break; + default: + GGML_ABORT("not implemented"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3)); + + int nth0 = 64; + if (backend_ctx->gpu_family == INTEL) { + nth0 = 32; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + } + + int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel); + while (nth0 < nblk0 && nth0 < max_workgroup_size) { + nth0 *= 2; + } + + int rows_per_workgroup = 1; + if (nth0 > nblk0) { + rows_per_workgroup = nth0 / nblk0; + nth0 = nblk0; + } + + size_t global_work_size[] = { + (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0, + (size_t)ne02*rows_per_workgroup, + (size_t)ne03}; + size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -6388,6 +6531,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_get_rows; break; + case GGML_OP_SET_ROWS: + if (!any_on_device) { + return false; + } + func = ggml_cl_set_rows; + break; case GGML_OP_CPY: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/set_rows.cl b/ggml/src/ggml-opencl/kernels/set_rows.cl new file mode 100644 index 00000000000..a94b4361b4d --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/set_rows.cl @@ -0,0 +1,95 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +kernel void kernel_set_rows_f32( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + int ne01, + ulong nb01, + ulong nb02, + ulong nb03, + int ne11, + int ne12, + ulong nb10, + ulong nb11, + ulong nb12, + int nblk0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1); + + if (i01 >= ne01) { + return; + } + + int i12 = i03%ne12; + int i11 = i02%ne11; + + int i10 = i01; + long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0]; + + global float * dst_row = (global float *) (dst + i1*nb1 + i02*nb2 + i03*nb3); + global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03); + + for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) { + dst_row[ind] = (float)src_row[ind]; + } +} + +kernel void kernel_set_rows_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + int ne01, + ulong nb01, + ulong nb02, + ulong nb03, + int ne11, + int ne12, + ulong nb10, + ulong nb11, + ulong nb12, + int nblk0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1); + + if (i01 >= ne01) { + return; + } + + int i12 = i03%ne12; + int i11 = i02%ne11; + + int i10 = i01; + long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0]; + + global half * dst_row = (global half *) (dst + i1*nb1 + i02*nb2 + i03*nb3); + global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03); + + for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) { + dst_row[ind] = src_row[ind]; + } +} From b0754136be6e82ecf23e5d1221f99eec60ce963c Mon Sep 17 00:00:00 2001 From: rmatif Date: Thu, 10 Jul 2025 23:58:12 +0200 Subject: [PATCH 0090/1348] opencl: add tiled mul_mat_f16_f32 (llama/14535) * add tiled mul_mat_f16_f32 * fix trailing whitespace * add insightful comments --- ggml/src/ggml-opencl/CMakeLists.txt | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 82 +++++++++++ .../ggml-opencl/kernels/mul_mat_f16_f32.cl | 130 ++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 03e77650d7e..ec5d8cf5955 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -104,6 +104,7 @@ set(GGML_OPENCL_KERNELS tanh pad repeat + mul_mat_f16_f32 ) foreach (K ${GGML_OPENCL_KERNELS}) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 91b66c3bd74..58830b733a8 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -368,6 +368,7 @@ struct ggml_backend_opencl_context { cl_program program_mul_mv_f16_f32; cl_program program_mul_mv_f32_f32; cl_program program_mul; + cl_program program_mul_mat_f16_f32_tiled; cl_program program_div; cl_program program_sub; cl_program program_norm; @@ -422,6 +423,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_f16_f32_1row; cl_kernel kernel_mul_mat_f16_f32; cl_kernel kernel_mul_mat_f16_f32_l4; + cl_kernel kernel_mul_mat_f16_f32_tiled; cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v; cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0; cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; @@ -1015,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mat_f16_f32_tiled + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mat_f16_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mat_f16_f32.cl"); +#endif + backend_ctx->program_mul_mat_f16_f32_tiled = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err)); + GGML_LOG_CONT("."); + } + // mul { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -4927,6 +4945,58 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } +static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int M = src0->ne[1]; + const int N = src1->ne[1]; + const int K = src0->ne[0]; + + cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd)); + + // Tiling parameters. These need to be tuned for optimal performance. + // They must match the #defines in the kernel mul_mat_f16_f32.cl. + // + // OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN. + // TPWM / TPWN: Threads per Work-group. This is the work-group size. + // OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements. + // + // The following relationships must hold: + // OPWM = TPWM * OPTM + // OPWN = TPWN * OPTN + // + const int OPWM = 64; + const int OPWN = 64; + const int TPWM = 16; + const int TPWN = 8; + + size_t local_work_size[2] = { TPWM, TPWN }; + size_t global_work_size[2] = { + (size_t) ((M + OPWM - 1) / OPWM) * TPWM, + (size_t) ((N + OPWN - 1) / OPWN) * TPWN, + }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); +} + static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -4940,6 +5010,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 && + src0->ne[1] > 32 && // M > 32 + src1->ne[1] > 32 && // N > 32 + src0->ne[0] > 32 && // K > 32 + src0->ne[2] == 1 && src0->ne[3] == 1 && + src1->ne[2] == 1 && src1->ne[3] == 1 && + ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && + backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) { + ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst); + return; + } + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; diff --git a/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl new file mode 100644 index 00000000000..73a888494dc --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl @@ -0,0 +1,130 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#if defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#else +#define REQD_SUBGROUP_SIZE_128 +#endif + +#define OPWM 64 +#define OPWN 64 +#define CPWK 8 +#define OPTM 4 +#define OPTN 8 + +#define WG_M (OPWM / OPTM) +#define WG_N (OPWN / OPTN) +#define VEC_K (CPWK / 4) + +REQD_SUBGROUP_SIZE_128 +__kernel void mul_mat_f16_f32( + const int M, const int N, const int K, + __global const void* A_void, ulong A_offset, + __global const void* B_void, ulong B_offset, + __global void* C_void, ulong C_offset) { + + __global const half* A = (__global const half* )((__global const char*)A_void + A_offset); + __global const float* B = (__global const float*)((__global const char*)B_void + B_offset); + __global float* C = (__global float*)((__global char*)C_void + C_offset); + + const int lidm = get_local_id(0); + const int lidn = get_local_id(1); + const int lid = lidn * WG_M + lidm; + + const int offsetM = get_group_id(0) * OPWM; + const int offsetN = get_group_id(1) * OPWN; + + __local half4 Alocal[OPWM][VEC_K]; + __local float4 Blocal[OPWN][VEC_K]; + + float sum[OPTM][OPTN]; + + for (int wm = 0; wm < OPTM; wm++) { + for (int wn = 0; wn < OPTN; wn++) { + sum[wm][wn] = 0.0f; + } + } + + const int numTiles = (K + CPWK - 1) / CPWK; + + const int load_row_a = lid % OPWM; + const int load_vec_k_a = lid / OPWM; + const int global_row_a = offsetM + load_row_a; + + const int load_row_b = lid % OPWN; + const int load_vec_k_b = lid / OPWN; + const int global_row_b = offsetN + load_row_b; + + for (int t = 0; t < numTiles; t++) { + const int k_start = t * CPWK; + const int k_vec_start_a = k_start + load_vec_k_a * 4; + const int k_vec_start_b = k_start + load_vec_k_b * 4; + + if (global_row_a < M && k_vec_start_a < K) { + if (k_vec_start_a + 3 < K) { + Alocal[load_row_a][load_vec_k_a] = vload4(0, A + global_row_a * K + k_vec_start_a); + } else { + half4 tempA = (half4)(0.0h); + if (k_vec_start_a < K) tempA.s0 = A[global_row_a * K + k_vec_start_a]; + if (k_vec_start_a + 1 < K) tempA.s1 = A[global_row_a * K + k_vec_start_a + 1]; + if (k_vec_start_a + 2 < K) tempA.s2 = A[global_row_a * K + k_vec_start_a + 2]; + Alocal[load_row_a][load_vec_k_a] = tempA; + } + } else { + Alocal[load_row_a][load_vec_k_a] = (half4)(0.0h); + } + + if (global_row_b < N && k_vec_start_b < K) { + if (k_vec_start_b + 3 < K) { + Blocal[load_row_b][load_vec_k_b] = vload4(0, B + global_row_b * K + k_vec_start_b); + } else { + float4 tempB = (float4)(0.0f); + if (k_vec_start_b < K) tempB.s0 = B[global_row_b * K + k_vec_start_b]; + if (k_vec_start_b + 1 < K) tempB.s1 = B[global_row_b * K + k_vec_start_b + 1]; + if (k_vec_start_b + 2 < K) tempB.s2 = B[global_row_b * K + k_vec_start_b + 2]; + Blocal[load_row_b][load_vec_k_b] = tempB; + } + } else { + Blocal[load_row_b][load_vec_k_b] = (float4)(0.0f); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #pragma unroll + for (int k_vec = 0; k_vec < VEC_K; k_vec++) { + float4 a_fvecs[OPTM]; + int current_row_a = lidm; + for (int wm = 0; wm < OPTM; wm++) { + a_fvecs[wm] = convert_float4(Alocal[current_row_a][k_vec]); + current_row_a += WG_M; + } + + float4 b_fvecs[OPTN]; + int current_row_b = lidn; + for (int wn = 0; wn < OPTN; wn++) { + b_fvecs[wn] = Blocal[current_row_b][k_vec]; + current_row_b += WG_N; + } + + for (int wm = 0; wm < OPTM; wm++) { + for (int wn = 0; wn < OPTN; wn++) { + sum[wm][wn] += dot(a_fvecs[wm], b_fvecs[wn]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + for (int wm = 0; wm < OPTM; wm++) { + int globalRow = offsetM + lidm + wm * WG_M; + if (globalRow < M) { + for (int wn = 0; wn < OPTN; wn++) { + int globalCol = offsetN + lidn + wn * WG_N; + if (globalCol < N) { + C[globalCol * M + globalRow] = sum[wm][wn]; + } + } + } + } +} From a4ff4ec9cb48e3420e25d207eb40e4c07639b50b Mon Sep 17 00:00:00 2001 From: Slobodan Josic <127323561+slojosic-amd@users.noreply.github.com> Date: Fri, 11 Jul 2025 18:55:00 +0200 Subject: [PATCH 0091/1348] HIP : Add HIP 7.0+ compatibility for hipBLAS compute types (llama/14634) --- ggml/src/ggml-cuda/vendors/hip.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 1a28831b7a9..184d445f5c0 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -10,9 +10,6 @@ #include "rocblas/rocblas.h" #endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT #define CUBLAS_OP_N HIPBLAS_OP_N @@ -30,7 +27,6 @@ #define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }} #define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width) #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 #define cublasCreate hipblasCreate #define cublasDestroy hipblasDestroy #define cublasGemmEx hipblasGemmEx @@ -42,7 +38,6 @@ #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t #define cublasOperation_t hipblasOperation_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess @@ -144,6 +139,20 @@ #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000 +#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F +#define cublasComputeType_t hipblasComputeType_t +#define cudaDataType_t hipDataType +#else +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define cublasComputeType_t hipblasDatatype_t +#define cudaDataType_t hipblasDatatype_t +#endif + #define __CUDA_ARCH__ 1300 #if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) From 74f6d479049db01094a8323bcd90a47d050d1ebb Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 11 Jul 2025 20:27:01 +0200 Subject: [PATCH 0092/1348] model : support LiquidAI LFM2 hybrid family (llama/14620) **Important** LFM2 was [merged ](https://github.com/huggingface/transformers/pull/39340)into transformers, but has not yet been released. To convert into gguf, install transformers from source ```shell pip install "transformers @ git+https://github.com/huggingface/transformers.git@main" ``` --- ggml/src/ggml-cuda/ssm-conv.cu | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index f6375719637..41979733601 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -107,8 +107,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int if (nc == 4) { ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); + } else if (nc == 3) { + ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, + dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { - GGML_ABORT("Only support kernel size = 4 now."); + GGML_ABORT("Only support kernel size = 3 or size = 4 right now."); } } else { if (nc == 4) { @@ -116,8 +119,13 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); ssm_conv_long_token_f32<<>>( src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); + } else if (nc == 3) { + const int64_t split_n_t = 32; + dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); + ssm_conv_long_token_f32<<>>( + src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { - GGML_ABORT("Only support kernel size = 4 right now."); + GGML_ABORT("Only support kernel size = 3 or size = 4 right now."); } } } From 8670a3fd5def4cbcb5742bbc862c609acbc9c061 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 12 Jul 2025 04:51:58 -0500 Subject: [PATCH 0093/1348] vulkan: optimizations for deepseek prompt processing (llama/14555) * vulkan: allow unclamped loads in coopmat2 mul_mat_id shader * vulkan: increase coopmat2 mul_mat_id tile size * vulkan: optimize mat_mul_id row_ids search to batch loads, and port to coopmat1 path * vulkan: use smaller FA row size when head size is large. applies to both scalar and CM2 paths (CM1 isn't used due to shared memory limits) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 27 +++++++---- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 48 ++++++++++++++++++- .../vulkan-shaders/mul_mm_cm2.comp | 47 ++++++++++++++---- 3 files changed, 104 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c36e1a6d3bf..cdddf868fbe 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1735,7 +1735,14 @@ static FaHeadSizes fa_get_head_sizes(uint32_t hsk, uint32_t hsv) { // number of rows/cols for flash attention shader static constexpr uint32_t flash_attention_num_small_rows = 32; static constexpr uint32_t scalar_flash_attention_num_small_rows = 1; -static constexpr uint32_t scalar_flash_attention_num_large_rows = 8; + +static uint32_t get_fa_scalar_num_large_rows(uint32_t hsv) { + if (hsv >= 512) { + return 2; + } else { + return 8; + } +} // The FA coopmat1 shader assumes 16x16x16 matrix multiply support. // 128 threads split into four subgroups, each subgroup does 1/4 @@ -1760,7 +1767,7 @@ static std::array fa_rows_cols(FaCodePath path, uint32_t hsk, uint3 if (small_rows) { return {scalar_flash_attention_num_small_rows, 64}; } else { - return {scalar_flash_attention_num_large_rows, 32}; + return {get_fa_scalar_num_large_rows(hsv), 32}; } } @@ -1779,7 +1786,11 @@ static std::array fa_rows_cols(FaCodePath path, uint32_t hsk, uint3 // small cols to reduce register count if (ggml_is_quantized(type) || hsk >= 256) { - return {64, 32}; + if (hsk >= 512) { + return {32, 32}; + } else { + return {64, 32}; + } } return {64, 64}; } @@ -1821,7 +1832,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec const uint32_t warps = warptile[0] / warptile[10]; const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; - const uint32_t mmid_row_ids = mul_mat_id ? 4096 * sizeof(uint32_t) : 0; + const uint32_t mmid_row_ids = mul_mat_id ? (4096 * sizeof(uint32_t) + 4/*_ne1*/) : 0; const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size; @@ -1946,10 +1957,10 @@ static void ggml_vk_load_shaders(vk_device& device) { s_mmq_wg_denoms_k = { 32, 32, 1 }; // spec constants and tile sizes for quant matmul_id - l_warptile_mmqid = { 256, 128, 64, 16, 0 }; + l_warptile_mmqid = { 256, 128, 128, 16, 0 }; m_warptile_mmqid = { 256, 128, 64, 16, 0 }; s_warptile_mmqid = { 256, 128, 64, 16, 0 }; - l_mmqid_wg_denoms = { 128, 64, 1 }; + l_mmqid_wg_denoms = { 128, 128, 1 }; m_mmqid_wg_denoms = { 128, 64, 1 }; s_mmqid_wg_denoms = { 128, 64, 1 }; @@ -6048,7 +6059,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con // Needs to be kept up to date on shader changes GGML_UNUSED(hsv); const uint32_t wg_size = scalar_flash_attention_workgroup_size; - const uint32_t Br = scalar_flash_attention_num_large_rows; + const uint32_t Br = get_fa_scalar_num_large_rows(hsv); const uint32_t Bc = scalar_flash_attention_Bc; const uint32_t tmpsh = wg_size * sizeof(float); @@ -6173,7 +6184,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx case FA_SCALAR: case FA_COOPMAT1: // We may switch from coopmat1 to scalar, so use the scalar limit for both - max_gqa = scalar_flash_attention_num_large_rows; + max_gqa = get_fa_scalar_num_large_rows(HSV); break; case FA_COOPMAT2: max_gqa = get_fa_num_small_rows(FA_COOPMAT2); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 888ce79f6ec..f481549911b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -18,6 +18,7 @@ #extension GL_KHR_cooperative_matrix : enable #extension GL_KHR_memory_scope_semantics : enable #extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_ballot : enable #endif #ifdef MUL_MAT_ID @@ -104,6 +105,10 @@ shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; #ifdef MUL_MAT_ID shared u16vec2 row_ids[4096]; +uint _ne1; +#ifdef COOPMAT +shared uint _ne1_sh; +#endif #endif // MUL_MAT_ID #define NUM_WARPS (BLOCK_SIZE / WARP) @@ -172,7 +177,47 @@ void main() { const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK; #ifdef MUL_MAT_ID - uint _ne1 = 0; +#ifdef COOPMAT + // Spread the search across all elements in the first subgroup + if (gl_SubgroupID == 0) { + _ne1 = 0; + uint num_elements = p.nei1 * p.nei0; + + uint ids[16]; + uint iter = 0; + + for (uint j = 0; j < num_elements; j += gl_SubgroupSize) { + // prefetch up to 16 elements + if (iter == 0) { + [[unroll]] for (uint k = 0; k < 16; ++k) { + uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize; + bool in_range = i < num_elements; + uint ii1 = i / p.nei0; + uint ii0 = i % p.nei0; + ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + } + } + uint i = j + gl_SubgroupInvocationID; + bool in_range = i < num_elements; + uint ii1 = i / p.nei0; + uint ii0 = i % p.nei0; + uint id = ids[iter++]; + uvec4 ballot = subgroupBallot(in_range && id == expert_idx); + uint idx = subgroupBallotExclusiveBitCount(ballot); + if (in_range && id == expert_idx) { + row_ids[_ne1 + idx] = u16vec2(ii0, ii1); + } + _ne1 += subgroupBallotBitCount(ballot); + iter &= 15; + } + _ne1_sh = _ne1; + } + + barrier(); + + _ne1 = _ne1_sh; +#else + _ne1 = 0; for (uint ii1 = 0; ii1 < p.nei1; ii1++) { for (uint ii0 = 0; ii0 < p.nei0; ii0++) { if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) { @@ -183,6 +228,7 @@ void main() { } barrier(); +#endif // Workgroup has no work if (ic * BN >= _ne1) return; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 91846575732..29e4b5c9ce2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -162,17 +162,32 @@ void main() { _ne1 = 0; uint num_elements = p.nei1 * p.nei0; - for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) { + uint ids[16]; + uint iter = 0; + + for (uint j = 0; j < num_elements; j += gl_SubgroupSize) { + // prefetch up to 16 elements + if (iter == 0) { + [[unroll]] for (uint k = 0; k < 16; ++k) { + uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize; + bool in_range = i < num_elements; + uint ii1 = i / p.nei0; + uint ii0 = i % p.nei0; + ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + } + } + uint i = j + gl_SubgroupInvocationID; bool in_range = i < num_elements; - uint ii0 = i % p.nei0; uint ii1 = i / p.nei0; - uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + uint ii0 = i % p.nei0; + uint id = ids[iter++]; uvec4 ballot = subgroupBallot(in_range && id == expert_idx); uint idx = subgroupBallotExclusiveBitCount(ballot); if (in_range && id == expert_idx) { row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0); } _ne1 += subgroupBallotBitCount(ballot); + iter &= 15; } _ne1_sh = _ne1; } @@ -414,17 +429,31 @@ void main() { fetch_scales(ir * BM, pos_a, stride_a, block_k + BK, tid, false); } - coopmat mat_a; - coopmat mat_b; + if ((ir + 1) * BM <= p.M && block_k + BK <= end_k) { + coopmat mat_a; + coopmat mat_b; - coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); #ifdef MUL_MAT_ID - coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); #else - coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); #endif - sum = coopMatMulAdd(mat_a, mat_b, sum); + sum = coopMatMulAdd(mat_a, mat_b, sum); + } else { + coopmat mat_a; + coopmat mat_b; + + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); +#endif + + sum = coopMatMulAdd(mat_a, mat_b, sum); + } } // Convert from ACC_TYPE to D_TYPE From 915fc153a528d4f9a1c8616658aa042f0366febb Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 12 Jul 2025 05:12:26 -0500 Subject: [PATCH 0094/1348] vulkan: support SET_ROWS (llama/14587) * vulkan: support SET_ROWS Add variants of the copy_to_quant shader that do the SET_ROWS operation. Change these shaders to spread the work across the workgroup. The memory access pattern is probably not great (one thread per quant block), but should be fine for now. * vulkan: optimize set_rows Larger workgroups for non-quant types. Set "norepeat" (there is manual repeat logic). Use fastmod. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 116 +++++++++++++++--- .../vulkan-shaders/copy_to_quant.comp | 65 ++++++++-- .../vulkan-shaders/vulkan-shaders-gen.cpp | 5 + 3 files changed, 164 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index cdddf868fbe..9540fd6b788 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -437,6 +437,7 @@ struct vk_device_struct { vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16; vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_set_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; @@ -2749,19 +2750,41 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); if (device->float_controls_rte_fp16) { - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); } else { - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + } + + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32], "set_rows_f32", set_rows_f32_rte_len, set_rows_f32_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16], "set_rows_f16", set_rows_f16_rte_len, set_rows_f16_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_rte_len, set_rows_bf16_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_rte_len, set_rows_q4_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_rte_len, set_rows_q4_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_rte_len, set_rows_q5_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_rte_len, set_rows_q5_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_rte_len, set_rows_q8_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_rte_len, set_rows_iq4_nl_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + } else { + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32], "set_rows_f32", set_rows_f32_len, set_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16], "set_rows_f16", set_rows_f16_len, set_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_len, set_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_len, set_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_len, set_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_len, set_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_len, set_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_len, set_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_len, set_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); } ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); @@ -6527,6 +6550,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const case GGML_OP_CONT: case GGML_OP_DUP: return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); + case GGML_OP_SET_ROWS: + return ctx->device->pipeline_set_rows[dst->type]; case GGML_OP_SILU_BACK: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_silu_back_f32; @@ -6765,6 +6790,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_RMS_NORM: case GGML_OP_CONV_2D_DW: case GGML_OP_IM2COL: + case GGML_OP_SET_ROWS: return true; default: return false; @@ -7078,6 +7104,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co ne *= ggml_type_size(src0->type) / 2; } } + // copy_to_quant has block size of 32, and each thread does QUANT_K elements. + // Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements. + // So divide by block size here before splitting into 512x512 groups. + if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { + ne = CEIL_DIV(ne, ggml_blck_size(dst->type)); + } if (ne > 262144) { elements = { 512, 512, CEIL_DIV(ne, 262144) }; } else if (ne > 512) { @@ -7086,6 +7118,25 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co elements = { ne, 1, 1 }; } } break; + case GGML_OP_SET_ROWS: + { + uint32_t ne = ggml_nelements(src0); + if (ggml_is_quantized(dst->type)) { + // quants run 32 threads each doing QUANT_K elements + ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type)); + } else { + // scalar types do one element per thread, running 512 threads + ne = CEIL_DIV(ne, 512); + } + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + } + break; default: elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; break; @@ -7648,6 +7699,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const }, dryrun); } +static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }, dryrun); +} + static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); } @@ -8968,6 +9034,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_CPY: + case GGML_OP_SET_ROWS: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_SILU_BACK: @@ -9034,6 +9101,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_CPY: + case GGML_OP_SET_ROWS: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_SILU_BACK: @@ -9142,6 +9210,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_DUP: ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_SET_ROWS: + ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun); + break; case GGML_OP_SILU_BACK: ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun); @@ -9357,6 +9429,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_CPY: + case GGML_OP_SET_ROWS: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_SILU_BACK: @@ -10422,9 +10495,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm } break; case GGML_OP_SET_ROWS: { - // TODO: add support - // ref: https://github.com/ggml-org/llama.cpp/pull/14274 - return false; + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return true; + default: + return false; + } } break; case GGML_OP_CONT: case GGML_OP_CPY: @@ -11039,6 +11123,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * } else { tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]); } + } else if (tensor->op == GGML_OP_SET_ROWS) { + tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_CONT) { tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_RESHAPE) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp index 9c76437d9b0..e06547e48f7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp @@ -6,17 +6,25 @@ spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bi #endif // RTE16 #include "types.comp" -#include "generic_unary_head.comp" -#if defined(DATA_A_IQ4_NL) -// 16 invocations needed for init_iq4nl_shmem -layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in; +#if defined(SET_ROWS) && QUANT_K == 1 +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; +const uint BLOCK_SIZE = 512; #else -layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +const uint BLOCK_SIZE = 32; #endif layout (binding = 0) readonly buffer S {float data_s[];}; + +#if defined(SET_ROWS) +#include "generic_binary_head.comp" +layout (binding = 1) readonly buffer C {uvec2 data_i[];}; +layout (binding = 2) writeonly buffer Q {A_TYPE data_q[];}; +#else +#include "generic_unary_head.comp" layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];}; +#endif #if defined(DATA_A_Q4_0) void quantize(uint dst_idx, uint src_idx) @@ -221,15 +229,56 @@ void quantize(uint dst_idx, uint src_idx) } #endif +#if defined(DATA_A_F32) || defined(DATA_A_F16) +void quantize(uint dst_idx, uint src_idx) +{ + data_q[dst_idx] = A_TYPE(data_s[src_idx]); +} +#endif + +#if defined(DATA_A_BF16) +void quantize(uint dst_idx, uint src_idx) +{ + data_q[dst_idx] = A_TYPE(fp32_to_bf16(data_s[src_idx])); +} +#endif + +#if defined(SET_ROWS) + void main() { #ifdef NEEDS_INIT_IQ_SHMEM init_iq_shmem(gl_WorkGroupSize); - if (gl_LocalInvocationIndex.x != 0) { +#endif + + const uint idx = ((gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x) * BLOCK_SIZE + gl_LocalInvocationID.x) * QUANT_K; + + if (idx >= p.ne) { return; } + + uint i00, i01, i02, i03; + get_indices(idx, i00, i01, i02, i03); + + uint i12 = fastmod(i03, p.ne12); + uint i11 = fastmod(i02, p.ne11); + uint i10 = i01; + + uint i1 = data_i[src1_idx(i10, i11, i12, 0) + get_boffset()].x; + + uint src0_idx = src0_idx(i00, i01, i02, i03) + get_aoffset(); + uint dst_idx = dst_idx(i00 / QUANT_K, i1, i02, i03) + get_doffset(); + + quantize(dst_idx, src0_idx); +} + +#else + +void main() { +#ifdef NEEDS_INIT_IQ_SHMEM + init_iq_shmem(gl_WorkGroupSize); #endif - const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K; + const uint idx = (gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x) * QUANT_K; if (idx >= p.ne) { return; @@ -240,3 +289,5 @@ void main() { quantize(dst_idx, src_idx); } + +#endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 30f78fabb3c..f4ba3aa0d8f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -518,6 +518,11 @@ void process_shaders() { string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); } + for (std::string t : {"f32", "f16", "bf16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { + string_to_spv("set_rows_" + t, "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + string_to_spv("set_rows_" + t + "_rte", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}}); + } + auto get_type_str = [](bool f16) { return f16 ? "float16_t" : "float"; }; From 85dcc74b881f653d28eee3c60e148c297fa2cce4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 12 Jul 2025 14:39:52 +0300 Subject: [PATCH 0095/1348] sync : resolve conflicts (ggml/0) ggml-ci --- ggml/CMakeLists.txt | 7 + ggml/include/ggml.h | 3 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 214 +++++++++--------- .../ggml-vulkan/vulkan-shaders/upscale.comp | 74 +++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 + ggml/src/ggml.c | 8 + 6 files changed, 195 insertions(+), 113 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 215eb234868..675f3bf7567 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -360,6 +360,13 @@ write_basic_package_version_file( VERSION ${GGML_INSTALL_VERSION} COMPATIBILITY SameMajorVersion) +target_compile_definitions(ggml-base PRIVATE + GGML_VERSION="${GGML_INSTALL_VERSION}" + GGML_COMMIT="${GGML_BUILD_COMMIT}" +) +message(STATUS "ggml version: ${GGML_INSTALL_VERSION}") +message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}") + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 13044602fd8..8a8775be365 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -648,6 +648,9 @@ extern "C" { // misc + GGML_API const char * ggml_version(void); + GGML_API const char * ggml_commit(void); + GGML_API void ggml_time_init(void); // call this once at the beginning of the program GGML_API int64_t ggml_time_ms(void); GGML_API int64_t ggml_time_us(void); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 9540fd6b788..416ee3bd3f7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -425,13 +425,14 @@ struct vk_device_struct { vk_pipeline pipeline_div_norepeat[2][2][2]; vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; - vk_pipeline pipeline_upscale_f32; + vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_sin_f32; vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; + vk_pipeline pipeline_roll_f32; vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16; vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16; @@ -694,6 +695,37 @@ struct vk_op_unary_push_constants { }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); +static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) { + GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst))); + ne = ne != 0 ? ne : ggml_nelements(dst); + GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); + + vk_op_unary_push_constants p{}; + p.ne = (uint32_t)ne; + + size_t src0_tsize = ggml_type_size(src0->type); + p.ne00 = (uint32_t)src0->ne[0]; + p.ne01 = (uint32_t)src0->ne[1]; + p.ne02 = (uint32_t)src0->ne[2]; + p.ne03 = (uint32_t)src0->ne[3]; + p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); + p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); + p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); + p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); + + size_t dst_tsize = ggml_type_size(dst->type); + p.ne10 = (uint32_t)dst->ne[0]; + p.ne11 = (uint32_t)dst->ne[1]; + p.ne12 = (uint32_t)dst->ne[2]; + p.ne13 = (uint32_t)dst->ne[3]; + p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); + p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); + p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); + p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); + + return p; // fastdiv values and offsets are initialized later in ggml_vk_op +} + // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. // Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) @@ -863,6 +895,7 @@ struct vk_op_conv2d_dw_push_constants { struct vk_op_upscale_push_constants { uint32_t ne; uint32_t a_offset; uint32_t d_offset; + uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; float sf0; float sf1; float sf2; float sf3; @@ -2824,7 +2857,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); + ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1); ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -2836,6 +2871,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -6502,8 +6539,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_UPSCALE: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) { - return ctx->device->pipeline_upscale_f32; + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + int mode = ggml_get_op_params_i32(dst, 0); + switch (mode) { + case GGML_SCALE_MODE_NEAREST: + return ctx->device->pipeline_upscale_nearest_f32; + case GGML_SCALE_MODE_BILINEAR: + return ctx->device->pipeline_upscale_bilinear_f32; + case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS: + return ctx->device->pipeline_upscale_bilinear_ac_f32; + } } return nullptr; case GGML_OP_SCALE: @@ -6536,6 +6581,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_pad_f32; } return nullptr; + case GGML_OP_ROLL: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_roll_f32; + } + return nullptr; case GGML_OP_REPEAT: if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { return ctx->device->pipeline_repeat_f32; @@ -7085,6 +7135,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_CPY: @@ -7546,14 +7597,21 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); - const float sf0 = (float)dst->ne[0] / src0->ne[0]; - const float sf1 = (float)dst->ne[1] / src0->ne[1]; - const float sf2 = (float)dst->ne[2] / src0->ne[2]; - const float sf3 = (float)dst->ne[3] / src0->ne[3]; + float sf0 = (float)dst->ne[0] / src0->ne[0]; + float sf1 = (float)dst->ne[1] / src0->ne[1]; + float sf2 = (float)dst->ne[2] / src0->ne[2]; + float sf3 = (float)dst->ne[3] / src0->ne[3]; + + if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1); + sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1); + } ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { (uint32_t)ggml_nelements(dst), 0, 0, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], sf0, sf1, sf2, sf3, @@ -7561,123 +7619,64 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - float * op_params = (float *)dst->op_params; - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); + p.param2 = ggml_get_op_params_f32(dst, 1); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - op_params[0], op_params[1], - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun); } static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - float * op_params = (float *)dst->op_params; - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = ggml_get_op_params_f32(dst, 0); + p.param2 = ggml_get_op_params_f32(dst, 1); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, { - (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - op_params[0], op_params[1], - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun); } static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun); +} - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); +static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + const int32_t s0 = ggml_get_op_params_i32(dst, 0); + const int32_t s1 = ggml_get_op_params_i32(dst, 1); + const int32_t s2 = ggml_get_op_params_i32(dst, 2); + const int32_t s3 = ggml_get_op_params_i32(dst, 3); + const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000); + const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000); + + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + memcpy(&p.param1, &s01_packed, sizeof(float)); + memcpy(&p.param2, &s23_packed, sizeof(float)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun); } static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun); } static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, { - (uint32_t)ggml_nelements(dst), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun); } static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - const uint32_t src0_type_size = ggml_type_size(src0->type); - const uint32_t dst_type_size = ggml_type_size(dst->type); - uint32_t ne = (uint32_t)ggml_nelements(src0); if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { // Convert from number of logical elements to 2- or 4-byte units. @@ -7689,14 +7688,8 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const } } - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { - ne, - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, dryrun); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun); } static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -9033,6 +9026,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_CPY: case GGML_OP_SET_ROWS: case GGML_OP_CONT: @@ -9204,6 +9198,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_PAD: ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_ROLL: + ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_CPY: case GGML_OP_CONT: @@ -9428,6 +9426,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_CPY: case GGML_OP_SET_ROWS: case GGML_OP_CONT: @@ -10594,13 +10593,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_UPSCALE: - return op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_ACC: case GGML_OP_CONCAT: case GGML_OP_SCALE: case GGML_OP_PAD: + case GGML_OP_ROLL: case GGML_OP_DIAG_MASK_INF: - return true; case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp index 6f607380df8..74771def0f9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp @@ -3,6 +3,7 @@ layout (push_constant) uniform parameter { uint ne; uint a_offset; uint d_offset; + uint ne00; uint ne01; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; float sf0; float sf1; float sf2; float sf3; @@ -15,6 +16,61 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; +// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag +#define NEAREST 0 +#define BILINEAR 1 +#define ALIGN_CORNERS (1 << 8) + +layout (constant_id = 0) const uint scale_mode = 0; + +float fetch_nearest(uint i10, uint i11, uint i12, uint i13) { + const uint i00 = uint(i10 / p.sf0); + const uint i01 = uint(i11 / p.sf1); + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); + + return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]; +} + +float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) { + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); + const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02; + + const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00]; + const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00]; + const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00]; + const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00]; + + return + v00 * (1.0-d.x) * (1.0-d.y) + + v01 * d.x * (1.0-d.y) + + v10 * (1.0-d.x) * d.y + + v11 * d.x * d.y; +} + +float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) { + const ivec2 ne0 = ivec2(p.ne00, p.ne01); + + const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5; + const vec2 c0f = floor(c); + const vec2 d = c - c0f; + const ivec2 c0 = max(ivec2(c0f), 0); + const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1); + + return fetch_bilinear(c0, c1, d, i12, i13); +} + +float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) { + const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1); + const vec2 c0f = floor(c); + const vec2 d = c - c0f; + const ivec2 c0 = ivec2(c0f); + const ivec2 c1 = c0 + 1; + + return fetch_bilinear(c0, c1, d, i12, i13); +} + void main() { const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; @@ -27,10 +83,18 @@ void main() { const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; - const uint i00 = uint(i10 / p.sf0); - const uint i01 = uint(i11 / p.sf1); - const uint i02 = uint(i12 / p.sf2); - const uint i03 = uint(i13 / p.sf3); + float result; + switch (scale_mode) { + case NEAREST: + result = fetch_nearest(i10, i11, i12, i13); + break; + case BILINEAR: + result = interpolate_bilinear(i10, i11, i12, i13); + break; + case BILINEAR | ALIGN_CORNERS: + result = interpolate_bilinear_align_corners(i10, i11, i12, i13); + break; + } - data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); + data_d[p.d_offset + idx] = D_TYPE(result); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index f4ba3aa0d8f..d4a4e4c5290 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -653,6 +653,8 @@ void process_shaders() { string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); + string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + for (auto &c : compiles) { c.wait(); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c8d1f144d03..5ae1c527df6 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -473,6 +473,14 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; } +const char * ggml_version(void) { + return GGML_VERSION; +} + +const char * ggml_commit(void) { + return GGML_COMMIT; +} + // // timing // From 6d64e4abf31813611cf81892eeb98b540f1f0ed8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 12 Jul 2025 16:22:40 +0300 Subject: [PATCH 0096/1348] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 37344329c69..ca009adb83b 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -e3b764668ffa41996815c175ab7e94da87cdb5b1 +d62df60a07ba3deeb85e5cfc9b1ee07645ff35e2 From 6ddff4d96aff99db0dc702d27aa18a7462c1b04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 12 Jul 2025 16:26:16 +0300 Subject: [PATCH 0097/1348] talk-llama : sync llama.cpp ggml-ci --- examples/talk-llama/llama-arch.cpp | 195 +- examples/talk-llama/llama-arch.h | 18 +- examples/talk-llama/llama-batch.cpp | 28 +- examples/talk-llama/llama-batch.h | 9 +- examples/talk-llama/llama-chat.cpp | 15 + examples/talk-llama/llama-chat.h | 1 + examples/talk-llama/llama-graph.cpp | 241 +- examples/talk-llama/llama-graph.h | 106 +- examples/talk-llama/llama-hparams.cpp | 8 +- examples/talk-llama/llama-hparams.h | 3 + .../llama-kv-cache-unified-iswa.cpp | 44 +- .../talk-llama/llama-kv-cache-unified-iswa.h | 6 +- .../talk-llama/llama-kv-cache-unified.cpp | 277 +- examples/talk-llama/llama-kv-cache-unified.h | 86 +- examples/talk-llama/llama-kv-cells.h | 72 +- examples/talk-llama/llama-memory-hybrid.cpp | 11 +- examples/talk-llama/llama-memory-hybrid.h | 4 +- .../talk-llama/llama-memory-recurrent.cpp | 20 +- examples/talk-llama/llama-model.cpp | 3179 +++++++++++++---- examples/talk-llama/llama-model.h | 17 + examples/talk-llama/llama-quant.cpp | 1 + examples/talk-llama/llama-vocab.cpp | 15 +- examples/talk-llama/llama-vocab.h | 41 + examples/talk-llama/llama.h | 40 - 24 files changed, 3289 insertions(+), 1148 deletions(-) diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index aa21108a4bd..e63ab284bc3 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -45,6 +45,9 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_MAMBA2, "mamba2" }, + { LLM_ARCH_JAMBA, "jamba" }, + { LLM_ARCH_FALCON_H1, "falcon-h1" }, { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COHERE2, "cohere2" }, @@ -70,6 +73,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ARWKV7, "arwkv7" }, { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, + { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" }, { LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, @@ -77,6 +81,9 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DOTS1, "dots1" }, { LLM_ARCH_ARCEE, "arcee" }, { LLM_ARCH_ERNIE4_5, "ernie4_5" }, + { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, + { LLM_ARCH_SMOLLM3, "smollm3" }, + { LLM_ARCH_LFM2, "lfm2" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -149,7 +156,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, @@ -170,6 +176,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" }, { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, @@ -182,6 +189,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -1004,6 +1013,77 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, }, }, + { + LLM_ARCH_MAMBA2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + }, + }, + { + LLM_ARCH_JAMBA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" }, + { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, + { + LLM_ARCH_FALCON_H1, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_XVERSE, { @@ -1564,6 +1644,43 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, + { + LLM_ARCH_GRANITE_HYBRID, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + // mamba(2) ssm layers + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + // attention layers + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + // dense FFN + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + // moe FFN + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + // shared expert + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + }, + }, { LLM_ARCH_CHAMELEON, { @@ -1676,6 +1793,67 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_HUNYUAN_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, + { + LLM_ARCH_SMOLLM3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_LFM2, + { + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" }, + { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" }, + { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + } + }, { LLM_ARCH_UNKNOWN, { @@ -1760,7 +1938,11 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}}, {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}}, + {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -1839,6 +2021,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, + {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} @@ -1894,6 +2079,7 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) { bool llm_arch_is_recurrent(const llm_arch & arch) { switch (arch) { case LLM_ARCH_MAMBA: + case LLM_ARCH_MAMBA2: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: @@ -1905,9 +2091,12 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { } bool llm_arch_is_hybrid(const llm_arch & arch) { - // TODO: There are currently no hybrid models! Once there are, this will be - // the place to identify them switch (arch) { + case LLM_ARCH_JAMBA: + case LLM_ARCH_FALCON_H1: + case LLM_ARCH_GRANITE_HYBRID: + case LLM_ARCH_LFM2: + return true; default: return false; } diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index 0771ec3ebad..1f973259524 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -49,6 +49,9 @@ enum llm_arch { LLM_ARCH_GEMMA3N, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, + LLM_ARCH_MAMBA2, + LLM_ARCH_JAMBA, + LLM_ARCH_FALCON_H1, LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, LLM_ARCH_COHERE2, @@ -74,6 +77,7 @@ enum llm_arch { LLM_ARCH_ARWKV7, LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, + LLM_ARCH_GRANITE_HYBRID, LLM_ARCH_CHAMELEON, LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_PLM, @@ -81,6 +85,9 @@ enum llm_arch { LLM_ARCH_DOTS1, LLM_ARCH_ARCEE, LLM_ARCH_ERNIE4_5, + LLM_ARCH_HUNYUAN_MOE, + LLM_ARCH_SMOLLM3, + LLM_ARCH_LFM2, LLM_ARCH_UNKNOWN, }; @@ -153,7 +160,6 @@ enum llm_kv { LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, - LLM_KV_ATTENTION_LAYER_INDICES, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, @@ -174,6 +180,7 @@ enum llm_kv { LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, LLM_KV_SSM_TIME_STEP_RANK, + LLM_KV_SSM_GROUP_COUNT, LLM_KV_SSM_DT_B_C_RMS, LLM_KV_WKV_HEAD_SIZE, @@ -221,6 +228,8 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_SHORTCONV_L_CACHE, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, @@ -291,8 +300,12 @@ enum llm_tensor { LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X, LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_DT_NORM, LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_B_NORM, + LLM_TENSOR_SSM_C_NORM, LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, @@ -386,6 +399,9 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_SHORTCONV_CONV, + LLM_TENSOR_SHORTCONV_INPROJ, + LLM_TENSOR_SHORTCONV_OUTPROJ, }; enum llm_tensor_layer { diff --git a/examples/talk-llama/llama-batch.cpp b/examples/talk-llama/llama-batch.cpp index 91b1d6078a2..3bc8554e51c 100644 --- a/examples/talk-llama/llama-batch.cpp +++ b/examples/talk-llama/llama-batch.cpp @@ -166,6 +166,8 @@ bool llama_batch_allocr::init( // note: tracking the other way around is not necessary for now //seq_cpl[s0][s1] = true; + + has_cpl = true; } } } @@ -405,6 +407,10 @@ uint32_t llama_batch_allocr::get_n_outputs() const { return n_outputs; } +uint32_t llama_batch_allocr::get_n_used() const { + return n_used; +} + std::vector & llama_batch_allocr::get_out_ids() { return out_ids; } @@ -420,6 +426,8 @@ llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const { void llama_batch_allocr::split_reset() { out_ids.clear(); + n_used = 0; + used.clear(); used.resize(get_n_tokens(), false); @@ -444,6 +452,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) { idxs.push_back(cur_idx); used[cur_idx] = true; + ++n_used; ++cur_idx; @@ -459,9 +468,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) { return ubatch_add(idxs, idxs.size(), false); } -llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) { +llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) { + if (sequential && has_cpl) { + LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__); + + return {}; + } + std::vector cur_seq_set; + llama_seq_id last_seq_id = -1; + // determine the non-overlapping sequence sets participating in this ubatch for (int32_t i = 0; i < batch.n_tokens; ++i) { if (used[i]) { @@ -478,9 +495,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) { } } + // accept only increasing sequence ids + if (sequential) { + add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1); + } + if (add) { cur_seq_set.push_back(seq_set[i]); + last_seq_id = batch.seq_id[i][0]; + if (cur_seq_set.size() > n_ubatch) { break; } @@ -529,6 +553,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) { idxs_per_seq[s].push_back(idx); used[idx] = true; + ++n_used; ++cur_idx[s]; } @@ -570,6 +595,7 @@ llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) { idxs.push_back(cur_idx); used[cur_idx] = true; + ++n_used; if (idxs.size() >= n_ubatch) { break; diff --git a/examples/talk-llama/llama-batch.h b/examples/talk-llama/llama-batch.h index d2c5376188a..3420803ff94 100644 --- a/examples/talk-llama/llama-batch.h +++ b/examples/talk-llama/llama-batch.h @@ -54,6 +54,7 @@ class llama_batch_allocr { uint32_t get_n_tokens() const; uint32_t get_n_outputs() const; + uint32_t get_n_used() const; // the array of output indices in the order they were encountered during the ubatch splitting std::vector & get_out_ids(); @@ -69,7 +70,8 @@ class llama_batch_allocr { llama_ubatch split_simple(uint32_t n_ubatch); // make ubatches of equal-length sequences sets - llama_ubatch split_equal(uint32_t n_ubatch); + // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids + llama_ubatch split_equal(uint32_t n_ubatch, bool sequential); // sequence-set-wise split - each ubatch contains a single sequence-set llama_ubatch split_seq(uint32_t n_ubatch); @@ -112,6 +114,9 @@ class llama_batch_allocr { using pos_set_t = std::set; using seq_cpl_t = std::vector; + // helper flag to quickly determine if there are any coupled sequences in the batch + bool has_cpl; + std::vector seq_pos; // seq_pos[s]: the set of positions in sequence s std::vector seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1 @@ -125,6 +130,8 @@ class llama_batch_allocr { // batch indices of the output std::vector out_ids; + uint32_t n_used; + // used[i] indicates if token i has already been used in a previous ubatch std::vector used; diff --git a/examples/talk-llama/llama-chat.cpp b/examples/talk-llama/llama-chat.cpp index 5d317f4ee62..cbc19d3c40c 100644 --- a/examples/talk-llama/llama-chat.cpp +++ b/examples/talk-llama/llama-chat.cpp @@ -64,6 +64,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, + { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -185,6 +186,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA4; } else if (tmpl_contains("<|endofuserprompt|>")) { return LLM_CHAT_TEMPLATE_DOTS1; + } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) { + return LLM_CHAT_TEMPLATE_HUNYUAN_MOE; } return LLM_CHAT_TEMPLATE_UNKNOWN; } @@ -665,6 +668,18 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|response|>"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) { + // tencent/Hunyuan-A13B-Instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "<|startoftext|>" << message->content << "<|extra_4|>"; + } else if (role == "assistant") { + ss << "<|startoftext|>" << message->content << "<|eos|>"; + } else { + ss << "<|startoftext|>" << message->content << "<|extra_0|>"; + } + } } else { // template not supported return -1; diff --git a/examples/talk-llama/llama-chat.h b/examples/talk-llama/llama-chat.h index 38800010ae4..b621fda2816 100644 --- a/examples/talk-llama/llama-chat.h +++ b/examples/talk-llama/llama-chat.h @@ -44,6 +44,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_LLAMA4, LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_DOTS1, + LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index 010300df609..a248a7ec223 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -281,19 +281,22 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { } void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { - if (self_kq_mask) { - mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); - } + mctx->set_input_k_idxs(self_k_idxs, ubatch); + mctx->set_input_v_idxs(self_v_idxs, ubatch); + + mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { - if (self_kq_mask) { - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); - } + mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); + mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - if (self_kq_mask_swa) { - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); - } + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + + mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); + mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { @@ -333,27 +336,8 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { - if (self_kq_mask) { - mctx->get_attn()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); - } - - const int64_t n_rs = mctx->get_recr()->get_n_rs(); - - if (s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); - int32_t * data = (int32_t *) s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_rs; ++i) { - data[i] = mctx->get_recr()->s_copy(i); - } - } -} - -void llm_graph_input_one::set_input(const llama_ubatch *) { - GGML_ASSERT(one && ggml_nelements(one) == 1); - float f_one = 1.0f; - ggml_backend_tensor_set(one, &f_one, 0, sizeof(float)); + inp_attn->set_input(ubatch); + inp_rs->set_input(ubatch); } // @@ -987,33 +971,6 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t return pos_bias; } -llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { - const auto * mctx_cur = static_cast(mctx); - - auto inp = std::make_unique(hparams, cparams, mctx_cur); - - { - GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Hybrid recurrent is not supported with SWA attention layers"); - - const auto n_kv = inp->mctx->get_attn()->get_n_kv(); - - inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp->self_kq_mask, "KQ_mask", -1); - ggml_set_input(inp->self_kq_mask); - - inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; - } - - { - const auto n_rs = mctx_cur->get_recr()->get_n_rs(); - - inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs); - ggml_set_input(inp->s_copy); - } - - return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); -} - ggml_tensor * llm_graph_context::build_attn_mha( ggml_cgraph * gf, ggml_tensor * q, @@ -1135,8 +1092,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con auto inp = std::make_unique(hparams, cparams); // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_kq_mask, "KQ_mask", -1); + inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); ggml_set_input(inp->kq_mask); inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; @@ -1188,8 +1144,12 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } -llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { - const auto * mctx_cur = static_cast(mctx); +static std::unique_ptr build_attn_inp_kv_unified_impl( + ggml_context * ctx0, + const llama_ubatch & ubatch, + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_unified_context * mctx_cur) { auto inp = std::make_unique(hparams, cparams, mctx_cur); @@ -1197,14 +1157,25 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); const auto n_kv = mctx_cur->get_n_kv(); + const auto n_tokens = ubatch.n_tokens; + + inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); + inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp->self_kq_mask, "KQ_mask", -1); + inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; } + return inp; +} + +llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur); + return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp)); } @@ -1226,12 +1197,15 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - const auto * mctx_cur = static_cast(mctx); + const auto * mctx_cur = inp->mctx; // store to KV cache { - ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); + const auto & k_idxs = inp->get_k_idxs(); + const auto & v_idxs = inp->get_v_idxs(); + + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il)); + ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il)); } const auto & kq_mask = inp->get_kq_mask(); @@ -1282,7 +1256,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, v_cur); } - const auto * mctx_iswa = static_cast(mctx); + const auto * mctx_iswa = inp->mctx; const bool is_swa = hparams.is_swa(il); @@ -1290,11 +1264,15 @@ ggml_tensor * llm_graph_context::build_attn( // optionally store to KV cache if (k_cur) { - ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); + const auto & k_idxs = is_swa ? inp->get_k_idxs_swa() : inp->get_k_idxs(); + + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il)); } if (v_cur) { - ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); + const auto & v_idxs = is_swa ? inp->get_v_idxs_swa() : inp->get_v_idxs(); + + ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il)); } const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); @@ -1326,7 +1304,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; - inp->cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); ggml_set_input(inp->cross_kq_mask); inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; @@ -1376,56 +1354,9 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } -ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_mem_hybrid * inp, - ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_b, - ggml_tensor * v_mla, - float kq_scale, - int il) const { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); - - const auto * mctx_cur = static_cast(mctx)->get_attn(); - - // store to KV cache - { - ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); - } - - const auto & kq_mask = inp->get_kq_mask(); - - ggml_tensor * q = q_cur; - ggml_tensor * k = mctx_cur->get_k(ctx0, il); - ggml_tensor * v = mctx_cur->get_v(ctx0, il); - - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); - cb(cur, "kqv_out", il); - - if (wo) { - cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4) { - // GLM4 seems to have numerical issues with half-precision accumulators - ggml_mul_mat_set_prec(cur, GGML_PREC_F32); - } - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; -} - +// TODO: maybe separate the inner implementation into a separate function +// like with the non-sliding window equivalent +// once sliding-window hybrid caches are a thing. llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const { const auto * mctx_cur = static_cast(mctx); @@ -1434,8 +1365,10 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif { const auto n_kv = mctx_cur->get_base()->get_n_kv(); - inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp->self_kq_mask, "KQ_mask", -1); + inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch); + inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch); + + inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1446,8 +1379,10 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif const auto n_kv = mctx_cur->get_swa()->get_n_kv(); - inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1); + inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch); + inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch); + + inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); ggml_set_input(inp->self_kq_mask_swa); inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; @@ -1466,7 +1401,7 @@ ggml_tensor * llm_graph_context::build_rs( uint32_t kv_head, uint32_t kv_size, int32_t rs_zero, - bool avoid_copies) const { + const llm_graph_get_rows_fn & get_state_rows) const { ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_size); @@ -1475,19 +1410,11 @@ ggml_tensor * llm_graph_context::build_rs( ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0)); ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0)); - ggml_tensor * output_states; - - if (!avoid_copies) { - // copy states - // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv - // {state_size, kv_size} -> {state_size, n_seqs} - output_states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0)); - ggml_build_forward_expand(gf, output_states); - } else { - // FIXME: make the gathering operation happen before the copy below - // (maybe with an optional lambda function passed as a parameter instead of `avoid_copies`?) - output_states = states; - } + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // {state_size, kv_size} -> {state_size, n_seqs} + ggml_tensor * output_states = get_state_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0)); + ggml_build_forward_expand(gf, output_states); // copy extra states which won't be changed further (between n_seqs and n_kv) ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0])); @@ -1499,8 +1426,9 @@ ggml_tensor * llm_graph_context::build_rs( return output_states; } -llm_graph_input_rs * llm_graph_context::build_rs_inp() const { - const auto * mctx_cur = static_cast(mctx); +static std::unique_ptr build_rs_inp_impl( + ggml_context * ctx0, + const llama_memory_recurrent_context * mctx_cur) { auto inp = std::make_unique(mctx_cur); @@ -1509,31 +1437,27 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const { inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs); ggml_set_input(inp->s_copy); - return (llm_graph_input_rs *) res->add_input(std::move(inp)); + return inp; } -ggml_tensor * llm_graph_context::build_rs( - llm_graph_input_rs * inp, - ggml_cgraph * gf, - ggml_tensor * s, - int32_t state_size, - int32_t n_seqs, - bool avoid_copies) const { +llm_graph_input_rs * llm_graph_context::build_rs_inp() const { const auto * mctx_cur = static_cast(mctx); - return build_rs(gf, s, inp->s_copy, state_size, n_seqs, mctx_cur->get_n_rs(), mctx_cur->get_head(), mctx_cur->get_size(), mctx_cur->get_rs_z(), avoid_copies); + auto inp = build_rs_inp_impl(ctx0, mctx_cur); + + return (llm_graph_input_rs *) res->add_input(std::move(inp)); } ggml_tensor * llm_graph_context::build_rs( - llm_graph_input_mem_hybrid * inp, + llm_graph_input_rs * inp, ggml_cgraph * gf, ggml_tensor * s, int32_t state_size, int32_t n_seqs, - bool avoid_copies) const { - const auto * mctx_cur = static_cast(mctx)->get_recr(); + const llm_graph_get_rows_fn & get_state_rows) const { + const auto * kv_state = inp->mctx; - return build_rs(gf, s, inp->s_copy, state_size, n_seqs, mctx_cur->get_n_rs(), mctx_cur->get_head(), mctx_cur->get_size(), mctx_cur->get_rs_z(), avoid_copies); + return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows); } ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( @@ -1578,6 +1502,17 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store( ); } +llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp_rs = build_rs_inp_impl(ctx0, mctx_cur->get_recr()); + auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); + + auto inp = std::make_unique(std::move(inp_attn), std::move(inp_rs), mctx_cur); + + return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); +} + void llm_graph_context::build_pooling( ggml_cgraph * gf, ggml_tensor * cls, diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index ceddb6021f1..fbf8e288956 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -228,8 +228,8 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i { ggml_tensor * get_kq_mask() const { return kq_mask_cnv; } - ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch] - ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch] + ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1] + ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1] const llama_hparams & hparams; const llama_cparams & cparams; @@ -249,10 +249,16 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } + ggml_tensor * get_v_idxs() const { return self_v_idxs; } + ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch] + ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] + ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] + + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1] const llama_hparams & hparams; const llama_cparams & cparams; @@ -274,13 +280,23 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } + ggml_tensor * get_v_idxs() const { return self_v_idxs; } + ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; } + ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; } + ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch] - ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch] + ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] + ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] + ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch] + ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] + + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1] + ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch, 1, 1] + ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch, 1, 1] const llama_hparams & hparams; const llama_cparams & cparams; @@ -297,8 +313,8 @@ class llm_graph_input_attn_cross : public llm_graph_input_i { ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; } - ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] - ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] + ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1] + ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1] const llama_cross * cross = nullptr; }; @@ -306,41 +322,25 @@ class llm_graph_input_attn_cross : public llm_graph_input_i { class llm_graph_input_mem_hybrid : public llm_graph_input_i { public: llm_graph_input_mem_hybrid( - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_memory_hybrid_context * mctx) : - hparams(hparams), - cparams(cparams), - mctx(mctx) { - } + std::unique_ptr inp_attn, + std::unique_ptr inp_rs, + const llama_memory_hybrid_context * mctx) : + inp_attn(std::move(inp_attn)), + inp_rs(std::move(inp_rs)), + mctx(mctx) { } virtual ~llm_graph_input_mem_hybrid() = default; void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_copy; // I32 [kv_size] + std::unique_ptr inp_attn; + std::unique_ptr inp_rs; - ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } - - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch] - - const llama_hparams & hparams; - const llama_cparams & cparams; + llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); } + llm_graph_input_rs * get_recr() const { return inp_rs.get(); } const llama_memory_hybrid_context * mctx; }; -// TODO: remove this when ggml_scale_add is implemented -class llm_graph_input_one : public llm_graph_input_i { -public: - llm_graph_input_one() {} - virtual ~llm_graph_input_one() = default; - - void set_input(const llama_ubatch *) override; - - ggml_tensor * one = nullptr; // F32 -}; - // // llm_graph_result // @@ -424,6 +424,9 @@ struct llm_graph_params { const llm_graph_cb & cb; }; +// used in build_rs to properly order writes and avoid unnecessary copies +using llm_graph_get_rows_fn = std::function; + struct llm_graph_context { const llm_arch arch; @@ -554,8 +557,6 @@ struct llm_graph_context { ggml_tensor * build_inp_pos_bucket_dec() const; ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const; - llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; - // // attention // @@ -631,18 +632,6 @@ struct llm_graph_context { float kq_scale, int il) const; - ggml_tensor * build_attn( - llm_graph_input_mem_hybrid * inp, - ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] - ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] - ggml_tensor * kq_b, - ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] - float kq_scale, - int il) const; // // recurrent // @@ -663,7 +652,7 @@ struct llm_graph_context { uint32_t kv_head, uint32_t kv_size, int32_t rs_zero, - bool avoid_copies = false) const; + const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const; llm_graph_input_rs * build_rs_inp() const; @@ -673,15 +662,7 @@ struct llm_graph_context { ggml_tensor * s, int32_t state_size, int32_t n_seqs, - bool avoid_copies = false) const; - - ggml_tensor * build_rs( - llm_graph_input_mem_hybrid * inp, - ggml_cgraph * gf, - ggml_tensor * s, - int32_t state_size, - int32_t n_seqs, - bool avoid_copies = false) const; + const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const; ggml_tensor * build_rwkv_token_shift_load( llm_graph_input_rs * inp, @@ -693,6 +674,11 @@ struct llm_graph_context { ggml_tensor * token_shift, const llama_ubatch & ubatch, int il) const; + // + // hybrid + // + + llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; // // pooling diff --git a/examples/talk-llama/llama-hparams.cpp b/examples/talk-llama/llama-hparams.cpp index bba7a12dc54..7aa736e2f39 100644 --- a/examples/talk-llama/llama-hparams.cpp +++ b/examples/talk-llama/llama-hparams.cpp @@ -71,9 +71,15 @@ uint32_t llama_hparams::n_embd_r() const { return token_shift_count * n_embd; } + if (n_shortconv_l_cache != 0) { + // for LFM2 models + return n_embd * (n_shortconv_l_cache - 1); + } + // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; + // Corresponds to Mamba's conv_states size + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); } uint32_t llama_hparams::n_embd_s() const { diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index e85afe145a9..d0500e4d0fd 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -55,6 +55,8 @@ struct llama_hparams { struct llama_hparams_posnet posnet; struct llama_hparams_convnext convnext; + uint32_t n_shortconv_l_cache = 0; + std::array n_head_arr; std::array n_head_kv_arr; std::array n_ff_arr; @@ -114,6 +116,7 @@ struct llama_hparams { uint32_t ssm_d_inner = 0; uint32_t ssm_d_state = 0; uint32_t ssm_dt_rank = 0; + uint32_t ssm_n_group = 0; // for hybrid state space models std::array recurrent_layer_arr; diff --git a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp index d1f839b63aa..fe207ad5360 100644 --- a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +++ b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp @@ -113,20 +113,25 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all ubatches.push_back(std::move(ubatch)); // NOLINT } - auto heads_base = kv_base->prepare(ubatches); - if (heads_base.empty()) { + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split break; } - auto heads_swa = kv_swa->prepare(ubatches); - if (heads_swa.empty()) { + auto sinfos_base = kv_base->prepare(ubatches); + if (sinfos_base.empty()) { break; } - assert(heads_base.size() == heads_swa.size()); + auto sinfos_swa = kv_swa->prepare(ubatches); + if (sinfos_swa.empty()) { + break; + } + + assert(sinfos_base.size() == sinfos_swa.size()); return std::make_unique( - this, std::move(heads_base), std::move(heads_swa), std::move(ubatches)); + this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches)); } while (false); // if it fails, try equal split @@ -135,7 +140,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all std::vector ubatches; while (true) { - auto ubatch = balloc.split_equal(n_ubatch); + auto ubatch = balloc.split_equal(n_ubatch, false); if (ubatch.n_tokens == 0) { break; @@ -144,20 +149,25 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all ubatches.push_back(std::move(ubatch)); // NOLINT } - auto heads_base = kv_base->prepare(ubatches); - if (heads_base.empty()) { + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + + auto sinfos_base = kv_base->prepare(ubatches); + if (sinfos_base.empty()) { break; } - auto heads_swa = kv_swa->prepare(ubatches); - if (heads_swa.empty()) { + auto sinfos_swa = kv_swa->prepare(ubatches); + if (sinfos_swa.empty()) { break; } - assert(heads_base.size() == heads_swa.size()); + assert(sinfos_base.size() == sinfos_swa.size()); return std::make_unique( - this, std::move(heads_base), std::move(heads_swa), std::move(ubatches)); + this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches)); } while (false); // TODO: if we fail again, we should attempt different splitting strategies @@ -220,13 +230,13 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv, - std::vector heads_base, - std::vector heads_swa, + slot_info_vec_t sinfos_base, + slot_info_vec_t sinfos_swa, std::vector ubatches) : ubatches(std::move(ubatches)), // note: here we copy the ubatches. not sure if this is ideal - ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(heads_base), this->ubatches)), - ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(heads_swa), this->ubatches)), + ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)), + ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)), status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } diff --git a/examples/talk-llama/llama-kv-cache-unified-iswa.h b/examples/talk-llama/llama-kv-cache-unified-iswa.h index 46c1ed614f2..23205d826b2 100644 --- a/examples/talk-llama/llama-kv-cache-unified-iswa.h +++ b/examples/talk-llama/llama-kv-cache-unified-iswa.h @@ -74,6 +74,8 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { class llama_kv_cache_unified_iswa_context : public llama_memory_context_i { public: + using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; + // used for errors llama_kv_cache_unified_iswa_context(llama_memory_status status); @@ -90,8 +92,8 @@ class llama_kv_cache_unified_iswa_context : public llama_memory_context_i { // used to create a batch processing context from a batch llama_kv_cache_unified_iswa_context( llama_kv_cache_unified_iswa * kv, - std::vector heads_base, - std::vector heads_swa, + slot_info_vec_t sinfos_base, + slot_info_vec_t sinfos_swa, std::vector ubatches); virtual ~llama_kv_cache_unified_iswa_context(); diff --git a/examples/talk-llama/llama-kv-cache-unified.cpp b/examples/talk-llama/llama-kv-cache-unified.cpp index 7f7b162ffd7..d3129cc5328 100644 --- a/examples/talk-llama/llama-kv-cache-unified.cpp +++ b/examples/talk-llama/llama-kv-cache-unified.cpp @@ -156,6 +156,13 @@ llama_kv_cache_unified::llama_kv_cache_unified( const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG"); debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0; + + const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); + supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0; + + if (!supports_set_rows) { + LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__); + } } void llama_kv_cache_unified::clear(bool data) { @@ -353,13 +360,18 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch( ubatches.push_back(std::move(ubatch)); // NOLINT } - auto heads = prepare(ubatches); - if (heads.empty()) { + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + + auto sinfos = prepare(ubatches); + if (sinfos.empty()) { break; } return std::make_unique( - this, std::move(heads), std::move(ubatches)); + this, std::move(sinfos), std::move(ubatches)); } while (false); return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); @@ -402,12 +414,13 @@ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lct return std::make_unique(this, lctx, do_shift, std::move(dinfo)); } -llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector & ubatches) { - llama_kv_cache_unified::ubatch_heads res; +llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector & ubatches) { + llama_kv_cache_unified::slot_info_vec_t res; struct state { uint32_t head_old; // old position of the head, before placing the ubatch - uint32_t head_new; // new position of the head, after placing the ubatch + + slot_info sinfo; // slot info for the ubatch llama_kv_cells_unified cells; // copy of the old cells, before placing the ubatch }; @@ -418,26 +431,29 @@ llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std:: bool success = true; for (const auto & ubatch : ubatches) { + // non-continuous slots require support for ggml_set_rows() + const bool cont = supports_set_rows ? false : true; + // only find a suitable slot for the ubatch. don't modify the cells yet - const int32_t head_new = find_slot(ubatch); - if (head_new < 0) { + const auto sinfo_new = find_slot(ubatch, cont); + if (sinfo_new.empty()) { success = false; break; } // remeber the position that we found - res.push_back(head_new); + res.push_back(sinfo_new); // store the old state of the cells in the recovery stack - states.push_back({head, (uint32_t) head_new, cells.cp(head_new, ubatch.n_tokens)}); + states.push_back({head, sinfo_new, cells.cp(sinfo_new.idxs)}); // now emplace the ubatch - apply_ubatch(head_new, ubatch); + apply_ubatch(sinfo_new, ubatch); } // iterate backwards and restore the cells to their original state for (auto it = states.rbegin(); it != states.rend(); ++it) { - cells.set(it->head_new, it->cells); + cells.set(it->sinfo.idxs, it->cells); head = it->head_old; } @@ -539,7 +555,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d return updated; } -int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { +llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const { const uint32_t n_tokens = ubatch.n_tokens; uint32_t head_cur = this->head; @@ -552,7 +568,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { if (n_tokens > cells.size()) { LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size()); - return -1; + return { }; } if (debug > 0) { @@ -615,15 +631,26 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { uint32_t n_tested = 0; + // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head + // for non-continuous slots, we test the tokens one by one + const uint32_t n_test = cont ? n_tokens : 1; + + slot_info res; + + auto & idxs = res.idxs; + + idxs.reserve(n_tokens); + while (true) { - if (head_cur + n_tokens > cells.size()) { + if (head_cur + n_test > cells.size()) { n_tested += cells.size() - head_cur; head_cur = 0; continue; } - bool found = true; - for (uint32_t i = 0; i < n_tokens; i++) { + for (uint32_t i = 0; i < n_test; i++) { + const auto idx = head_cur; + //const llama_pos pos = ubatch.pos[i]; //const llama_seq_id seq_id = ubatch.seq_id[i][0]; @@ -633,19 +660,19 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { // - (disabled) mask causally, if the sequence is the same as the one we are inserting // - mask SWA, using current max pos for that sequence in the cache // always insert in the cell with minimum pos - bool can_use = cells.is_empty(head_cur + i); + bool can_use = cells.is_empty(idx); - if (!can_use && cells.seq_count(head_cur + i) == 1) { - const llama_pos pos_cell = cells.pos_get(head_cur + i); + if (!can_use && cells.seq_count(idx) == 1) { + const llama_pos pos_cell = cells.pos_get(idx); // (disabled) causal mask // note: it's better to purge any "future" tokens beforehand - //if (cells.seq_has(head_cur + i, seq_id)) { + //if (cells.seq_has(idx, seq_id)) { // can_use = pos_cell >= pos; //} if (!can_use) { - const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i); + const llama_seq_id seq_id_cell = cells.seq_get(idx); // SWA mask if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) { @@ -654,28 +681,39 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { } } - if (!can_use) { - found = false; - head_cur += i + 1; - n_tested += i + 1; + head_cur++; + n_tested++; + + if (can_use) { + idxs.push_back(idx); + } else { break; } } - if (found) { + if (idxs.size() == n_tokens) { break; } + if (cont) { + idxs.clear(); + } + if (n_tested >= cells.size()) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); - return -1; + return { }; } } - return head_cur; + // we didn't find a suitable slot - return empty result + if (idxs.size() < n_tokens) { + res.clear(); + } + + return res; } -void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) { +void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -683,22 +721,26 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch seq_pos_max_rm[s] = -1; } + assert(ubatch.n_tokens == sinfo.idxs.size()); + for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { - if (!cells.is_empty(head_cur + i)) { - assert(cells.seq_count(head_cur + i) == 1); + const auto idx = sinfo.idxs.at(i); - const llama_seq_id seq_id = cells.seq_get(head_cur + i); - const llama_pos pos = cells.pos_get(head_cur + i); + if (!cells.is_empty(idx)) { + assert(cells.seq_count(idx) == 1); + + const llama_seq_id seq_id = cells.seq_get(idx); + const llama_pos pos = cells.pos_get(idx); seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos); - cells.rm(head_cur + i); + cells.rm(idx); } - cells.pos_set(head_cur + i, ubatch.pos[i]); + cells.pos_set(idx, ubatch.pos[i]); for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) { - cells.seq_add(head_cur + i, ubatch.seq_id[i][s]); + cells.seq_add(idx, ubatch.seq_id[i][s]); } } @@ -719,7 +761,7 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch } // move the head at the end of the slot - head = head_cur + ubatch.n_tokens; + head = sinfo.idxs.back() + 1; } bool llama_kv_cache_unified::get_can_shift() const { @@ -772,47 +814,133 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint 0); } -ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const { +ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * k = layers[ikv].k; + const int64_t n_embd_k_gqa = k->ne[0]; const int64_t n_tokens = k_cur->ne[2]; + k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens); + + if (k_idxs && supports_set_rows) { + return ggml_set_rows(ctx, k, k_cur, k_idxs); + } + + // TODO: fallback to old ggml_cpy() method for backwards compatibility + // will be removed when ggml_set_rows() is adopted by all backends + ggml_tensor * k_view = ggml_view_1d(ctx, k, - n_tokens*hparams.n_embd_k_gqa(il), - ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head_cur); + n_tokens*n_embd_k_gqa, + ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head()); return ggml_cpy(ctx, k_cur, k_view); } -ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const { +ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * v = layers[ikv].v; + const int64_t n_embd_v_gqa = v->ne[0]; const int64_t n_tokens = v_cur->ne[2]; - v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens); + v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens); + + if (v_idxs && supports_set_rows) { + if (!v_trans) { + return ggml_set_rows(ctx, v, v_cur, v_idxs); + } + + // the row becomes a single element + ggml_tensor * v_view = ggml_reshape_3d(ctx, v, 1, v->ne[1], v->ne[0]); + + // note: the V cache is transposed when not using flash attention + v_cur = ggml_permute(ctx, ggml_reshape_3d(ctx, v_cur, v_cur->ne[0], 1, v_cur->ne[1]), 2, 0, 1, 3); + + // note: we can be more explicit here at the cost of extra cont + // however, above we take advantage that a row of single element is always continuous regardless of the row stride + //v_cur = ggml_transpose(ctx, v_cur); + //v_cur = ggml_cont_3d(ctx, v_cur, 1, v_cur->ne[0], v_cur->ne[1]); + + // we broadcast the KV indices n_embd_v_gqa times + // v [1, n_kv, n_embd_v_gqa] + // v_cur [1, n_tokens, n_embd_v_gqa] + // v_idxs [n_tokens, 1, 1] + return ggml_set_rows(ctx, v_view, v_cur, v_idxs); + } + + // TODO: fallback to old ggml_cpy() method for backwards compatibility + // will be removed when ggml_set_rows() is adopted by all backends ggml_tensor * v_view = nullptr; if (!v_trans) { v_view = ggml_view_1d(ctx, v, - n_tokens*hparams.n_embd_v_gqa(il), - ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head_cur); + n_tokens*n_embd_v_gqa, + ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head()); } else { - // note: the V cache is transposed when not using flash attention - v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il), - (v->ne[1])*ggml_element_size(v), - (head_cur)*ggml_element_size(v)); - v_cur = ggml_transpose(ctx, v_cur); + + v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa, + (v->ne[1] )*ggml_element_size(v), + (sinfo.head())*ggml_element_size(v)); } return ggml_cpy(ctx, v_cur, v_view); } +ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { + const uint32_t n_tokens = ubatch.n_tokens; + + ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens); + + ggml_set_input(k_idxs); + + return k_idxs; +} + +ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { + const uint32_t n_tokens = ubatch.n_tokens; + + ggml_tensor * v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens); + + ggml_set_input(v_idxs); + + return v_idxs; +} + +void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { + if (!supports_set_rows) { + return; + } + + const uint32_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + int64_t * data = (int64_t *) dst->data; + + for (int64_t i = 0; i < n_tokens; ++i) { + data[i] = sinfo.idxs.at(i); + } +} + +void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { + if (!supports_set_rows) { + return; + } + + const uint32_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + int64_t * data = (int64_t *) dst->data; + + for (int64_t i = 0; i < n_tokens; ++i) { + data[i] = sinfo.idxs.at(i); + } +} + void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { const uint32_t n_tokens = ubatch->n_tokens; @@ -1552,13 +1680,15 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell ubatch.seq_id[i] = &dest_seq_id; } - const auto head_cur = find_slot(ubatch); - if (head_cur < 0) { + const auto sinfo = find_slot(ubatch, true); + if (sinfo.empty()) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } - apply_ubatch(head_cur, ubatch); + apply_ubatch(sinfo, ubatch); + + const auto head_cur = sinfo.head(); // keep the head at the old position because we will read the KV data into it in state_read_data() head = head_cur; @@ -1744,7 +1874,11 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_stat llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) { n_kv = kv->get_size(); - head = 0; + + // create a dummy slot info - the actual data is irrelevant. we just need to build the graph + sinfos.resize(1); + sinfos[0].idxs.resize(1); + sinfos[0].idxs[0] = 0; } llama_kv_cache_unified_context::llama_kv_cache_unified_context( @@ -1759,8 +1893,8 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv, - llama_kv_cache_unified::ubatch_heads heads, - std::vector ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), heads(std::move(heads)), ubatches(std::move(ubatches)) { + llama_kv_cache_unified::slot_info_vec_t sinfos, + std::vector ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) { } llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default; @@ -1768,7 +1902,7 @@ llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default; bool llama_kv_cache_unified_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - if (++i_next >= ubatches.size()) { + if (++i_cur >= ubatches.size()) { return false; } @@ -1785,10 +1919,9 @@ bool llama_kv_cache_unified_context::apply() { return true; } - kv->apply_ubatch(heads[i_next], ubatches[i_next]); + kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]); n_kv = kv->get_n_kv(); - head = heads[i_next]; return true; } @@ -1800,7 +1933,7 @@ llama_memory_status llama_kv_cache_unified_context::get_status() const { const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - return ubatches[i_next]; + return ubatches[i_cur]; } uint32_t llama_kv_cache_unified_context::get_n_kv() const { @@ -1815,18 +1948,34 @@ ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t return kv->get_v(ctx, il, n_kv); } -ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const { - return kv->cpy_k(ctx, k_cur, il, head); +ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { + return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]); +} + +ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const { + return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]); +} + +ggml_tensor * llama_kv_cache_unified_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { + return kv->build_input_k_idxs(ctx, ubatch); } -ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const { - return kv->cpy_v(ctx, v_cur, il, head); +ggml_tensor * llama_kv_cache_unified_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { + return kv->build_input_v_idxs(ctx, ubatch); } void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const { kv->set_input_k_shift(dst); } +void llama_kv_cache_unified_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const { + kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]); +} + +void llama_kv_cache_unified_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const { + kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]); +} + void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { kv->set_input_kq_mask(dst, ubatch, causal_attn); } diff --git a/examples/talk-llama/llama-kv-cache-unified.h b/examples/talk-llama/llama-kv-cache-unified.h index 4c53f1273ab..b8b0356e830 100644 --- a/examples/talk-llama/llama-kv-cache-unified.h +++ b/examples/talk-llama/llama-kv-cache-unified.h @@ -24,8 +24,6 @@ class llama_kv_cache_unified : public llama_memory_i { // this callback is used to filter out layers that should not be included in the cache using layer_filter_cb = std::function; - using ubatch_heads = std::vector; - struct defrag_info { bool empty() const { return ids.empty(); @@ -37,6 +35,32 @@ class llama_kv_cache_unified : public llama_memory_i { std::vector ids; }; + // for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the + // KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]] + struct slot_info { + // data for ggml_set_rows + using idx_vec_t = std::vector; + + idx_vec_t idxs; + + uint32_t head() const { + return idxs.at(0); + } + + bool empty() const { + return idxs.empty(); + } + + void clear() { + idxs.clear(); + } + + // TODO: implement + //std::vector seq_idxs; + }; + + using slot_info_vec_t = std::vector; + llama_kv_cache_unified( const llama_model & model, layer_filter_cb && filter, @@ -102,30 +126,37 @@ class llama_kv_cache_unified : public llama_memory_i { ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const; // store k_cur and v_cur in the cache based on the provided head location - ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const; - ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const; + ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const; + ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const; // // preparation API // - // find places for the provided ubatches in the cache, returns the head locations + // find places for the provided ubatches in the cache, returns the slot infos // return empty vector on failure - ubatch_heads prepare(const std::vector & ubatches); + slot_info_vec_t prepare(const std::vector & ubatches); bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo); - // return the cell position where we can insert the ubatch - // return -1 on failure to find a contiguous slot of kv cells - int32_t find_slot(const llama_ubatch & ubatch) const; + // find a slot of kv cells that can hold the ubatch + // if cont == true, then the slot must be continuous + // return empty slot_info on failure + slot_info find_slot(const llama_ubatch & ubatch, bool cont) const; - // emplace the ubatch context into slot: [head_cur, head_cur + ubatch.n_tokens) - void apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch); + // emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]] + void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch); // - // set_input API + // input API // + ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; + ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; + + void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const; + void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const; + void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; void set_input_k_shift (ggml_tensor * dst) const; void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; @@ -157,8 +188,13 @@ class llama_kv_cache_unified : public llama_memory_i { // SWA const uint32_t n_swa = 0; + // env: LLAMA_KV_CACHE_DEBUG int debug = 0; + // env: LLAMA_SET_ROWS (temporary) + // ref: https://github.com/ggml-org/llama.cpp/pull/14285 + int supports_set_rows = false; + const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; @@ -211,8 +247,8 @@ class llama_kv_cache_unified : public llama_memory_i { class llama_kv_cache_unified_context : public llama_memory_context_i { public: // some shorthands - using ubatch_heads = llama_kv_cache_unified::ubatch_heads; - using defrag_info = llama_kv_cache_unified::defrag_info; + using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; + using defrag_info = llama_kv_cache_unified::defrag_info; // used for errors llama_kv_cache_unified_context(llama_memory_status status); @@ -231,7 +267,7 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { // used to create a batch procesing context from a batch llama_kv_cache_unified_context( llama_kv_cache_unified * kv, - ubatch_heads heads, + slot_info_vec_t sinfos, std::vector ubatches); virtual ~llama_kv_cache_unified_context(); @@ -257,11 +293,16 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; // store k_cur and v_cur in the cache based on the provided head location - ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const; - ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const; + ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const; + ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const; + + ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; + ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; - void set_input_k_shift(ggml_tensor * dst) const; + void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const; + void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const; + void set_input_k_shift (ggml_tensor * dst) const; void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; @@ -283,10 +324,10 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { // batch processing context // - // the index of the next ubatch to process - size_t i_next = 0; + // the index of the cur ubatch to process + size_t i_cur = 0; - ubatch_heads heads; + slot_info_vec_t sinfos; std::vector ubatches; @@ -297,7 +338,4 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { // a heuristic, to avoid attending the full cache if it is not yet utilized // as the cache gets filled, the benefit from this heuristic disappears int32_t n_kv; - - // the beginning of the current slot in which the ubatch will be inserted - int32_t head; }; diff --git a/examples/talk-llama/llama-kv-cells.h b/examples/talk-llama/llama-kv-cells.h index c95d635948b..0d0dd316fd0 100644 --- a/examples/talk-llama/llama-kv-cells.h +++ b/examples/talk-llama/llama-kv-cells.h @@ -105,10 +105,30 @@ class llama_kv_cells_unified { res.resize(n); for (uint32_t j = 0; j < n; ++j) { - res.pos[j] = pos[i + j]; - res.seq[j] = seq[i + j]; + const auto idx = i + j; - assert(shift[i + j] == 0); + res.pos[j] = pos[idx]; + res.seq[j] = seq[idx]; + + assert(shift[idx] == 0); + } + + return res; + } + + // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1]) + llama_kv_cells_unified cp(const std::vector & idxs) const { + llama_kv_cells_unified res; + + res.resize(idxs.size()); + + for (uint32_t j = 0; j < idxs.size(); ++j) { + const auto idx = idxs[j]; + + res.pos[j] = pos[idx]; + res.seq[j] = seq[idx]; + + assert(shift[idx] == 0); } return res; @@ -119,26 +139,58 @@ class llama_kv_cells_unified { assert(i + other.pos.size() <= pos.size()); for (uint32_t j = 0; j < other.pos.size(); ++j) { - if (pos[i + j] == -1 && other.pos[j] != -1) { + const auto idx = i + j; + + if (pos[idx] == -1 && other.pos[j] != -1) { used.insert(i + j); } - if (pos[i + j] != -1 && other.pos[j] == -1) { + if (pos[idx] != -1 && other.pos[j] == -1) { used.erase(i + j); } - if (pos[i + j] != -1) { + if (pos[idx] != -1) { seq_pos_rm(i + j); } - pos[i + j] = other.pos[j]; - seq[i + j] = other.seq[j]; + pos[idx] = other.pos[j]; + seq[idx] = other.seq[j]; - if (pos[i + j] != -1) { + if (pos[idx] != -1) { seq_pos_add(i + j); } - assert(shift[i + j] == 0); + assert(shift[idx] == 0); + } + } + + // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1]) + void set(const std::vector & idxs, const llama_kv_cells_unified & other) { + assert(idxs.size() == other.pos.size()); + + for (uint32_t j = 0; j < other.pos.size(); ++j) { + const auto idx = idxs[j]; + + if (pos[idx] == -1 && other.pos[j] != -1) { + used.insert(idx); + } + + if (pos[idx] != -1 && other.pos[j] == -1) { + used.erase(idx); + } + + if (pos[idx] != -1) { + seq_pos_rm(idx); + } + + pos[idx] = other.pos[j]; + seq[idx] = other.seq[j]; + + if (pos[idx] != -1) { + seq_pos_add(idx); + } + + assert(shift[idx] == 0); } } diff --git a/examples/talk-llama/llama-memory-hybrid.cpp b/examples/talk-llama/llama-memory-hybrid.cpp index 67cbf955482..6cd10db06b7 100644 --- a/examples/talk-llama/llama-memory-hybrid.cpp +++ b/examples/talk-llama/llama-memory-hybrid.cpp @@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - ubatch = balloc.split_equal(n_ubatch); + ubatch = balloc.split_equal(n_ubatch, false); } if (ubatch.n_tokens == 0) { @@ -80,6 +80,11 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba ubatches.push_back(std::move(ubatch)); // NOLINT } + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + // prepare the recurrent batches first if (!mem_recr->prepare(ubatches)) { // TODO: will the recurrent cache be in an undefined context at this point? @@ -195,11 +200,11 @@ llama_memory_hybrid_context::llama_memory_hybrid_context( llama_memory_hybrid_context::llama_memory_hybrid_context( llama_memory_hybrid * mem, - std::vector heads_attn, + slot_info_vec_t sinfos_attn, std::vector ubatches) : ubatches(std::move(ubatches)), // note: here we copy the ubatches. not sure if this is ideal - ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)), + ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)), ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)), status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { } diff --git a/examples/talk-llama/llama-memory-hybrid.h b/examples/talk-llama/llama-memory-hybrid.h index f0c2420e9a2..4ac31817578 100644 --- a/examples/talk-llama/llama-memory-hybrid.h +++ b/examples/talk-llama/llama-memory-hybrid.h @@ -92,6 +92,8 @@ class llama_memory_hybrid : public llama_memory_i { class llama_memory_hybrid_context : public llama_memory_context_i { public: + using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; + // init failure explicit llama_memory_hybrid_context(llama_memory_status status); @@ -107,7 +109,7 @@ class llama_memory_hybrid_context : public llama_memory_context_i { // init success llama_memory_hybrid_context( llama_memory_hybrid * mem, - std::vector heads_attn, + slot_info_vec_t sinfos_attn, std::vector ubatches); ~llama_memory_hybrid_context() = default; diff --git a/examples/talk-llama/llama-memory-recurrent.cpp b/examples/talk-llama/llama-memory-recurrent.cpp index 6ed84057ccf..2c1ae67098c 100644 --- a/examples/talk-llama/llama-memory-recurrent.cpp +++ b/examples/talk-llama/llama-memory-recurrent.cpp @@ -25,9 +25,6 @@ llama_memory_recurrent::llama_memory_recurrent( uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { const int32_t n_layer = hparams.n_layer; - LLAMA_LOG_INFO("%s: mem_size = %u, n_seq_max = %u, type_r = '%s', type_s = '%s', n_layer = %d\n", - __func__, mem_size, n_seq_max, ggml_type_name(type_r), ggml_type_name(type_s), n_layer); - head = 0; size = mem_size; used = 0; @@ -84,7 +81,7 @@ llama_memory_recurrent::llama_memory_recurrent( ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { - throw std::runtime_error("failed to create ggml context for kv cache"); + throw std::runtime_error("failed to create ggml context for rs cache"); } ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size); @@ -102,10 +99,10 @@ llama_memory_recurrent::llama_memory_recurrent( ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); + throw std::runtime_error("failed to allocate buffer for rs cache"); } ggml_backend_buffer_clear(buf, 0); - LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); bufs.emplace_back(buf); } @@ -113,8 +110,8 @@ llama_memory_recurrent::llama_memory_recurrent( const size_t memory_size_r = size_r_bytes(); const size_t memory_size_s = size_s_bytes(); - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__, - (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), + LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__, + (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max, ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f), ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f)); } @@ -374,7 +371,7 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - ubatch = balloc.split_equal(n_ubatch); + ubatch = balloc.split_equal(n_ubatch, false); } if (ubatch.n_tokens == 0) { @@ -384,6 +381,11 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & ubatches.push_back(std::move(ubatch)); // NOLINT } + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + if (!prepare(ubatches)) { break; } diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index b15bf73c2a2..a322fc39352 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -40,17 +40,21 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_190M: return "190M"; case LLM_TYPE_220M: return "220M"; case LLM_TYPE_250M: return "250M"; + case LLM_TYPE_256M: return "256M"; case LLM_TYPE_270M: return "270M"; case LLM_TYPE_335M: return "335M"; + case LLM_TYPE_350M: return "350M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; case LLM_TYPE_475M: return "475M"; + case LLM_TYPE_700M: return "700M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; + case LLM_TYPE_1_2B: return "1.2B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; @@ -102,6 +106,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_57B_A14B: return "57B.A14B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case LLM_TYPE_A13B: return "A13B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_E2B: return "E2B"; @@ -208,23 +213,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_SSM_CONV: { - // FIXME - ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789); + const int64_t n_seq_tokens = 512; + const int64_t n_seqs = 3; + ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs); op_tensor = ggml_ssm_conv(ctx, conv_x, w); } break; case GGML_OP_SSM_SCAN: { - // FIXME - const int64_t d_state = w->ne[0]; - const int64_t d_inner = w->ne[1]; + // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2 + const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0]; + const int64_t n_head = w->ne[1]; + const int64_t head_dim = hparams.ssm_d_inner / n_head; + const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1; const int64_t n_seq_tokens = 512; - const int64_t n_seqs = 1; - ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs); - ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs); - ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); - ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); - op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C); + const int64_t n_seqs = 3; + ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); + ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); + ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); + ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); + ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); + ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs); + op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids); } break; case GGML_OP_RWKV_WKV6: { @@ -576,6 +585,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 22: type = LLM_TYPE_1B; break; case 26: type = LLM_TYPE_3B; break; case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B + case 30: type = LLM_TYPE_256M; break; // smoldocling 256M // granite uses a vocab with len 49152 case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break; case 36: type = LLM_TYPE_8B; break; // granite @@ -1081,6 +1091,58 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_MAMBA2: + { + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 24: + switch (hparams.n_embd) { + case 768: type = LLM_TYPE_SMALL; break; + default: type = LLM_TYPE_UNKNOWN; + } break; + case 48: + switch (hparams.n_embd) { + case 1024: type = LLM_TYPE_MEDIUM; break; + case 1536: type = LLM_TYPE_LARGE; break; + case 2048: type = LLM_TYPE_XL; break; + default: type = LLM_TYPE_UNKNOWN; + } break; + case 64: + switch (hparams.n_embd) { + case 2560: type = LLM_TYPE_3B; break; + case 4096: type = LLM_TYPE_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; + case LLM_ARCH_JAMBA: + { + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + } + + switch (hparams.n_layer) { + // TODO: Jamba layers are a bit heterogenous, so naming this is hard. + case 12: // 900M 8x???M + case 32: // 51B 16x?B + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_XVERSE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1447,6 +1509,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + // Granite uses rope_finetuned as a switch for rope, so default to true + bool rope_finetuned = true; + ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); + hparams.rope_finetuned = rope_finetuned; + switch (hparams.n_layer) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; @@ -1454,6 +1521,40 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } + // For Granite MoE Shared + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false); + } break; + case LLM_ARCH_GRANITE_HYBRID: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false); + ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false); + + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + // Granite uses rope_finetuned as a switch for rope, so default to true + bool rope_finetuned = true; + ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); + hparams.rope_finetuned = rope_finetuned; + + // A layer is recurrent IFF the n_head_kv value is set to 0 + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + } + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + // TODO: Add llm type label (not sure this is useful) + default: type = LLM_TYPE_UNKNOWN; + } + // For Granite MoE Shared ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false); } break; @@ -1513,6 +1614,72 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_FALCON_H1: + { + // Common parameters + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // SSM parameters + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); + + switch (hparams.n_layer) { + case 36: + type = LLM_TYPE_0_5B; break; + case 24: + type = LLM_TYPE_1_5B; break; + case 66: + type = LLM_TYPE_1B; break; + case 32: + type = LLM_TYPE_3B; break; + case 44: + type = LLM_TYPE_7B; break; + case 72: + type = LLM_TYPE_34B; break; + default: + type = LLM_TYPE_UNKNOWN; + } + } break; + case LLM_ARCH_HUNYUAN_MOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); + + switch (hparams.n_layer) { + case 32: type = LLM_TYPE_A13B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; + case LLM_ARCH_SMOLLM3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + hparams.n_no_rope_layer_step = 4; + + switch (hparams.n_layer) { + case 36: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; + case LLM_ARCH_LFM2: + { + ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + for (uint32_t il = 0; il < hparams.n_layer; ++il) { + hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; + } + switch (hparams.n_embd) { + case 1024: type = LLM_TYPE_350M; break; + case 1536: type = LLM_TYPE_700M; break; + case 2048: type = LLM_TYPE_1_2B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -3124,6 +3291,228 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0); } } break; + case LLM_ARCH_MAMBA2: + { + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_head = hparams.ssm_dt_rank; + const int64_t n_group = hparams.ssm_n_group; + const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head; + + // only an expansion factor of 2 is supported for now + GGML_ASSERT(2 * n_embd == d_inner); + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + { + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed, duplicated to allow offloading + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // norm + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); + + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0); + + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0); + + // no "weight" suffix for these + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0); + + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0); + + // out_proj + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0); + } + } break; + case LLM_ARCH_JAMBA: + { + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + + // only an expansion factor of 2 is supported for now + GGML_ASSERT(2 * n_embd == d_inner); + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + { + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed, duplicated to allow offloading + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i); + + auto & layer = layers[i]; + + // norm + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (n_head_kv == 0) { + // Mamba layer + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0); + + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0); + + layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0); + + layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0); + + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0); + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0); + + layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0); + layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0); + + // no "weight" suffix for these + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0); + + // out_proj + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0); + } else { + // Attention layers + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); + + if (layer.ffn_gate_inp) { + // MoE + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + } else { + // FFN (no MoE) + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } + } break; + case LLM_ARCH_GRANITE_HYBRID: + { + // mamba2 Mixer SSM params + // NOTE: int64_t for tensor dimensions + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_ssm_head = hparams.ssm_dt_rank; + const int64_t n_group = hparams.ssm_n_group; + const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head; + + // only an expansion factor of 2 is supported for now + GGML_ASSERT(2 * n_embd == d_inner); + + // embeddings + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + { + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed, duplicated to allow offloading + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // norm + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (hparams.is_recurrent(i)) { + // ssm layers + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); + + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED); + + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0); + + // no "weight" suffix for these + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0); + + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0); + + // out_proj + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0); + } else { + // attention layers (with optional bias) + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i); + const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + } + + // feed forward (w/ optional biases) + if (n_expert > 0) { + // MoE FFN + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + + // For Granite MoE Shared + if (hparams.n_ff_shexp > 0) { + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0); + } + } else { + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + } + } + } break; case LLM_ARCH_XVERSE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4391,27 +4780,203 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; - default: - throw std::runtime_error("unknown architecture"); - } + case LLM_ARCH_FALCON_H1: + { + // Common + const int64_t hidden_size = hparams.n_embd; // hidden_size - if (n_moved_tensors > 0) { - LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n", - __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1, - ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft)); - } - } + // mamba2 Mixer SSM params + const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size + const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups + const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size + const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand + const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads + const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size; + const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads; - ml.done_getting_tensors(); + // attn params + const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head + const int64_t attn_num_key_value_head = hparams.n_head_kv(0); - ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr); - pimpl->mappings.reserve(ml.mappings.size()); + // ffn params + const int64_t ffn_intermediate_size = hparams.n_ff(0); - // create the backend buffers - std::vector> ctx_bufs; - ctx_bufs.reserve(ctx_map.size()); + // embeddings + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0); - // Ensure we have enough capacity for the maximum backend buffer we will potentially create + // output + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + /*SSM LAYERS*/ + // ssm in + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0); + // ssm 1d conv + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED); + // ssm_dt + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0); + // no "weight" suffix for these + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); + // ssm_norm + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED); + // out_proj + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); + + /*ATTENTION LAYERS*/ + // attention layers (with optional bias) + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0); + + + // feed forward (w/ optional biases) + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0); + + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED); + } + } break; + case LLM_ARCH_HUNYUAN_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0); + } + } break; + case LLM_ARCH_SMOLLM3: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; + case LLM_ARCH_LFM2: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + // ffn is same for transformer and conv layers + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // for operator_norm + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (!hparams.is_recurrent(i)) { + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + } else { + layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0); + layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0); + layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0); + } + } + } break; + default: + throw std::runtime_error("unknown architecture"); + } + + if (n_moved_tensors > 0) { + LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n", + __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1, + ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft)); + } + } + + ml.done_getting_tensors(); + + ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr); + pimpl->mappings.reserve(ml.mappings.size()); + + // create the backend buffers + std::vector> ctx_bufs; + ctx_bufs.reserve(ctx_map.size()); + + // Ensure we have enough capacity for the maximum backend buffer we will potentially create const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); pimpl->bufs.reserve(n_max_backend_buffer); @@ -4630,12 +5195,6 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); - LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); - LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); - LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); - LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); - LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); - if (!classifier_labels.empty()) { LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out); @@ -4646,6 +5205,19 @@ void llama_model::print_info() const { } } + if (arch == LLM_ARCH_MAMBA || + arch == LLM_ARCH_MAMBA2 || + arch == LLM_ARCH_JAMBA || + arch == LLM_ARCH_FALCON_H1 || + arch == LLM_ARCH_GRANITE_HYBRID) { + LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); + LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); + LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); + LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); + LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); + } + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str()); if (pimpl->n_elements >= 1e12) { LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12); @@ -4692,7 +5264,8 @@ void llama_model::print_info() const { if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || - arch == LLM_ARCH_GRANITE_MOE) { + arch == LLM_ARCH_GRANITE_MOE || + arch == LLM_ARCH_GRANITE_HYBRID) { LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); @@ -5582,12 +6155,10 @@ struct llm_build_falcon : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode @@ -5864,12 +6435,10 @@ struct llm_build_dbrx : public llm_graph_context { cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(cur, "wqkv_clamped", il); - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -6380,12 +6949,10 @@ struct llm_build_neo_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // RoPE @@ -6615,8 +7182,8 @@ struct llm_build_mpt : public llm_graph_context { cb(cur, "wqkv_clamped", il); } - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); @@ -6636,6 +7203,12 @@ struct llm_build_mpt : public llm_graph_context { model.layers[il].attn_k_norm_b, LLM_NORM, il); cb(Kcur, "Kcur", il); + } else { + Qcur = ggml_cont(ctx0, Qcur); + cb(Qcur, "Qcur", il); + + Kcur = ggml_cont(ctx0, Kcur); + cb(Kcur, "Kcur", il); } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -6890,12 +7463,10 @@ struct llm_build_qwen : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode @@ -7660,21 +8231,21 @@ struct llm_build_phi2 : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -7798,21 +8369,21 @@ struct llm_build_phi3 : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd)); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -8168,12 +8739,10 @@ struct llm_build_codeshell : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -8589,8 +9158,6 @@ struct llm_build_minicpm3 : public llm_graph_context { ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); cb(k_pe, "k_pe", il); - // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, il); @@ -8617,12 +9184,6 @@ struct llm_build_minicpm3 : public llm_graph_context { v_states = ggml_cont(ctx0, v_states); cb(v_states, "v_states", il); - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -8631,7 +9192,6 @@ struct llm_build_minicpm3 : public llm_graph_context { cb(q_pe, "q_pe", il); // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this k_pe = ggml_rope_ext( ctx0, k_pe, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -9111,8 +9671,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { const int n_layer_sparsity = 10; // number of layers using activation sparsity const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) - ggml_tensor * one; // containing single element 1.0f - llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model), @@ -9124,14 +9682,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ggml_tensor * cur; ggml_tensor * inpL; - // TODO: remove this when ggml_scale_add is implemented - one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - { - auto inp = std::make_unique(); - inp->one = one; - res->add_input(std::move(inp)); - } - inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) @@ -9521,7 +10071,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(innovation, "innovation", il); ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens] - all_coefs = ggml_add(ctx0, all_coefs, one); + all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0 cb(all_coefs, "all_coefs", il); all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup] all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] @@ -9664,81 +10214,32 @@ struct llm_build_starcoder2 : public llm_graph_context { } }; -struct llm_build_mamba : public llm_graph_context { - const llama_model & model; - - llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { - ggml_tensor * cur; - ggml_tensor * inpL; - - // {n_embd, n_tokens} - inpL = build_inp_embd(model.tok_embd); - - auto * rs_inp = build_rs_inp(); - - ggml_tensor * inp_out_ids = build_inp_out_ids(); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il); - - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // residual - cur = ggml_add(ctx0, cur, inpL); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - // final rmsnorm - cur = build_norm(inpL, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - - cb(cur, "result_norm", -1); - res->t_embd = cur; - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); - } +struct llm_graph_context_mamba : public llm_graph_context { + llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {} - // TODO: split ggml_tensor * build_mamba_layer( llm_graph_input_rs * inp, ggml_cgraph * gf, ggml_tensor * cur, + const llama_model & model, const llama_ubatch & ubatch, - int il) const { - const auto * mctx_cur = static_cast(mctx); + int il) { + + const auto * mctx_cur = inp->mctx; const auto kv_head = mctx_cur->get_head(); + const auto & layer = model.layers[il]; + const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; const int64_t d_state = hparams.ssm_d_state; const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_head = d_inner; + const int64_t head_dim = 1; const int64_t n_seqs = ubatch.n_seqs; // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; - // Use the same RMS norm as the final layer norm - const float norm_rms_eps = hparams.f_norm_rms_eps; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -9749,21 +10250,14 @@ struct llm_build_mamba : public llm_graph_context { ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - // (ab)using the KV cache to store the states - ggml_tensor * conv = build_rs( - inp, gf, conv_states_all, - hparams.n_embd_r(), n_seqs); + ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); - ggml_tensor * ssm = build_rs( - inp, gf, ssm_states_all, - hparams.n_embd_s(), n_seqs); - ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, cur); + ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur); // split the above in two // => {d_inner, n_seq_tokens, n_seqs} ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); @@ -9792,10 +10286,10 @@ struct llm_build_mamba : public llm_graph_context { // then permute away the ne[0] dimension, // and then you're left with the resulting x tensor. // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d); // bias - x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + x = ggml_add(ctx0, x, layer.ssm_conv1d_b); x = ggml_silu(ctx0, x); } @@ -9803,76 +10297,387 @@ struct llm_build_mamba : public llm_graph_context { // ssm { // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x); + ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x); // split ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); - B = ggml_rms_norm(ctx0, B, norm_rms_eps); - C = ggml_rms_norm(ctx0, C, norm_rms_eps); + ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) { + dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il); + B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il); + C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il); } // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = build_lora_mm(model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + dt = build_lora_mm(layer.ssm_dt, dt); + dt = ggml_add(ctx0, dt, layer.ssm_dt_b); + + cur = x; + x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs); + + ggml_tensor * A = layer.ssm_a; - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. - // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); + // use the states and the indices provided by build_recurrent_state + // (this is necessary in order to properly use the states before they are overwritten, + // while avoiding to make unnecessary copies of the states) + auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size()); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. + // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); + }; + + ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); // store last states ggml_build_forward_expand(gf, ggml_cpy(ctx0, - ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]), ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0); // TODO: skip computing output earlier for unused tokens - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d)); + y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = build_lora_mm(model.layers[il].ssm_out, y); + cur = build_lora_mm(layer.ssm_out, y); } // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); - //cb(cur, "mamba_out", il); return cur; } -}; -struct llm_build_command_r : public llm_graph_context { - llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + ggml_tensor * build_mamba2_layer( + llm_graph_input_rs * inp, + ggml_cgraph * gf, + ggml_tensor * cur, + const llama_model & model, + const llama_ubatch & ubatch, + int il) const { - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const auto * mctx_cur = inp->mctx; - const float f_logit_scale = hparams.f_logit_scale; + const auto kv_head = mctx_cur->get_head(); - ggml_tensor * cur; - ggml_tensor * inpL; + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_head = hparams.ssm_dt_rank; + const int64_t head_dim = d_inner / n_head; + const int64_t n_group = hparams.ssm_n_group; + const int64_t n_seqs = ubatch.n_seqs; - inpL = build_inp_embd(model.tok_embd); + const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // inp_pos - contains the positions - ggml_tensor * inp_pos = build_inp_pos(); + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - auto * inp_attn = build_attn_inp_kv_unified(); + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs); - for (int il = 0; il < n_layer; ++il) { - // norm + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads + + // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs} + ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); + + // split the above in three + ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0); + ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt)); + ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs} + ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0); + + // copy last (d_conv - 1) columns back into the state cache + ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b); + + xBC = ggml_silu(ctx0, xBC); + } + + // ssm + { + // These correspond to V K Q in SSM/attention duality + ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0); + ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC)); + ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC)); + + // {n_head, n_seq_tokens, n_seqs} + dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b); + + ggml_tensor * A = model.layers[il].ssm_a; + + // use the states and the indices provided by build_recurrent_state + // (this is necessary in order to properly use the states before they are overwritten, + // while avoiding to make unnecessary copies of the states) + auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size()); + + // TODO: use semistructured matrices to implement state-space duality + // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); + }; + + ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + + // store last states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0); + + // TODO: skip computing output earlier for unused tokens + + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); + + // grouped RMS norm + if (model.layers[il].ssm_norm) { + y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs); + y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); + } + + y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + cb(cur, "mamba_out", il); + + return cur; + } +}; + +struct llm_build_mamba : public llm_graph_context_mamba { + llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + ggml_tensor * cur; + ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + auto * rs_inp = build_rs_inp(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + if (model.arch == LLM_ARCH_MAMBA2) { + cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il); + } else { + cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // residual + cur = ggml_add(ctx0, cur, inpL); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + // final rmsnorm + cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + +}; + +struct llm_build_jamba : public llm_graph_context_mamba { + llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + ggml_tensor * cur; + ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + auto * inp_hybrid = build_inp_mem_hybrid(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + if (n_head_kv == 0) { + cur = build_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il); + } else { + // Attention + + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // No RoPE :) + cur = build_attn(inp_hybrid->get_attn(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // residual + struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur); + cb(cur, "ffn_inp", il); + + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + // FFN + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + } + + // residual + cur = ggml_add(ctx0, ffn_inp, cur); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + // final rmsnorm + cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +struct llm_build_command_r : public llm_graph_context { + llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + const float f_logit_scale = hparams.f_logit_scale; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); @@ -10557,10 +11362,10 @@ struct llm_build_openelm : public llm_graph_context { cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0); cb(Qcur, "Qcur", il); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head); cb(Kcur, "Kcur", il); ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); @@ -10682,12 +11487,10 @@ struct llm_build_gptneox : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -11932,6 +12735,8 @@ struct llm_build_chatglm : public llm_graph_context { if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -11939,13 +12744,11 @@ struct llm_build_chatglm : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); } - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); @@ -12066,6 +12869,8 @@ struct llm_build_glm4 : public llm_graph_context { if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -12073,13 +12878,11 @@ struct llm_build_glm4 : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); } - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -13178,13 +13981,11 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { } }; - struct llm_build_granite : public llm_graph_context { llm_build_granite( const llama_model & model, const llm_graph_params & params, - ggml_cgraph * gf, - const bool use_rope = true) + ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -13199,14 +14000,12 @@ struct llm_build_granite : public llm_graph_context { // inp_pos - built only if rope enabled ggml_tensor * inp_pos = nullptr; - if (use_rope) { + if (hparams.rope_finetuned) { inp_pos = build_inp_pos(); } auto * inp_attn = build_attn_inp_kv_unified(); - const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { @@ -13219,56 +14018,508 @@ struct llm_build_granite : public llm_graph_context { cb(cur, "attn_norm", il); // self-attention - { - // compute Q and K and (optionally) RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } + cur = build_attention_layer( + gf, cur, inp_pos, inp_attn, + model, n_embd_head, il); - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // ffn + cur = build_layer_ffn(cur, inpSA, model, il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + // For Granite architectures - scale logits + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * build_attention_layer( + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv_unified * inp_attn, + const llama_model & model, + const int64_t n_embd_head, + const int il) { + + // compute Q and K and (optionally) RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens); + + const bool use_rope = hparams.rope_finetuned; + if (use_rope) { + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + cur = build_attn(inp_attn, gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + return cur; + } + + ggml_tensor * build_layer_ffn( + ggml_tensor * cur, + ggml_tensor * inpSA, + const llama_model & model, + const int il) { + + // For Granite architectures - scale residual + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network (non-MoE) + if (model.layers[il].ffn_gate_inp == nullptr) { + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + } else { + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); + + // For Granite MoE Shared + if (hparams.n_ff_shexp > 0) { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } + + // For Granite architectures - scale residual + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + return cur; + } +}; + +struct llm_build_granite_hybrid : public llm_graph_context_mamba { + + llm_build_granite_hybrid( + const llama_model & model, + const llm_graph_params & params, + ggml_cgraph * gf) : + llm_graph_context_mamba(params) { + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + auto * inp = build_inp_mem_hybrid(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Positional embeddings populated if rope enabled + ggml_tensor * inp_pos = nullptr; + if (hparams.rope_finetuned) { + inp_pos = build_inp_pos(); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + if (hparams.is_recurrent(il)) { + // ssm layer // + cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il); + } else { + // attention layer // + cur = build_attention_layer( + gf, cur, inp_pos, inp->get_attn(), model, + n_embd_head, il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // ffn + cur = build_layer_ffn(cur, inpSA, model, il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + // For Granite architectures - scale logits + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * build_attention_layer( + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv_unified * inp_attn, + const llama_model & model, + const int64_t n_embd_head, + const int il) { + + // compute Q and K and (optionally) RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens); + + const bool use_rope = hparams.rope_finetuned; + if (use_rope) { + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + cur = build_attn(inp_attn, gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + return cur; + } + + ggml_tensor * build_layer_ffn( + ggml_tensor * cur, + ggml_tensor * inpSA, + const llama_model & model, + const int il) { + + // For Granite architectures - scale residual + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network (non-MoE) + if (model.layers[il].ffn_gate_inp == nullptr) { + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + } else { + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); + + // For Granite MoE Shared + if (hparams.n_ff_shexp > 0) { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } + + // For Granite architectures - scale residual + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + return cur; + } +}; + +// ref: https://github.com/facebookresearch/chameleon +// based on the original build_llama() function, changes: +// * qk-norm +// * swin-norm +// * removed bias +// * removed MoE +struct llm_build_chameleon : public llm_graph_context { + llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + if (hparams.swin_norm) { + cur = inpL; + } else { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + // self-attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + } + + if (model.layers[il].attn_k_norm) { + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + cb(Kcur, "Kcur", il); } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - if (use_rope) { - ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - } + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, gf, - model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); - cb(cur, "attn_out", il); + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -13276,66 +14527,38 @@ struct llm_build_granite : public llm_graph_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - // For Granite architectures - scale residual - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + if (hparams.swin_norm) { + cur = build_norm(cur, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + } + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // feed-forward network (non-MoE) - if (model.layers[il].ffn_gate_inp == nullptr) { - + // feed-forward network + if (!hparams.swin_norm) { cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); + } - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); - } else { - // MoE branch - cur = build_norm(ffn_inp, + if (hparams.swin_norm) { + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - - ggml_tensor * moe_out = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - il); - cb(moe_out, "ffn_moe_out", il); - - // For Granite MoE Shared - if (hparams.n_ff_shexp > 0) { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } else { - cur = moe_out; - } } - // For Granite architectures - scale residual - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); @@ -13352,37 +14575,197 @@ struct llm_build_granite : public llm_graph_context { model.output_norm, NULL, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - res->t_embd = cur; + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output_with_img_logits", -1); + + // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. + // Needs to be removed once image outputs are supported. + int img_token_end_idx = 8196; + int img_token_start_idx = 4; + int num_img_tokens = img_token_end_idx - img_token_start_idx; + // creates 1d tensor of size num_img_tokens and values -FLT_MAX, + // which ensures that text token values are always at least larger than image token values + ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); + img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); + cb(img_logits, "img_logits", -1); + + cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +struct llm_build_wavtokenizer_dec : public llm_graph_context { + llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_b); + + // posnet + for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { + const auto & layer = model.layers[il].posnet; + + inpL = cur; + + switch (il) { + case 0: + case 1: + case 3: + case 4: + { + cur = build_norm(cur, + layer.norm1, + layer.norm1_b, + LLM_NORM_GROUP, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv1_b); + + cur = build_norm(cur, + layer.norm2, + layer.norm2_b, + LLM_NORM_GROUP, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv2_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 2: + { + cur = build_norm(cur, + layer.attn_norm, + layer.attn_norm_b, + LLM_NORM_GROUP, 0); + + ggml_tensor * q; + ggml_tensor * k; + ggml_tensor * v; + + q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); + + q = ggml_add(ctx0, q, layer.attn_q_b); + k = ggml_add(ctx0, k, layer.attn_k_b); + v = ggml_add(ctx0, v, layer.attn_v_b); + + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); + + cur = ggml_mul_mat(ctx0, kq, v); + + cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.attn_o_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 5: + { + cur = build_norm(cur, + layer.norm, + layer.norm_b, + LLM_NORM_GROUP, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, -1); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = cur; + + // convnext + for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { + const auto & layer = model.layers[il].convnext; + + cur = inpL; + + cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.dw_b); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + layer.norm, + layer.norm_b, + LLM_NORM, -1); + + cur = build_ffn(cur, + layer.pw1, layer.pw1_b, NULL, + NULL, NULL, NULL, + layer.pw2, layer.pw2_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + + cur = ggml_mul(ctx0, cur, layer.gamma); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = ggml_add(ctx0, cur, inpL); + } + + cur = inpL; + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); // lm_head cur = build_lora_mm(model.output, cur); - // For Granite architectures - scale logits - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); - cb(cur, "result_output", -1); - res->t_logits = cur; + cur = ggml_add(ctx0, cur, model.output_b); + + cb(cur, "result_embd", -1); + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } }; -// ref: https://github.com/facebookresearch/chameleon -// based on the original build_llama() function, changes: -// * qk-norm -// * swin-norm -// * removed bias -// * removed MoE -struct llm_build_chameleon : public llm_graph_context { - llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; +struct llm_build_plm : public llm_graph_context { + llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k)); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; ggml_tensor * cur; ggml_tensor * inpL; + // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions @@ -13396,78 +14779,103 @@ struct llm_build_chameleon : public llm_graph_context { ggml_tensor * inpSA = inpL; // norm - if (hparams.swin_norm) { - cur = inpL; - } else { - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - } + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); - // self-attention + // self_attention { - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + ggml_tensor * q = NULL; + q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q, "q", il); - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + // split into {n_head * n_embd_head_qk_nope, n_tokens} + ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + // and {n_head * n_embd_head_qk_rope, n_tokens} + ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); - if (model.layers[il].attn_q_norm) { - Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur) * n_embd_head, - ggml_element_size(Qcur) * n_embd_head * n_head, - 0); - cb(Qcur, "Qcur", il); + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - } + // split into {kv_lora_rank, n_tokens} + ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); - if (model.layers[il].attn_k_norm) { - Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, - ggml_element_size(Kcur) * n_embd_head, - ggml_element_size(Kcur) * n_embd_head * n_head_kv, - 0); - cb(Kcur, "Kcur", il); + // and {n_embd_head_qk_rope, n_tokens} + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } + kv_compressed = build_norm(kv_compressed, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, il); + cb(kv_compressed, "kv_compressed", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, + // split into {n_head * n_embd_head_qk_nope, n_tokens} + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); + cb(q_pe, "q_pe", il); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, + // shared RoPE key + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); + cb(k_pe, "k_pe", il); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); + ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); cur = build_attn(inp_attn, gf, - model.layers[il].wo, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + model.layers[il].wo, NULL, + q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -13475,40 +14883,23 @@ struct llm_build_chameleon : public llm_graph_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - if (hparams.swin_norm) { - cur = build_norm(cur, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - } - ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // feed-forward network - if (!hparams.swin_norm) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - } + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, + NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); - if (hparams.swin_norm) { - cur = build_norm(cur, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); cur = build_cvec(cur, il); cb(cur, "l_out", il); @@ -13526,22 +14917,7 @@ struct llm_build_chameleon : public llm_graph_context { cb(cur, "result_norm", -1); res->t_embd = cur; - // lm_head cur = build_lora_mm(model.output, cur); - cb(cur, "result_output_with_img_logits", -1); - - // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. - // Needs to be removed once image outputs are supported. - int img_token_end_idx = 8196; - int img_token_start_idx = 4; - int num_img_tokens = img_token_end_idx - img_token_start_idx; - // creates 1d tensor of size num_img_tokens and values -FLT_MAX, - // which ensures that text token values are always at least larger than image token values - ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); - img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); - cb(img_logits, "img_logits", -1); - - cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); cb(cur, "result_output", -1); res->t_logits = cur; @@ -13550,170 +14926,160 @@ struct llm_build_chameleon : public llm_graph_context { } }; -struct llm_build_wavtokenizer_dec : public llm_graph_context { - llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +struct llm_build_bailingmoe : public llm_graph_context { + llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { ggml_tensor * cur; ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - - cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.conv1d_b); - - // posnet - for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { - const auto & layer = model.layers[il].posnet; - - inpL = cur; - - switch (il) { - case 0: - case 1: - case 3: - case 4: - { - cur = build_norm(cur, - layer.norm1, - layer.norm1_b, - LLM_NORM_GROUP, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.conv1_b); - - cur = build_norm(cur, - layer.norm2, - layer.norm2_b, - LLM_NORM_GROUP, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } break; - case 2: - { - cur = build_norm(cur, - layer.attn_norm, - layer.attn_norm_b, - LLM_NORM_GROUP, 0); + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * q; - ggml_tensor * k; - ggml_tensor * v; + auto * inp_attn = build_attn_inp_kv_unified(); - q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); - k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); - v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); + ggml_tensor * inp_out_ids = build_inp_out_ids(); - q = ggml_add(ctx0, q, layer.attn_q_b); - k = ggml_add(ctx0, k, layer.attn_k_b); - v = ggml_add(ctx0, v, layer.attn_v_b); + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; - q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); - k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); - kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } - cur = ggml_mul_mat(ctx0, kq, v); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } - cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.attn_o_b); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } - cur = ggml_add(ctx0, cur, inpL); - } break; - case 5: - { - cur = build_norm(cur, - layer.norm, - layer.norm_b, - LLM_NORM_GROUP, 0); - } break; - default: GGML_ABORT("unknown posnet layer"); - }; - } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); - cur = build_norm(cur, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, -1); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - inpL = cur; + cur = build_attn(inp_attn, gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); + } - // convnext - for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { - const auto & layer = model.layers[il].convnext; + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } - cur = inpL; + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.dw_b); + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + false, hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); - cur = build_norm(cur, - layer.norm, - layer.norm_b, - LLM_NORM, -1); + // FFN shared expert + { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); - cur = build_ffn(cur, - layer.pw1, layer.pw1_b, NULL, - NULL, NULL, NULL, - layer.pw2, layer.pw2_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, il); + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } - cur = ggml_mul(ctx0, cur, layer.gamma); + cur = ggml_add(ctx0, cur, ffn_inp); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cur = build_cvec(cur, il); + cb(cur, "l_out", il); - inpL = ggml_add(ctx0, cur, inpL); + // input for next layer + inpL = cur; } cur = inpL; - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); - cur = ggml_add(ctx0, cur, model.output_b); - - cb(cur, "result_embd", -1); - res->t_embd = cur; + cb(cur, "result_output", -1); + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } }; -struct llm_build_plm : public llm_graph_context { - llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k)); +struct llm_build_dots1 : public llm_graph_context { + llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); ggml_tensor * cur; ggml_tensor * inpL; - // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions @@ -13734,96 +15100,45 @@ struct llm_build_plm : public llm_graph_context { // self_attention { - ggml_tensor * q = NULL; - q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(q, "q", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); - - // split into {kv_lora_rank, n_tokens} - ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], - 0); - cb(kv_compressed, "kv_compressed", il); - - // and {n_embd_head_qk_rope, n_tokens} - ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, il); - cb(kv_compressed, "kv_compressed", il); - - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - // split into {n_head * n_embd_head_qk_nope, n_tokens} - ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - // and {n_head * n_embd_head_v, n_tokens} - ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - cb(q_pe, "q_pe", il); - // shared RoPE key - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - cb(k_pe, "k_pe", il); - - ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); - ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, gf, - model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -13834,18 +15149,48 @@ struct llm_build_plm : public llm_graph_context { ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); + // MoE branch cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); - cb(cur, "ffn_out", il); + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(moe_out, "ffn_moe_out", il); + + { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } cur = ggml_add(ctx0, cur, ffn_inp); @@ -13865,6 +15210,7 @@ struct llm_build_plm : public llm_graph_context { cb(cur, "result_norm", -1); res->t_embd = cur; + // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -13874,8 +15220,13 @@ struct llm_build_plm : public llm_graph_context { } }; -struct llm_build_bailingmoe : public llm_graph_context { - llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +struct llm_build_ernie4_5 : public llm_graph_context { + llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + ggml_tensor * cur; ggml_tensor * inpL; @@ -13886,23 +15237,19 @@ struct llm_build_bailingmoe : public llm_graph_context { auto * inp_attn = build_attn_inp_kv_unified(); - ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); + { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } // self-attention { - // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); - - // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { @@ -13924,18 +15271,18 @@ struct llm_build_bailingmoe : public llm_graph_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -13945,11 +15292,13 @@ struct llm_build_bailingmoe : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, gf, - model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -13957,36 +15306,19 @@ struct llm_build_bailingmoe : public llm_graph_context { ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - false, hparams.expert_weights_scale, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - il); - cb(moe_out, "ffn_moe_out", il); - - // FFN shared expert + // feed-forward network { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); cb(cur, "ffn_out", il); } @@ -14018,8 +15350,134 @@ struct llm_build_bailingmoe : public llm_graph_context { } }; -struct llm_build_dots1 : public llm_graph_context { - llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +struct llm_build_falcon_h1 : public llm_graph_context_mamba { + llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Build the inputs in the recurrent & kv cache + auto * inp = build_inp_mem_hybrid(); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur-post-rope", il); + cb(Kcur, "Kcur-post-rope", il); + cb(Vcur, "Vcur-post-rope", il); + + ggml_tensor * attn_out = build_attn(inp->get_attn(), gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(attn_out, "attn_out", il); + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + // Mamba2 layer + cb(cur, "ssm_in", il); + + ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il); + cb(ssm_out, "ssm_out", il); + + // // Aggregation + cur = ggml_add(ctx0, attn_out, ssm_out); + inpSA = ggml_add(ctx0, cur, inpSA); + cb(cur, "layer_out", il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = inpSA; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +struct llm_build_arcee : public llm_graph_context { + llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14035,6 +15493,8 @@ struct llm_build_dots1 : public llm_graph_context { auto * inp_attn = build_attn_inp_kv_unified(); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { @@ -14046,36 +15506,45 @@ struct llm_build_dots1 : public llm_graph_context { LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - // self_attention + // self-attention { + // rope freq factors for llama3; may return nullptr for llama2 and other models + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -14086,7 +15555,8 @@ struct llm_build_dots1 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); } if (il == n_layer - 1 && inp_out_ids) { @@ -14097,50 +15567,23 @@ struct llm_build_dots1 : public llm_graph_context { ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // MoE branch + // feed-forward network + // ARCEE uses relu^2 instead of silu cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - } else { - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - model.layers[il].ffn_exp_probs_b, - n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - true, hparams.expert_weights_scale, - (llama_expert_gating_func_type) hparams.expert_gating_func, - il); - cb(moe_out, "ffn_moe_out", il); - - { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } - } + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); cur = build_cvec(cur, il); cb(cur, "l_out", il); @@ -14168,8 +15611,8 @@ struct llm_build_dots1 : public llm_graph_context { } }; -struct llm_build_ernie4_5 : public llm_graph_context { - llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +struct llm_build_hunyuan_moe : public llm_graph_context { + llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14185,19 +15628,25 @@ struct llm_build_ernie4_5 : public llm_graph_context { auto * inp_attn = build_attn_inp_kv_unified(); + const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm - { - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - } + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); // self-attention { + // rope freq factors for llama3; may return nullptr for llama2 and other models + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { @@ -14224,29 +15673,38 @@ struct llm_build_ernie4_5 : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, nullptr, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_norm", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, nullptr, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_norm", il); cur = build_attn(inp_attn, gf, - model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); } - if (il == n_layer - 1) { - // skip computing output for unused tokens - ggml_tensor * inp_out_ids = build_inp_out_ids(); + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -14254,23 +15712,40 @@ struct llm_build_ernie4_5 : public llm_graph_context { ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - } + // feed-forward network (non-MoE) + ggml_tensor * cur_mlp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur_mlp, "ffn_mlp", il); - cur = ggml_add(ctx0, cur, ffn_inp); + // MoE branch + ggml_tensor * cur_moe = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, + true, // norm_topk_prob + false, + 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur_moe, "ffn_moe_out", il); + + ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp); + cb(ffn_out, "ffn_out", il); + + cur = ggml_add(ctx0, ffn_out, ffn_inp); cur = build_cvec(cur, il); cb(cur, "l_out", il); @@ -14290,7 +15765,6 @@ struct llm_build_ernie4_5 : public llm_graph_context { // lm_head cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); res->t_logits = cur; @@ -14298,8 +15772,8 @@ struct llm_build_ernie4_5 : public llm_graph_context { } }; -struct llm_build_arcee : public llm_graph_context { - llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +struct llm_build_smollm3 : public llm_graph_context { + llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14322,6 +15796,8 @@ struct llm_build_arcee : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; + const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0; + // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -14330,9 +15806,6 @@ struct llm_build_arcee : public llm_graph_context { // self-attention { - // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); - // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -14359,17 +15832,19 @@ struct llm_build_arcee : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + if (use_rope) { + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -14390,19 +15865,20 @@ struct llm_build_arcee : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - // ARCEE uses relu^2 instead of silu - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); - cb(cur, "ffn_out", il); + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); @@ -14433,6 +15909,163 @@ struct llm_build_arcee : public llm_graph_context { } }; +struct llm_build_lfm2 : public llm_graph_context { + const llama_model & model; + + llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { + + ggml_tensor * cur = build_inp_embd(model.tok_embd); + cb(cur, "model.embed_tokens", -1); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_hybrid = build_inp_mem_hybrid(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + auto * prev_cur = cur; + cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "model.layers.{}.operator_norm", il); + + cur = hparams.is_recurrent(il) ? + build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) : + build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ; + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids); + } + + cur = ggml_add(ctx0, prev_cur, cur); + cur = ggml_add(ctx0, cur, build_feed_forward(cur, il)); + } + + cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "model.embedding_norm", -1); + res->t_embd = cur; + + // lm_head is tied with embeddings + cur = build_lora_mm(model.tok_embd, cur); + cb(cur, "lm_head", -1); + + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * build_feed_forward(ggml_tensor * cur, + int il) const { + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "model.layers.{}.ffn_norm", il); + + GGML_ASSERT(!model.layers[il].ffn_up_b); + GGML_ASSERT(!model.layers[il].ffn_gate_b); + GGML_ASSERT(!model.layers[il].ffn_down_b); + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "model.layers.{}.feed_forward.w2", il); + + return cur; + } + + ggml_tensor * build_attn_block(ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv_unified * inp_attn, + int il) const { + GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); + auto const n_embd_head = hparams.n_embd_head_v; + auto const n_head_kv = hparams.n_head_kv(il); + + auto * q = build_lora_mm(model.layers[il].wq, cur); + cb(q, "model.layers.{}.self_attn.q_proj", il); + auto * k = build_lora_mm(model.layers[il].wk, cur); + cb(k, "model.layers.{}.self_attn.k_proj", il); + auto * v = build_lora_mm(model.layers[il].wv, cur); + cb(v, "model.layers.{}.self_attn.v_proj", il); + + q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens); + + // qk norm + q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(q, "model.layers.{}.self_attn.q_layernorm", il); + k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(k, "model.layers.{}.self_attn.k_layernorm", il); + + // RoPE + q = ggml_rope_ext( + ctx0, q, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + k = ggml_rope_ext( + ctx0, k, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, + q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + + cb(cur, "model.layers.{}.self_attn.out_proj", il); + + return cur; + } + + ggml_tensor * build_shortconv_block(ggml_cgraph * gf, + ggml_tensor * cur, + llm_graph_input_rs * inp_recr, + int il) { + const auto * mctx_cur = static_cast(mctx)->get_recr(); + + auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur); + cb(bcx, "model.layers.{}.conv.in_proj", il); + + constexpr auto n_chunks = 3; + GGML_ASSERT(bcx->ne[0] % n_chunks == 0); + auto const chunk_size = bcx->ne[0] / n_chunks; + auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx)); + auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx)); + auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx)); + + auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); + + // read conv state directly, with build_rs generation is slower + ggml_tensor * conv_state = mctx_cur->get_r_l(il); + const int64_t n_seqs = ubatch.n_seqs; + ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs); + conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs); + + bx = ggml_concat(ctx0, conv, bx, 0); + GGML_ASSERT(bx->ne[0] > conv->ne[0]); + + auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); + GGML_ASSERT(ggml_are_same_shape(conv, new_conv)); + + // write conv state + ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state)); + + auto * conv_kernel = model.layers[il].shortconv.conv; + GGML_ASSERT(hparams.n_shortconv_l_cache > 0); + + // construct ssm_conv op + ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); + cb(conv_out, "model.layers.{}.conv.conv", il); + + auto * y = ggml_mul(ctx0, c, conv_out); + + y = build_lora_mm(model.layers[il].shortconv.out_proj, y); + cb(y, "model.layers.{}.conv.out_proj", il); + + return y; + } +}; + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -14479,7 +16112,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* recurrent_type_v */ GGML_TYPE_F32, /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max), /* n_seq_max */ cparams.n_seq_max, - /* offload */ cparams.offload_kqv); + /* offload */ cparams.offload_kqv, + /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr, + /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr); } else { const auto padding = llama_kv_cache_unified::get_padding(cparams); @@ -14668,9 +16303,14 @@ llm_graph_result_ptr llama_model::build_graph( llm = std::make_unique(*this, params, gf); } break; case LLM_ARCH_MAMBA: + case LLM_ARCH_MAMBA2: { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_JAMBA: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_XVERSE: { llm = std::make_unique(*this, params, gf); @@ -14784,6 +16424,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_GRANITE_HYBRID: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_CHAMELEON: { llm = std::make_unique(*this, params, gf); @@ -14812,6 +16456,22 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_HUNYUAN_MOE: + { + llm = std::make_unique(*this, params, gf); + } break; + case LLM_ARCH_SMOLLM3: + { + llm = std::make_unique(*this, params, gf); + } break; + case LLM_ARCH_FALCON_H1: + { + llm = std::make_unique(*this, params, gf); + } break; + case LLM_ARCH_LFM2: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -14928,6 +16588,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_REFACT: case LLM_ARCH_BLOOM: case LLM_ARCH_MAMBA: + case LLM_ARCH_MAMBA2: + case LLM_ARCH_JAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: @@ -14959,15 +16621,18 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GLM4: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: + case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_CHAMELEON: case LLM_ARCH_BAILINGMOE: case LLM_ARCH_NEO_BERT: + case LLM_ARCH_SMOLLM3: case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: + case LLM_ARCH_FALCON_H1: case LLM_ARCH_GROK: case LLM_ARCH_DBRX: case LLM_ARCH_BERT: @@ -14999,6 +16664,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_EXAONE: case LLM_ARCH_MINICPM3: case LLM_ARCH_DOTS1: + case LLM_ARCH_HUNYUAN_MOE: + case LLM_ARCH_LFM2: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/examples/talk-llama/llama-model.h b/examples/talk-llama/llama-model.h index a958c5997a1..027a7f0c3e2 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -32,17 +32,21 @@ enum llm_type { LLM_TYPE_190M, LLM_TYPE_220M, LLM_TYPE_250M, + LLM_TYPE_256M, LLM_TYPE_270M, LLM_TYPE_335M, + LLM_TYPE_350M, LLM_TYPE_410M, LLM_TYPE_450M, LLM_TYPE_475M, + LLM_TYPE_700M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_3B, LLM_TYPE_0_5B, LLM_TYPE_0_6B, LLM_TYPE_1B, + LLM_TYPE_1_2B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, @@ -94,6 +98,7 @@ enum llm_type { LLM_TYPE_57B_A14B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_A13B, LLM_TYPE_30B_A3B, LLM_TYPE_235B_A22B, LLM_TYPE_E2B, @@ -153,6 +158,12 @@ struct llama_layer_convnext { struct ggml_tensor * gamma = nullptr; }; +struct llama_layer_shortconv { + struct ggml_tensor * in_proj = nullptr; + struct ggml_tensor * conv = nullptr; + struct ggml_tensor * out_proj = nullptr; +}; + struct llama_layer { // normalization struct ggml_tensor * attn_norm = nullptr; @@ -172,6 +183,10 @@ struct llama_layer { struct ggml_tensor * ffn_sub_norm = nullptr; struct ggml_tensor * attn_norm_cross = nullptr; struct ggml_tensor * attn_norm_enc = nullptr; + struct ggml_tensor * ssm_norm = nullptr; + struct ggml_tensor * ssm_dt_norm = nullptr; + struct ggml_tensor * ssm_b_norm = nullptr; + struct ggml_tensor * ssm_c_norm = nullptr; // attention struct ggml_tensor * wq = nullptr; @@ -335,6 +350,8 @@ struct llama_layer { struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; + + struct llama_layer_shortconv shortconv; }; struct llama_model { diff --git a/examples/talk-llama/llama-quant.cpp b/examples/talk-llama/llama-quant.cpp index f4b5713d7dd..4dbd1e30991 100644 --- a/examples/talk-llama/llama-quant.cpp +++ b/examples/talk-llama/llama-quant.cpp @@ -844,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // do not quantize Mamba's small yet 2D weights // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights quantize &= name.find("time_mix_first.weight") == std::string::npos; diff --git a/examples/talk-llama/llama-vocab.cpp b/examples/talk-llama/llama-vocab.cpp index 5c9eb87566d..e0e578d6394 100644 --- a/examples/talk-llama/llama-vocab.cpp +++ b/examples/talk-llama/llama-vocab.cpp @@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { break; case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_QWEN2: + case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: regex_exprs = { // original regex from tokenizer.json // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" @@ -1522,7 +1523,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || - tokenizer_pre == "pixtral") { + tokenizer_pre == "falcon-h1" || + tokenizer_pre == "pixtral" || + tokenizer_pre == "midm-2.0" || + tokenizer_pre == "lfm2") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; @@ -1554,7 +1558,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "jina-de" || tokenizer_pre == "gigachat" || tokenizer_pre == "jina-v2-es" || - tokenizer_pre == "jina-v2-de") { + tokenizer_pre == "jina-v2-de" || + tokenizer_pre == "a.x-4.0") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "jina-v1-en" || @@ -1656,6 +1661,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "seed-coder") { pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; clean_spaces = false; + } else if ( + tokenizer_pre == "hunyuan") { + pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -1839,6 +1848,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "" || t.first == "_" || t.first == "<|end▁of▁sentence|>" // DeepSeek + || t.first == "" // smoldocling ) { special_eot_id = t.second; if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { @@ -1998,6 +2008,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "" || t.first == "_" || t.first == "<|end_of_text|>" + || t.first == "" // smoldocling ) { special_eog_ids.insert(t.second); if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { diff --git a/examples/talk-llama/llama-vocab.h b/examples/talk-llama/llama-vocab.h index 40e4d1c05b1..46a1ccecb51 100644 --- a/examples/talk-llama/llama-vocab.h +++ b/examples/talk-llama/llama-vocab.h @@ -6,6 +6,47 @@ #include #include +// pre-tokenization types +enum llama_vocab_pre_type { + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, + LLAMA_VOCAB_PRE_TYPE_FALCON = 4, + LLAMA_VOCAB_PRE_TYPE_MPT = 5, + LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, + LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + LLAMA_VOCAB_PRE_TYPE_REFACT = 8, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, + LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10, + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, + LLAMA_VOCAB_PRE_TYPE_OLMO = 12, + LLAMA_VOCAB_PRE_TYPE_DBRX = 13, + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, + LLAMA_VOCAB_PRE_TYPE_PORO = 15, + LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16, + LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, + LLAMA_VOCAB_PRE_TYPE_VIKING = 18, + LLAMA_VOCAB_PRE_TYPE_JAIS = 19, + LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, + LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, + LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, + LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, + LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, + LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, + LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, + LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, + LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, +}; + struct LLM_KV; struct llama_model_loader; diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index 3eda9bc6860..f73b1ab65fe 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h @@ -79,46 +79,6 @@ extern "C" { LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization }; - // pre-tokenization types - enum llama_vocab_pre_type { - LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, - LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, - LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, - LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, - LLAMA_VOCAB_PRE_TYPE_FALCON = 4, - LLAMA_VOCAB_PRE_TYPE_MPT = 5, - LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, - LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, - LLAMA_VOCAB_PRE_TYPE_REFACT = 8, - LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, - LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10, - LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, - LLAMA_VOCAB_PRE_TYPE_OLMO = 12, - LLAMA_VOCAB_PRE_TYPE_DBRX = 13, - LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, - LLAMA_VOCAB_PRE_TYPE_PORO = 15, - LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16, - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, - LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, - LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, - LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, - LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, - LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, - LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, - LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, - LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, - LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, - LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, - LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, - LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, - LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, - LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, - LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, - }; - enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, LLAMA_ROPE_TYPE_NORM = 0, From 3775c503d5133d3d8b99d7d062e87a54064b0eb8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 12 Jul 2025 16:26:44 +0300 Subject: [PATCH 0098/1348] sync : resolve conflicts (#0) ggml-ci --- ggml/CMakeLists.txt | 2 -- ggml/src/CMakeLists.txt | 1 - ggml/src/ggml-backend-reg.cpp | 8 -------- 3 files changed, 11 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 675f3bf7567..eaba9c70469 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) -option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) @@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS include/ggml-cann.h include/ggml-cpp.h include/ggml-cuda.h - include/ggml-kompute.h include/ggml-opt.h include/ggml-metal.h include/ggml-rpc.h diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9cb2c228dcf..8760c2d35ec 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -365,7 +365,6 @@ ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) ggml_add_backend(HIP) -ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(MUSA) ggml_add_backend(RPC) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 2d93771fd1c..042ea77aca7 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -61,10 +61,6 @@ #include "ggml-cann.h" #endif -#ifdef GGML_USE_KOMPUTE -#include "ggml-kompute.h" -#endif - // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -189,9 +185,6 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif -#ifdef GGML_USE_KOMPUTE - register_backend(ggml_backend_kompute_reg()); -#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif @@ -575,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("cann", silent, dir_path); ggml_backend_load_best("cuda", silent, dir_path); ggml_backend_load_best("hip", silent, dir_path); - ggml_backend_load_best("kompute", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); From a16da91365700f396da916d16a7f5a2ec99364b9 Mon Sep 17 00:00:00 2001 From: Greg Sadetsky Date: Sat, 12 Jul 2025 17:22:35 -0400 Subject: [PATCH 0099/1348] examples : update links in wasm examples (#3318) * fix 404 link * update link in whisper.wasm example * update example in command.wasm * update link in bench.wasm example * update link in stream.wasm example --- examples/bench.wasm/README.md | 2 +- examples/command.wasm/README.md | 2 +- examples/stream.wasm/README.md | 2 +- examples/whisper.wasm/README.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/bench.wasm/README.md b/examples/bench.wasm/README.md index 726d4d08e0a..1a16a11018f 100644 --- a/examples/bench.wasm/README.md +++ b/examples/bench.wasm/README.md @@ -2,7 +2,7 @@ Benchmark the performance of whisper.cpp in the browser using WebAssembly -Link: https://ggerganov.github.io/whisper.cpp/bench.wasm +Link: https://ggml.ai/whisper.cpp/bench.wasm/ Terminal version: [examples/bench](/examples/bench) diff --git a/examples/command.wasm/README.md b/examples/command.wasm/README.md index 78d8776f570..b46b89e2043 100644 --- a/examples/command.wasm/README.md +++ b/examples/command.wasm/README.md @@ -3,7 +3,7 @@ This is a basic Voice Assistant example that accepts voice commands from the microphone. It runs in fully in the browser via WebAseembly. -Online demo: https://ggerganov.github.io/whisper.cpp/command.wasm +Online demo: https://ggml.ai/whisper.cpp/command.wasm/ Terminal version: [examples/command](/examples/command) diff --git a/examples/stream.wasm/README.md b/examples/stream.wasm/README.md index f3c0c3d7582..29ff982d617 100644 --- a/examples/stream.wasm/README.md +++ b/examples/stream.wasm/README.md @@ -2,7 +2,7 @@ Real-time transcription in the browser using WebAssembly -Online demo: https://whisper.ggerganov.com/stream/ +Online demo: https://ggml.ai/whisper.cpp/stream.wasm/ ## Build instructions diff --git a/examples/whisper.wasm/README.md b/examples/whisper.wasm/README.md index f1bb7cadde2..b267d3d242b 100644 --- a/examples/whisper.wasm/README.md +++ b/examples/whisper.wasm/README.md @@ -22,7 +22,7 @@ audio is limited to 120 seconds. ## Live demo -Link: https://ggerganov.github.io/whisper.cpp/ +Link: https://ggml.ai/whisper.cpp/ ![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png) From 032697b9a850dc2615555e2a93a683cc3dd58559 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Mon, 14 Jul 2025 14:13:44 +0200 Subject: [PATCH 0100/1348] whisper: validate get_rows support for cpu extra buffer (#3323) --- src/whisper.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 347cc178ee7..5c08478aefd 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1438,7 +1438,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * op_supported = true; } else { switch (op) { - // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT + // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS + case GGML_OP_GET_ROWS: case GGML_OP_MUL_MAT: { ggml_init_params params = { /*.mem_size =*/ 2 * ggml_tensor_overhead(), @@ -1454,9 +1455,15 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * ggml_tensor * op_tensor = nullptr; - int64_t n_ctx = hparams.n_audio_ctx; - ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); - op_tensor = ggml_mul_mat(ctx, w, b); + if (op == GGML_OP_MUL_MAT) { + int64_t n_ctx = hparams.n_audio_ctx; + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); + op_tensor = ggml_mul_mat(ctx, w, b); + } else if (op == GGML_OP_GET_ROWS) { + int64_t num_indices = 8; + ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices); + op_tensor = ggml_get_rows(ctx, w, indices); + } // create a temporary dummy buffer for the weight so that supports_op can check the buffer type GGML_ASSERT(w->buffer == nullptr); From 66b3a39bdc166e42874a257b796559d1c5a382bf Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 12 Jul 2025 21:31:38 +0800 Subject: [PATCH 0101/1348] CUDA: add set rows for f32 and f16 (llama/14551) * CUDA: add set rows for f32 and f16 * Review: change kernel params, use strides from host * Use 1-d kernel * Review: use int64_t for blockDim.x, rename nb->s for clarity --- ggml/src/ggml-cuda/ggml-cuda.cu | 10 +++ ggml/src/ggml-cuda/set-rows.cu | 130 ++++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/set-rows.cuh | 7 ++ 3 files changed, 147 insertions(+) create mode 100644 ggml/src/ggml-cuda/set-rows.cu create mode 100644 ggml/src/ggml-cuda/set-rows.cuh diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 72406f0af36..88b17dd682c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -43,6 +43,7 @@ #include "ggml-cuda/upscale.cuh" #include "ggml-cuda/wkv.cuh" #include "ggml-cuda/gla.cuh" +#include "ggml-cuda/set-rows.cuh" #include "ggml.h" #include @@ -2230,6 +2231,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_GET_ROWS_BACK: ggml_cuda_op_get_rows_back(ctx, dst); break; + case GGML_OP_SET_ROWS: + ggml_cuda_op_set_rows(ctx, dst); + break; case GGML_OP_DUP: ggml_cuda_dup(ctx, dst); break; @@ -3216,6 +3220,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g { return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; } break; + case GGML_OP_SET_ROWS: + { + return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && + op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_I64; + } break; case GGML_OP_CPY: { ggml_type src0_type = op->src[0]->type; diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu new file mode 100644 index 00000000000..d8b3e63e1aa --- /dev/null +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -0,0 +1,130 @@ +#include "set-rows.cuh" + +typedef void (*set_rows_kernel_t)(const char * src, char * dst); + +template +__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {} + +template<> +__device__ __forceinline__ void set_rows_1(const float * src_f, half * dst_h) { + *dst_h = __float2half(*src_f); +} + +template<> +__device__ __forceinline__ void set_rows_1(const float * src_f, float * dst_f) { + *dst_f = *src_f; +} + +template +static __global__ void k_set_rows( + const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t s10, const int64_t s11, const int64_t s12, + const int64_t s1, const int64_t s2, const int64_t s3) { + + const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; + const int64_t ne_total = ne00 * ne01 * ne02 * ne03; + + if (i >= ne_total) { + return; + } + + const int64_t i03 = i / (ne00 * ne01 * ne02); + const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00; + const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00; + + const int64_t i12 = i03 % ne12; + const int64_t i11 = i02 % ne11; + const int64_t i10 = i01; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + + const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; + dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3; + + const src_t* src_elem = src0_row + i00; + dst_t* dst_elem = dst_row_ptr + i00; + set_rows_1(src_elem, dst_elem); +} + +template +static void set_rows_cuda( + const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + + const int64_t ne_total = ne00 * ne01 * ne02 * ne03; + const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE; + const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE); + const dim3 grid_size(num_blocks); + + + const int64_t s01 = nb01/sizeof(src_t); + const int64_t s02 = nb02/sizeof(src_t); + const int64_t s03 = nb03/sizeof(src_t); + const int64_t s10 = nb10/sizeof(int64_t); + const int64_t s11 = nb11/sizeof(int64_t); + const int64_t s12 = nb12/sizeof(int64_t); + const int64_t s1 = nb1/sizeof(dst_t); + const int64_t s2 = nb2/sizeof(dst_t); + const int64_t s3 = nb3/sizeof(dst_t); + + if (ne_total > 0) { + k_set_rows<<>>( + src0_d, src1_d, dst_d, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + s01, s02, s03, + s10, s11, s12, + s1, s2, s3); + } +} + + +void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_I64); + + GGML_TENSOR_BINARY_OP_LOCALS + + const float * src0_d = (const float *)src0->data; + const int64_t * src1_d = (const int64_t *)src1->data; + + cudaStream_t stream = ctx.stream(); + + + + if (dst->type == GGML_TYPE_F32) { + set_rows_cuda( + src0_d, src1_d, (float*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_F16) { + set_rows_cuda( + src0_d, src1_d, (half*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else { + GGML_ABORT("unsupported type"); + } +} diff --git a/ggml/src/ggml-cuda/set-rows.cuh b/ggml/src/ggml-cuda/set-rows.cuh new file mode 100644 index 00000000000..c140c0873c8 --- /dev/null +++ b/ggml/src/ggml-cuda/set-rows.cuh @@ -0,0 +1,7 @@ +#pragma once + +#include "common.cuh" + +#define CUDA_SET_ROWS_BLOCK_SIZE 256 + +void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); From 3cad26d80731e8e95685f377e5d17d012a0805eb Mon Sep 17 00:00:00 2001 From: Yavor Ivanov Date: Sat, 12 Jul 2025 22:38:13 -0700 Subject: [PATCH 0102/1348] metal : Add missing unary ops Metal support (llama/14660) --- ggml/src/ggml-metal/ggml-metal.m | 90 ++++++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 83a0739809a..44ddc69d08f 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -173,6 +173,12 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_SILU, GGML_METAL_KERNEL_TYPE_SILU_4, GGML_METAL_KERNEL_TYPE_ELU, + GGML_METAL_KERNEL_TYPE_ABS, + GGML_METAL_KERNEL_TYPE_SGN, + GGML_METAL_KERNEL_TYPE_STEP, + GGML_METAL_KERNEL_TYPE_HARDSWISH, + GGML_METAL_KERNEL_TYPE_HARDSIGMOID, + GGML_METAL_KERNEL_TYPE_EXP, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, @@ -1155,6 +1161,12 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU, elu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ABS, abs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SGN, sgn, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_STEP, step, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSWISH, hardswish, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSIGMOID, hardsigmoid, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_EXP, exp, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction); @@ -1688,6 +1700,12 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_EXP: return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; default: return false; @@ -2439,6 +2457,78 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_UNARY_OP_ABS: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ABS].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_SGN: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SGN].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_STEP: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_STEP].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_HARDSWISH: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_HARDSIGMOID: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_EXP: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_EXP].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: { GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 239ec31fbcb..13235e28852 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1199,6 +1199,51 @@ kernel void kernel_neg( dst[tpig] = -src0[tpig]; } +kernel void kernel_abs( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = fabs(src0[tpig]); +} + +kernel void kernel_sgn( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f); +} + +kernel void kernel_step( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] > 0.0f ? 1.0f : 0.0f; +} + +kernel void kernel_hardswish( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = x * fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f)); +} + +kernel void kernel_hardsigmoid( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f)); +} + +kernel void kernel_exp( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = exp(src0[tpig]); +} + kernel void kernel_reglu( device const char * src0, device const char * src1, From 21308b4e6eb89d388dd56a534336a9603338b903 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Jul 2025 10:36:33 +0300 Subject: [PATCH 0103/1348] ggml : add build-time message to remind about ggml_set_rows (llama/14661) ggml-ci --- ggml/src/ggml-cann/ggml-cann.cpp | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 1 + 4 files changed, 4 insertions(+) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index ccb17eb072e..e5e11d4cdce 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2090,6 +2090,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, { // TODO: add support // ref: https://github.com/ggml-org/llama.cpp/pull/14274 +#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") return false; } break; case GGML_OP_CPY: { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 88b17dd682c..1478245998a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3222,6 +3222,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_SET_ROWS: { +#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64; diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 58830b733a8..3388259152b 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2280,6 +2280,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te { // TODO: add support // ref: https://github.com/ggml-org/llama.cpp/pull/14274 +#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") if (op->src[0]->type != GGML_TYPE_F32) { return false; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 65b26fd0276..7f74fbfe5c1 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4303,6 +4303,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g { // TODO: add support // ref: https://github.com/ggml-org/llama.cpp/pull/14274 +#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64)); } break; case GGML_OP_CPY: From fe33572b227c58173300f02d641afec3c758d204 Mon Sep 17 00:00:00 2001 From: Yavor Ivanov Date: Sun, 13 Jul 2025 02:33:16 -0700 Subject: [PATCH 0104/1348] cuda : add ELU support (llama/14657) --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++ ggml/src/ggml-cuda/unary.cu | 7 +++++++ ggml/src/ggml-cuda/unary.cuh | 2 ++ 3 files changed, 13 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1478245998a..c7222207efe 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2303,6 +2303,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_UNARY_OP_EXP: ggml_cuda_op_exp(ctx, dst); break; + case GGML_UNARY_OP_ELU: + ggml_cuda_op_elu(ctx, dst); + break; default: return false; } @@ -3116,6 +3119,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_ELU: return ggml_is_contiguous(op->src[0]); default: return false; diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index f9c7b83c40d..91c830c4dac 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) { return logf(x); } +static __device__ __forceinline__ float op_elu(float x) { + return (x > 0.f) ? x : expm1f(x); +} + template static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary(ctx, dst); } +void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary(ctx, dst); +} /* gated ops */ template diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 289d690e5cf..cb14d16f8f3 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -59,6 +59,8 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); From 0611387d17f0eed04bb16c808371d14b4c2f2e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 13 Jul 2025 15:01:24 +0200 Subject: [PATCH 0105/1348] cuda : add set rows for bf16 (llama/14664) --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- ggml/src/ggml-cuda/set-rows.cu | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c7222207efe..8015b0d4e8d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3226,8 +3226,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_SET_ROWS: { -#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") - return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && +#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") + return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64; } break; diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index d8b3e63e1aa..3fade72b84e 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -10,6 +10,11 @@ __device__ __forceinline__ void set_rows_1(const float * src_f, hal *dst_h = __float2half(*src_f); } +template<> +__device__ __forceinline__ void set_rows_1(const float * src_f, nv_bfloat16 * dst_b) { + *dst_b = *src_f; +} + template<> __device__ __forceinline__ void set_rows_1(const float * src_f, float * dst_f) { *dst_f = *src_f; @@ -124,6 +129,16 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { nb1, nb2, nb3, stream ); + } else if (dst->type == GGML_TYPE_BF16) { + set_rows_cuda( + src0_d, src1_d, (nv_bfloat16*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); } else { GGML_ABORT("unsupported type"); } From 24803d62c6394a54c3866fe3c8694655c61c8c9a Mon Sep 17 00:00:00 2001 From: Anton Mitkov Date: Mon, 14 Jul 2025 10:37:35 +0100 Subject: [PATCH 0106/1348] sycl: Batched mulmat rework for oneDNN dispatch (llama/14617) --- ggml/src/ggml-sycl/gemm.hpp | 40 +++----- ggml/src/ggml-sycl/ggml-sycl.cpp | 165 ++++++++++++++++++++++--------- 2 files changed, 133 insertions(+), 72 deletions(-) diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 5efe03d364b..dcf6c7aeeb4 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -32,39 +32,28 @@ class DnnlGemmWrapper { else static_assert(0); } - // matrix A has m rows, k columns - // matrix B has k rows, n columns - // nra - number of elements to skip when moving into next row in A - // nrb - number of elements to skip when moving into next row in B - // nca - number of elements to skip when moving into next column in A - // ncb - number of elements to skip when moving into next column in B - // stride_a - number of elements to skip when moving to next A matrix - // stride_b - number of elements to skip when moving to next B matrix - // batches_a - number of A matrices - // batches_b - number of B matrices static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k, - const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a, - const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b, + const void * a, dt at, dnnl_dim_t stra0, dnnl_dim_t stra1, dnnl_dim_t stra2, + const void * b, dt bt, dnnl_dim_t strb0, dnnl_dim_t strb1, dnnl_dim_t strb2, void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) { auto stream = ctx.stream_dnnl(q); auto eng = ctx.engine_dnnl(q); - // { # strides, # rows, # columns } - dnnl::memory::dims a_dims = { batches_a, m, k }; - dnnl::memory::dims b_dims = { batches_b, k, n }; - dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n }; - - // { # elements to skip to next stride, # elements to skip to next row, # elements to skip to next column } - dnnl::memory::dims a_strides = { stride_a, nra, nca }; - dnnl::memory::dims b_strides = { stride_b, nrb, ncb }; - + dnnl::memory::dims a_dims = {batches_a, m, k }; + dnnl::memory::dims a_strides = {stra2, stra1, stra0}; const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides); + + dnnl::memory::dims b_dims = {batches_b, k, n }; + dnnl::memory::dims b_strides = {strb2, strb0, strb1}; const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides); - const auto c_md = dnnl::memory::desc(c_dims, ct, tag::abc); + dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n}; + dnnl::memory::dims c_strides = {m*n, 1, m }; + const auto c_md = dnnl::memory::desc(c_dims, ct, c_strides); dnnl::primitive_attr primitive_attr; primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + #ifdef GGML_SYCL_F16 primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16); #endif @@ -76,24 +65,23 @@ class DnnlGemmWrapper { auto scratchpad_md = matmul_pd.scratchpad_desc(); auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q); + auto matmul_prim = dnnl::matmul(matmul_pd); std::unordered_map matmul_args; matmul_args.insert({ DNNL_ARG_SRC, a_mem }); matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem }); + matmul_args.insert({ DNNL_ARG_DST, c_mem }); matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem }); matmul_prim.execute(stream, matmul_args); } - // matrices A and B are column major, both having k rows - // matrix A has m column, matrix B has n columns - // output: column major matrix C = A transposed * B static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k, const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) { - gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1); + gemm(ctx, m, n, k, a, at, 1, k, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1); } }; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 7f74fbfe5c1..cf46012be81 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1546,7 +1546,7 @@ static void mul_mat_p021_f16_f32( static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, - const int row_stride_x, const int channel_stride_x, const int channel_x_divisor, + const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor, const sycl::nd_item<3> &item_ct1) { const sycl::half *x = (const sycl::half *)vx; @@ -1557,7 +1557,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous item_ct1.get_local_id(0); const int channel_x = channel / channel_x_divisor; - const int nrows_y = ncols_x; const int nrows_dst = nrows_x; const int row_dst = row_x; @@ -1576,7 +1575,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const int row_y = col_x; const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; - const int iy = channel*nrows_y + row_y; + const int iy = channel * channel_stride_y + row_y; const float xi = sycl::vec(x[ix]) @@ -1823,7 +1822,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, static void ggml_mul_mat_vec_nc_f16_f32_sycl( const void *vx, const float *y, float *dst, const int ncols_x, const int nrows_x, const int row_stride_x, const int nchannels_x, - const int nchannels_y, const int channel_stride_x, queue_ptr stream) { + const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) { const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); const sycl::range<3> block_dims(1, 1, WARP_SIZE); @@ -1835,7 +1834,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, - row_stride_x, channel_stride_x, + row_stride_x, channel_stride_x, channel_stride_y, nchannels_y / nchannels_x, item_ct1); }); } @@ -2124,8 +2123,8 @@ inline void ggml_sycl_op_mul_mat_sycl( #if GGML_SYCL_DNNL if (!g_ggml_sycl_disable_dnn) { - DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr, - DnnlGemmWrapper::to_dt(), src0_ptr, DnnlGemmWrapper::to_dt(), + DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr, + DnnlGemmWrapper::to_dt(), src1_ptr, DnnlGemmWrapper::to_dt(), dst_dd_i, DnnlGemmWrapper::to_dt(), stream); } else @@ -2171,8 +2170,8 @@ inline void ggml_sycl_op_mul_mat_sycl( #if GGML_SYCL_DNNL if (!g_ggml_sycl_disable_dnn) { - DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i, - DnnlGemmWrapper::to_dt(), src0_ddf_i, DnnlGemmWrapper::to_dt(), + DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i, + DnnlGemmWrapper::to_dt(), src1_ddf1_i, DnnlGemmWrapper::to_dt(), dst_dd_i, DnnlGemmWrapper::to_dt(), stream); } else @@ -2776,6 +2775,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml const int64_t nb02 = src0->nb[2]; const int64_t ne12 = src1->ne[2]; + const int64_t nb11 = src1->nb[1]; SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); @@ -2786,8 +2786,9 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml const int64_t row_stride_x = nb01 / sizeof(sycl::half); const int64_t channel_stride_x = nb02 / sizeof(sycl::half); + const int64_t channel_stride_y = nb11 / sizeof(float); - ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); + ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,channel_stride_y, main_stream); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -2841,8 +2842,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons float * dst_ddf = static_cast(dst->data); const sycl::half * src1_f16 = static_cast(src1->data); + const size_t type_size_src0 = ggml_type_size(src0->type); const size_t type_size_src1 = ggml_type_size(src1->type); - GGML_ASSERT(nb10 == type_size_src1); // SRC1 strides int64_t s11 = nb11 / type_size_src1; @@ -2854,11 +2855,32 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons if (src1->type != GGML_TYPE_F16) { scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2, " : converting src1 to fp16"); - const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type); - GGML_ASSERT(to_fp16_nc_sycl != nullptr); - const int64_t ne_src1 = ggml_nelements(src1); + + // iterate tensor dims and find the slowest moving dim and stride + int64_t last_dim=0; + int64_t last_str=0; + int64_t largest_str=0; + for(int i = 0; i< 4; i++){ + // last stride is always the largest + if(src1->nb[i] == largest_str){ + if(src1->ne[last_dim] == 1){ + last_str = i; + last_dim = i; + } + } + if(src1->nb[i] > largest_str){ + largest_str = src1->nb[i]; + last_str = i; + last_dim = i; + } + + } + const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1; src1_f16_alloc.alloc(ne_src1); - to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue); + + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue); src1_f16 = src1_f16_alloc.get(); s11 = ne10; @@ -2892,38 +2914,89 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons #if GGML_SYCL_DNNL if (!g_ggml_sycl_disable_dnn) { - auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12] - (const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) { - - DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10, - src1, DnnlGemmWrapper::to_dt(), s11, 1, s12, - src0, DnnlGemmWrapper::to_dt(), 1, nb01/nb00, nb02/nb00, - dst, DnnlGemmWrapper::to_dt(), queue, batches_a, batches_b); - }; - - if (r2 == 1 && r3 == 1) { - if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { - dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03); - } - else { - for (int64_t ie03 = 0; ie03 < ne03; ++ie03) { - const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes - const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13; - float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float)); - dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02); + int64_t str_a0 = nb00 / type_size_src0; + int64_t str_a1 = nb01 / type_size_src0; + int64_t str_a2 = nb02 / type_size_src0; + + int64_t str_b0 = nb10 / type_size_src1; + int64_t str_b1 = nb11 / type_size_src1; + int64_t str_b2 = nb12 / type_size_src1; + + auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0, + const sycl::half *src1, float *dst, + int64_t a0, int64_t a1, int64_t batcha, + int64_t b0, int64_t b1, int64_t batchb, + int64_t sa0, int64_t sa1, int64_t sa2, + int64_t sb0, int64_t sb1, int64_t sb2, + int64_t sd2) { + bool supported_broadcast = batchb == batcha ? true + : batchb == 1 || batcha == 1 ? true + : false; + if (supported_broadcast) { + DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0, + DnnlGemmWrapper::to_dt(), sa0, sa1, sa2, src1, + DnnlGemmWrapper::to_dt(), sb0, sb1, sb2, dst, + DnnlGemmWrapper::to_dt(), queue, batcha, batchb); + } else { + // iterate over batches from smaller set of matrices (matrix 0) + int64_t batches0 = batcha; + int64_t batches1 = batchb; + + if (batches0 > batches1) { + int64_t num_mul_mats = batches1; + int64_t sub_batch = batches0 / num_mul_mats; + // src0 is batched and bigger, shift and multiply with src1 + for (int64_t i0 = 0; i0 < num_mul_mats; i0++) { + const sycl::half *src0_shifted = src0 + (sa2 * i0 * sub_batch); + const sycl::half *src1_shifted = src1 + (sb2 * i0); + float *dst_shifted = dst + (sd2 * i0 * sub_batch); + DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted, + DnnlGemmWrapper::to_dt(), sa0, sa1, sa2, + src1_shifted, DnnlGemmWrapper::to_dt(), sb0, + sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt(), + queue, sub_batch, 1); + } + } else { + int64_t num_mul_mats = batches0; + int64_t sub_batch = batches1 / num_mul_mats; + // src1 is batched and bigger, shift and multiply with src0 + for (int64_t i1 = 0; i1 < num_mul_mats; i1++) { + const sycl::half *src0_shifted = src0 + (sa2 * i1); + const sycl::half *src1_shifted = src1 + (sb2 * i1 * sub_batch); + float *dst_shifted = dst + (sd2 * i1 * sub_batch); + DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted, + DnnlGemmWrapper::to_dt(), sa0, sa1, sa2, + src1_shifted, DnnlGemmWrapper::to_dt(), sb0, + sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt(), + queue, 1, sub_batch); + } + } } - } - } else { - // iterate over batches from smaller set of matrices (matrix 0) - for (int64_t ie02 = 0; ie02 < ne02; ++ie02) { - for (int64_t ie03 = 0; ie03 < ne03; ++ie03) { - const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half)); - const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3; - float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float)); - dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1); + }; + + bool cont_batches_a = nb02 * ne02 == nb03; + bool cont_batches_b = nb12 * ne12 == nb13; + if (cont_batches_a && cont_batches_b) { + int64_t batches0 = ne02 * ne03; + int64_t batches1 = ne12 * ne13; + launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0, + ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1, + str_b2, nb2 / sizeof(float)); + } else { + for (int64_t b_a = 0; b_a < ne03; b_a++) { + const sycl::half *src0_f16_shifted + = src0_f16 + (nb03 * b_a / type_size_src0); + const sycl::half *src1_f16_shifted + = src1_f16 + (nb13 * b_a / type_size_src1); + float *dst_shifted = dst_ddf + (nb3 * b_a / sizeof(float)); + int64_t batches0 = ne02; + int64_t batches1 = ne12; + launch_gemm_for_batches(src0_f16_shifted, src1_f16_shifted, dst_shifted, + ne00, ne01, batches0, ne10, ne11, batches1, str_a0, str_a1, + str_a2, str_b0, str_b1, str_b2, nb2 / sizeof(float)); } } - } + } else #endif @@ -3263,10 +3336,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor // The kernel from the if path is faster for that specific case, but does not support all mul mats. ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); } - } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { From ebb0e9d0ed7eccd5b320d56d60ef50cf3c87b105 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 14 Jul 2025 15:07:55 +0530 Subject: [PATCH 0107/1348] SYCL: use 1D kernel for set_rows (llama/14618) * SYCL: Use 1D kernel for set_rows * Remove dangling comment * Refactor and use ceil_div --- ggml/src/ggml-sycl/set_rows.cpp | 86 ++++++++++++++++----------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 4a76a63d354..3091fab3995 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -6,46 +6,49 @@ static constexpr bool is_arithmetic_v() { return std::is_arithmetic_v || std::is_same_v || std::is_same_v; } } + template static inline std::enable_if_t() && utils::is_arithmetic_v(), void> convert (const char* src, char* dst) { auto src_val = *reinterpret_cast(src); auto dst_val = sycl::vec(src_val).template convert()[0]; - *reinterpret_cast(dst) = dst_val;; + *reinterpret_cast(dst) = dst_val; } template static void k_set_rows( const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst, - const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12, + const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03, const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb1, const size_t nb2, const size_t nb3, const size_t src_type_size, const size_t dst_type_size, - const sycl::nd_item<3> & item_ct1) { - - const int i03 = item_ct1.get_group(0); - const int i02 = item_ct1.get_group(1); - const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index + const int64_t total_elements, + const sycl::nd_item<1> & item_ct1) { - if (i01 >= ne01) { + const int64_t i = item_ct1.get_global_linear_id(); + if (i >= total_elements) { return; } - const int i12 = i03 % ne12; - const int i11 = i02 % ne11; - const int i10 = i01; + const int64_t i03 = i / (ne00 * ne01 * ne02); + const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00; + const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00; + + const int64_t i12 = i03 % ne12; + const int64_t i11 = i02 % ne11; + const int64_t i10 = i01; const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12})); const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03}); - char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3; + const char * src_elem = src0_row + i00 * src_type_size; + char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3; + char * dst_elem = dst_row_ptr + i00 * dst_type_size; - for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) { - const char * src_elem = src0_row + col * src_type_size; - char * dst_elem = dst_row_ptr + col * dst_type_size; - convert(src_elem, dst_elem); - } + convert(src_elem, dst_elem); } template @@ -58,32 +61,29 @@ static void set_rows_sycl( const size_t src_type_size, const size_t dst_type_size, queue_ptr stream) { - constexpr int max_threads_per_row = 64; // KEEPING 64 for now - const int threads_per_row = std::min((int)ne00, max_threads_per_row); - - constexpr int max_threads_per_block = 64; - const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row); - - const sycl::range<3> block_size(1, rows_per_block, threads_per_row); - const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block); - - sycl_parallel_for( - stream, - sycl::nd_range<3>(grid_size * block_size, block_size), - [=](sycl::nd_item<3> item_ct1) { - k_set_rows( - src0_d, src1_d, dst_d, - ne00, ne01, ne11, ne12, - nb01, nb02, nb03, - nb10, nb11, nb12, - nb1, nb2, nb3, - src_type_size, dst_type_size, - item_ct1 - ); - } - ); -} + const int64_t total_elements = ne00 * ne01 * ne02 * ne03; + constexpr int block_size = 64; + const int64_t grid_size = ceil_div(total_elements, block_size); + + sycl_parallel_for( + stream, + sycl::nd_range<1>(grid_size * block_size, block_size), + [=](sycl::nd_item<1> item_ct1) { + k_set_rows( + src0_d, src1_d, dst_d, + ne00, ne01, ne02, + ne11, ne12, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + src_type_size, dst_type_size, + total_elements, + item_ct1 + ); + } + ); +} void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); @@ -122,7 +122,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { nb1, nb2, nb3, sizeof(float), sizeof(sycl::half), stream - ); + ); break; default: GGML_ABORT("Unsupported tensor type!"); From ded2e3cf6d29838b83cf74f3ae4d407f895cf12d Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Mon, 14 Jul 2025 18:46:42 +0530 Subject: [PATCH 0108/1348] ggml : refactor llamafile_sgemm PPC code (llama/14673) Remove un-necessary templates from class definition and packing functions Reduce deeply nested conditionals, if-else switching in mnapck function Replace repetitive code with inline functions in Packing functions 2 ~ 7% improvement in Q8 Model 15 ~ 50% improvement in Q4 Model Signed-off-by: Shalini Salomi Bodapati --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 1437 ++++++------------------- 1 file changed, 343 insertions(+), 1094 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index ed61869a550..2be54c31b5f 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -1541,7 +1541,7 @@ class tinyBLAS_BF16_PPC { } else if constexpr(RM == 8 && RN == 4) { KERNEL_8x4(ii,jj); } else { - static_assert(false, "RN/RM values not supported"); + assert(false && "RN/RM values not supported"); } } @@ -1573,13 +1573,13 @@ class tinyBLAS_BF16_PPC { const int nth; }; -template +template class tinyBLAS_Q0_PPC { public: tinyBLAS_Q0_PPC(int64_t k, const TA *A, int64_t lda, - const TB *B, int64_t ldb, - TC *C, int64_t ldc, + const block_q8_0 *B, int64_t ldb, + float *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } @@ -1590,8 +1590,7 @@ class tinyBLAS_Q0_PPC { private: - template - inline void save_res(int ii, int jj, int idx, vector float* fin_res) { + inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) { for (int I = 0; I < RM; I++) { for (int J = 0; J < RN; J++) { *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J); @@ -1611,29 +1610,67 @@ class tinyBLAS_Q0_PPC { fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]); } } - - template - void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, VA* vec, std::array& comparray) { - int64_t i, j; - TA *aoffset = NULL; - VA *vecOffset = NULL; - TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; - TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; - VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0}; - VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0}; - VB t1, t2, t3, t4, t5, t6, t7, t8; + /* This function processes quantized data from block_q4_0 elements. + * First the we try to extract the two int4 values stored in single int8_t into two signed int8. + * And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8. + * Also compute the rowsum which is required to compensate the above conversion. */ + inline void process_q4_elements(vector signed char (&c)[2], int* ca) { const vector signed char lowMask = vec_splats((signed char)0xF); const vector unsigned char v4 = vec_splats((unsigned char)0x4); const vector signed char v8 = vec_splats((signed char)0x8); - aoffset = const_cast(a); - vecOffset = vec; + vector signed int vsum = {0}; + vector signed int vsum2 = {0}; + c[0] = vec_and(c[1], lowMask); + c[1] = vec_sr(c[1], v4); + c[0] = vec_sub(c[0], v8); + c[1] = vec_sub(c[1], v8); + vsum = vec_sum4s(c[0], vsum); + vsum2 = vec_sum4s(c[1], vsum2); + vsum = vec_add(vsum, vsum2); + *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3]; + } + + template + inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) { vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}; vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - vector signed int vsum = {0}; - vector signed int vsum2 = {0}; + V2 t1, t2, t3, t4, t5, t6, t7, t8; + vector unsigned char xor_vector; + uint8_t flip_vec = 0x80; + xor_vector = vec_splats(flip_vec); + t1 = vec_perm(s1, s2, swiz1); + t2 = vec_perm(s1, s2, swiz2); + t3 = vec_perm(s3, s4, swiz1); + t4 = vec_perm(s3, s4, swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + } + template + void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array& comparray) { + int64_t i, j; + TA *aoffset = NULL; + int8_t *vecOffset = NULL; + TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0}; + vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0}; + aoffset = const_cast(a); + vecOffset = vec; j = (rows >> 3); if (j > 0) { do { @@ -1646,159 +1683,30 @@ class tinyBLAS_Q0_PPC { aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; - i = (cols >> 2); if (i > 0) { do { - c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); - c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); - c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); - c4[1] = reinterpret_cast(vec_xl(0, aoffset4->qs)); - c5[1] = reinterpret_cast(vec_xl(0, aoffset5->qs)); - c6[1] = reinterpret_cast(vec_xl(0, aoffset6->qs)); - c7[1] = reinterpret_cast(vec_xl(0, aoffset7->qs)); - c8[1] = reinterpret_cast(vec_xl(0, aoffset8->qs)); - - c1[0] = vec_and(c1[1], lowMask); - c1[1] = vec_sr(c1[1], v4); - c1[0] = vec_sub(c1[0], v8); - c1[1] = vec_sub(c1[1], v8); - vsum = vec_sum4s(c1[0], vsum); - vsum2 = vec_sum4s(c1[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c2[0] = vec_and(c2[1], lowMask); - c2[1] = vec_sr(c2[1], v4); - c2[0] = vec_sub(c2[0], v8); - c2[1] = vec_sub(c2[1], v8); - vsum = vec_sum4s(c2[0], vsum); - vsum2 = vec_sum4s(c2[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c3[0] = vec_and(c3[1], lowMask); - c3[1] = vec_sr(c3[1], v4); - c3[0] = vec_sub(c3[0], v8); - c3[1] = vec_sub(c3[1], v8); - vsum = vec_sum4s(c3[0], vsum); - vsum2 = vec_sum4s(c3[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c4[0] = vec_and(c4[1], lowMask); - c4[1] = vec_sr(c4[1], v4); - c4[0] = vec_sub(c4[0], v8); - c4[1] = vec_sub(c4[1], v8); - vsum = vec_sum4s(c4[0], vsum); - vsum2 = vec_sum4s(c4[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c5[0] = vec_and(c5[1], lowMask); - c5[1] = vec_sr(c5[1], v4); - c5[0] = vec_sub(c5[0], v8); - c5[1] = vec_sub(c5[1], v8); - vsum = vec_sum4s(c5[0], vsum); - vsum2 = vec_sum4s(c5[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[4] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c6[0] = vec_and(c6[1], lowMask); - c6[1] = vec_sr(c6[1], v4); - c6[0] = vec_sub(c6[0], v8); - c6[1] = vec_sub(c6[1], v8); - vsum = vec_sum4s(c6[0], vsum); - vsum2 = vec_sum4s(c6[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[5] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c7[0] = vec_and(c7[1], lowMask); - c7[1] = vec_sr(c7[1], v4); - c7[0] = vec_sub(c7[0], v8); - c7[1] = vec_sub(c7[1], v8); - vsum = vec_sum4s(c7[0], vsum); - vsum2 = vec_sum4s(c7[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[6] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c8[0] = vec_and(c8[1], lowMask); - c8[1] = vec_sr(c8[1], v4); - c8[0] = vec_sub(c8[0], v8); - c8[1] = vec_sub(c8[1], v8); - vsum = vec_sum4s(c8[0], vsum); - vsum2 = vec_sum4s(c8[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[7] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - t1 = vec_perm(c1[0], c2[0], swiz1); - t2 = vec_perm(c1[0], c2[0], swiz2); - t3 = vec_perm(c3[0], c4[0], swiz1); - t4 = vec_perm(c3[0], c4[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset); - vec_xst(t6, 0, vecOffset+16); - vec_xst(t7, 0, vecOffset+32); - vec_xst(t8, 0, vecOffset+48); - - t1 = vec_perm(c1[1], c2[1], swiz1); - t2 = vec_perm(c1[1], c2[1], swiz2); - t3 = vec_perm(c3[1], c4[1], swiz1); - t4 = vec_perm(c3[1], c4[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset+64); - vec_xst(t6, 0, vecOffset+80); - vec_xst(t7, 0, vecOffset+96); - vec_xst(t8, 0, vecOffset+112); - - t1 = vec_perm(c5[0], c6[0], swiz1); - t2 = vec_perm(c5[0], c6[0], swiz2); - t3 = vec_perm(c7[0], c8[0], swiz1); - t4 = vec_perm(c7[0], c8[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset+128); - vec_xst(t6, 0, vecOffset+144); - vec_xst(t7, 0, vecOffset+160); - vec_xst(t8, 0, vecOffset+176); - - t1 = vec_perm(c5[1], c6[1], swiz1); - t2 = vec_perm(c5[1], c6[1], swiz2); - t3 = vec_perm(c7[1], c8[1], swiz1); - t4 = vec_perm(c7[1], c8[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset+192); - vec_xst(t6, 0, vecOffset+208); - vec_xst(t7, 0, vecOffset+224); - vec_xst(t8, 0, vecOffset+240); - + c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); + c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); + c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); + c4[1] = reinterpret_cast(vec_xl(0, aoffset4->qs)); + c5[1] = reinterpret_cast(vec_xl(0, aoffset5->qs)); + c6[1] = reinterpret_cast(vec_xl(0, aoffset6->qs)); + c7[1] = reinterpret_cast(vec_xl(0, aoffset7->qs)); + c8[1] = reinterpret_cast(vec_xl(0, aoffset8->qs)); + + process_q4_elements(c1, &comparray[0]); + process_q4_elements(c2, &comparray[1]); + process_q4_elements(c3, &comparray[2]); + process_q4_elements(c4, &comparray[3]); + process_q4_elements(c5, &comparray[4]); + process_q4_elements(c6, &comparray[5]); + process_q4_elements(c7, &comparray[6]); + process_q4_elements(c8, &comparray[7]); + vector_permute_store(c1[0], c2[0], c3[0], c4[0], vecOffset, false); + vector_permute_store(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false); + vector_permute_store(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false); + vector_permute_store(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false); aoffset1 += lda; aoffset2 += lda; aoffset3 += lda; @@ -1821,85 +1729,20 @@ class tinyBLAS_Q0_PPC { aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - i = (cols >> 2); if (i > 0) { do { - c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); - c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); - c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); - c4[1] = reinterpret_cast(vec_xl(0, aoffset4->qs)); - - c1[0] = vec_and(c1[1], lowMask); - c1[1] = vec_sr(c1[1], v4); - c1[0] = vec_sub(c1[0], v8); - c1[1] = vec_sub(c1[1], v8); - vsum = vec_sum4s(c1[0], vsum); - vsum2 = vec_sum4s(c1[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c2[0] = vec_and(c2[1], lowMask); - c2[1] = vec_sr(c2[1], v4); - c2[0] = vec_sub(c2[0], v8); - c2[1] = vec_sub(c2[1], v8); - vsum = vec_sum4s(c2[0], vsum); - vsum2 = vec_sum4s(c2[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c3[0] = vec_and(c3[1], lowMask); - c3[1] = vec_sr(c3[1], v4); - c3[0] = vec_sub(c3[0], v8); - c3[1] = vec_sub(c3[1], v8); - vsum = vec_sum4s(c3[0], vsum); - vsum2 = vec_sum4s(c3[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c4[0] = vec_and(c4[1], lowMask); - c4[1] = vec_sr(c4[1], v4); - c4[0] = vec_sub(c4[0], v8); - c4[1] = vec_sub(c4[1], v8); - vsum = vec_sum4s(c4[0], vsum); - vsum2 = vec_sum4s(c4[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats( 0); - - t1 = vec_perm(c1[0], c2[0], swiz1); - t2 = vec_perm(c1[0], c2[0], swiz2); - t3 = vec_perm(c3[0], c4[0], swiz1); - t4 = vec_perm(c3[0], c4[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset); - vec_xst(t6, 0, vecOffset+16); - vec_xst(t7, 0, vecOffset+32); - vec_xst(t8, 0, vecOffset+48); - - t1 = vec_perm(c1[1], c2[1], swiz1); - t2 = vec_perm(c1[1], c2[1], swiz2); - t3 = vec_perm(c3[1], c4[1], swiz1); - t4 = vec_perm(c3[1], c4[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset+64); - vec_xst(t6, 0, vecOffset+80); - vec_xst(t7, 0, vecOffset+96); - vec_xst(t8, 0, vecOffset+112); - + c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); + c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); + c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); + c4[1] = reinterpret_cast(vec_xl(0, aoffset4->qs)); + + process_q4_elements(c1, &comparray[0]); + process_q4_elements(c2, &comparray[1]); + process_q4_elements(c3, &comparray[2]); + process_q4_elements(c4, &comparray[3]); + vector_permute_store(c1[0], c2[0], c3[0], c4[0], vecOffset, false); + vector_permute_store(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false); aoffset1 += lda; aoffset2 += lda; aoffset3 += lda; @@ -1918,80 +1761,17 @@ class tinyBLAS_Q0_PPC { if (i > 0) { do { switch(rows) { - case 3: c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); - case 2: c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); - case 1: c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); + case 3: c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); + case 2: c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); + case 1: c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); break; } - c1[0] = vec_and(c1[1], lowMask); - c1[1] = vec_sr(c1[1], v4); - c1[0] = vec_sub(c1[0], v8); - c1[1] = vec_sub(c1[1], v8); - vsum = vec_sum4s(c1[0], vsum); - vsum2 = vec_sum4s(c1[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c2[0] = vec_and(c2[1], lowMask); - c2[1] = vec_sr(c2[1], v4); - c2[0] = vec_sub(c2[0], v8); - c2[1] = vec_sub(c2[1], v8); - vsum = vec_sum4s(c2[0], vsum); - vsum2 = vec_sum4s(c2[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c3[0] = vec_and(c3[1], lowMask); - c3[1] = vec_sr(c3[1], v4); - c3[0] = vec_sub(c3[0], v8); - c3[1] = vec_sub(c3[1], v8); - vsum = vec_sum4s(c3[0], vsum); - vsum2 = vec_sum4s(c3[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - c4[0] = vec_and(c4[1], lowMask); - c4[1] = vec_sr(c4[1], v4); - c4[0] = vec_sub(c4[0], v8); - c4[1] = vec_sub(c4[1], v8); - vsum = vec_sum4s(c4[0], vsum); - vsum2 = vec_sum4s(c4[1], vsum2); - vsum = vec_add(vsum, vsum2); - comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3]; - vsum = vec_splats(0); - vsum2 = vec_splats(0); - - t1 = vec_perm(c1[0], c2[0], swiz1); - t2 = vec_perm(c1[0], c2[0], swiz2); - t3 = vec_perm(c3[0], c4[0], swiz1); - t4 = vec_perm(c3[0], c4[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset); - vec_xst(t6, 0, vecOffset+16); - vec_xst(t7, 0, vecOffset+32); - vec_xst(t8, 0, vecOffset+48); - - t1 = vec_perm(c1[1], c2[1], swiz1); - t2 = vec_perm(c1[1], c2[1], swiz2); - t3 = vec_perm(c3[1], c4[1], swiz1); - t4 = vec_perm(c3[1], c4[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - vec_xst(t5, 0, vecOffset+64); - vec_xst(t6, 0, vecOffset+80); - vec_xst(t7, 0, vecOffset+96); - vec_xst(t8, 0, vecOffset+112); + process_q4_elements(c1, &comparray[0]); + process_q4_elements(c2, &comparray[1]); + process_q4_elements(c3, &comparray[2]); + process_q4_elements(c4, &comparray[3]); + vector_permute_store(c1[0], c2[0], c3[0], c4[0], vecOffset, false); + vector_permute_store(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false); aoffset1 += lda; aoffset2 += lda; aoffset3 += lda; @@ -2001,146 +1781,40 @@ class tinyBLAS_Q0_PPC { } } } - template - void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) { + void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) { int64_t i, j; - TB *aoffset = NULL; + block_q8_0 *aoffset = NULL; VA *vecOffset = NULL; - TB *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; - TB *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; - __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; - VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0}; - VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0}; - VB t1, t2, t3, t4, t5, t6, t7, t8; - vector unsigned char xor_vector; - uint8_t flip_vec = 0x80; - xor_vector = vec_splats(flip_vec); - vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; - vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; - vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}; - vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - - aoffset = const_cast(a); + block_q8_0* aoffsets[8]; + __vector_pair arr[8]; + VB c[8][2] = {0}; + VB c1[8] = {0}; VB c2[8] = {0}; + aoffset = const_cast(a); vecOffset = vec; j = (rows >> 3); if (j > 0) { do { - aoffset1 = aoffset; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; - aoffset4 = aoffset3 + lda; - aoffset5 = aoffset4 + lda; - aoffset6 = aoffset5 + lda; - aoffset7 = aoffset6 + lda; - aoffset8 = aoffset7 + lda; + aoffsets[0] = aoffset; + for (int it = 1; it < 8; it++) + aoffsets[it] = aoffsets[it-1] + lda; aoffset += 8 * lda; i = (cols >> 3); if (i > 0) { do { - C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); - C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); - C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); - C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs); - C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs); - C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs); - C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs); - C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs); - - __builtin_vsx_disassemble_pair(c1, &C1); - __builtin_vsx_disassemble_pair(c2, &C2); - __builtin_vsx_disassemble_pair(c3, &C3); - __builtin_vsx_disassemble_pair(c4, &C4); - __builtin_vsx_disassemble_pair(c5, &C5); - __builtin_vsx_disassemble_pair(c6, &C6); - __builtin_vsx_disassemble_pair(c7, &C7); - __builtin_vsx_disassemble_pair(c8, &C8); - - t1 = vec_perm(c1[0], c2[0], swiz1); - t2 = vec_perm(c1[0], c2[0], swiz2); - t3 = vec_perm(c3[0], c4[0], swiz1); - t4 = vec_perm(c3[0], c4[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); - } - vec_xst(t5, 0, vecOffset); - vec_xst(t6, 0, vecOffset+16); - vec_xst(t7, 0, vecOffset+32); - vec_xst(t8, 0, vecOffset+48); - - t1 = vec_perm(c1[1], c2[1], swiz1); - t2 = vec_perm(c1[1], c2[1], swiz2); - t3 = vec_perm(c3[1], c4[1], swiz1); - t4 = vec_perm(c3[1], c4[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); - } - vec_xst(t5, 0, vecOffset+64); - vec_xst(t6, 0, vecOffset+80); - vec_xst(t7, 0, vecOffset+96); - vec_xst(t8, 0, vecOffset+112); - - t1 = vec_perm(c5[0], c6[0], swiz1); - t2 = vec_perm(c5[0], c6[0], swiz2); - t3 = vec_perm(c7[0], c8[0], swiz1); - t4 = vec_perm(c7[0], c8[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); - } - vec_xst(t5, 0, vecOffset+128); - vec_xst(t6, 0, vecOffset+144); - vec_xst(t7, 0, vecOffset+160); - vec_xst(t8, 0, vecOffset+176); - - t1 = vec_perm(c5[1], c6[1], swiz1); - t2 = vec_perm(c5[1], c6[1], swiz2); - t3 = vec_perm(c7[1], c8[1], swiz1); - t4 = vec_perm(c7[1], c8[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); + for (int it = 0; it < 8; it++) { + arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]->qs); + __builtin_vsx_disassemble_pair(c[it], &arr[it]); + c1[it] = c[it][0]; + c2[it] = c[it][1]; } - vec_xst(t5, 0, vecOffset+192); - vec_xst(t6, 0, vecOffset+208); - vec_xst(t7, 0, vecOffset+224); - vec_xst(t8, 0, vecOffset+240); - - aoffset1 += lda; - aoffset2 += lda; - aoffset3 += lda; - aoffset4 += lda; - aoffset5 += lda; - aoffset6 += lda; - aoffset7 += lda; - aoffset8 += lda; + vector_permute_store(c1[0], c1[1], c1[2], c1[3], vecOffset, flip); + vector_permute_store(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip); + vector_permute_store(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip); + vector_permute_store(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip); + for (int it = 0; it < 8; it++) + aoffsets[it] += lda; vecOffset += 256; i--; } while(i > 0); @@ -2150,129 +1824,53 @@ class tinyBLAS_Q0_PPC { } if (rows & 4) { - aoffset1 = aoffset; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; - aoffset4 = aoffset3 + lda; - aoffset += 4 * lda; - + aoffsets[0] = aoffset; + for (int it = 1; it < 4; it++ ) + aoffsets[it] = aoffsets[it-1] + lda; + aoffset += 4 * lda; i = (cols >> 3); if (i > 0) { do { - C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); - C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); - C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); - C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs); - - __builtin_vsx_disassemble_pair(c1, &C1); - __builtin_vsx_disassemble_pair(c2, &C2); - __builtin_vsx_disassemble_pair(c3, &C3); - __builtin_vsx_disassemble_pair(c4, &C4); - - t1 = vec_perm(c1[0], c2[0], swiz1); - t2 = vec_perm(c1[0], c2[0], swiz2); - t3 = vec_perm(c3[0], c4[0], swiz1); - t4 = vec_perm(c3[0], c4[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); + for (int it = 0; it < 4; it++) { + arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]->qs); + __builtin_vsx_disassemble_pair(c[it], &arr[it]); + c1[it] = c[it][0]; + c2[it] = c[it][1]; } - vec_xst(t5, 0, vecOffset); - vec_xst(t6, 0, vecOffset+16); - vec_xst(t7, 0, vecOffset+32); - vec_xst(t8, 0, vecOffset+48); - - t1 = vec_perm(c1[1], c2[1], swiz1); - t2 = vec_perm(c1[1], c2[1], swiz2); - t3 = vec_perm(c3[1], c4[1], swiz1); - t4 = vec_perm(c3[1], c4[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); + vector_permute_store(c1[0], c1[1], c1[2], c1[3], vecOffset, flip); + vector_permute_store(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip); + for (int it = 0; it < 4; it++) { + aoffsets[it] += lda; } - vec_xst(t5, 0, vecOffset+64); - vec_xst(t6, 0, vecOffset+80); - vec_xst(t7, 0, vecOffset+96); - vec_xst(t8, 0, vecOffset+112); - - aoffset1 += lda; - aoffset2 += lda; - aoffset3 += lda; - aoffset4 += lda; vecOffset += 128; i--; } while(i > 0); } } + if (rows & 3) { - aoffset1 = aoffset; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; + aoffsets[0] = aoffset; + for (int it = 1; it < 3; it++ ) + aoffsets[it] = aoffsets[it-1] + lda; i = (cols >> 3); if (i > 0) { do { switch(rows) { - case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); - __builtin_vsx_disassemble_pair(c3, &C3); - case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); - __builtin_vsx_disassemble_pair(c2, &C2); - case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); - __builtin_vsx_disassemble_pair(c1, &C1); + case 3: arr[2] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[2]->qs); + __builtin_vsx_disassemble_pair(c[2], &arr[2]); + c1[2] = c[2][0]; c2[2] = c[2][1]; + case 2: arr[1] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[1]->qs); + __builtin_vsx_disassemble_pair(c[1], &arr[1]); + c1[1] = c[1][0]; c2[1] = c[1][1]; + case 1: arr[0] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[0]->qs); + __builtin_vsx_disassemble_pair(c[0], &arr[0]); + c1[0] = c[0][0]; c2[0] = c[0][1]; break; } - t1 = vec_perm(c1[0], c2[0], swiz1); - t2 = vec_perm(c1[0], c2[0], swiz2); - t3 = vec_perm(c3[0], c4[0], swiz1); - t4 = vec_perm(c3[0], c4[0], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); - } - vec_xst(t5, 0, vecOffset); - vec_xst(t6, 0, vecOffset+16); - vec_xst(t7, 0, vecOffset+32); - vec_xst(t8, 0, vecOffset+48); - - t1 = vec_perm(c1[1], c2[1], swiz1); - t2 = vec_perm(c1[1], c2[1], swiz2); - t3 = vec_perm(c3[1], c4[1], swiz1); - t4 = vec_perm(c3[1], c4[1], swiz2); - t5 = vec_perm(t1, t3, swiz3); - t6 = vec_perm(t1, t3, swiz4); - t7 = vec_perm(t2, t4, swiz3); - t8 = vec_perm(t2, t4, swiz4); - if (flip == true) { - t5 = vec_xor(t5, xor_vector); - t6 = vec_xor(t6, xor_vector); - t7 = vec_xor(t7, xor_vector); - t8 = vec_xor(t8, xor_vector); - } - vec_xst(t5, 0, vecOffset+64); - vec_xst(t6, 0, vecOffset+80); - vec_xst(t7, 0, vecOffset+96); - vec_xst(t8, 0, vecOffset+112); - - aoffset1 += lda; - aoffset2 += lda; - aoffset3 += lda; + vector_permute_store(c1[0], c1[1], c1[2], c1[3], vecOffset, flip); + vector_permute_store(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip); + for (int it = 0; it < 3; it++) + aoffsets[it] += lda; vecOffset += 128; i--; } while(i > 0); @@ -2281,159 +1879,42 @@ class tinyBLAS_Q0_PPC { } void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { - int64_t mc, nc, mp, np; - int m_rem = MIN(m - m0, 8); - int n_rem = MIN(n - n0, 8); - // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance - // issues. After resolving them, below code will be enabled. - /*if (m_rem >= 16 && n_rem >= 8) { - mc = 16; - nc = 8; - gemm<16,8>(m0, m, n0, n); - } else if(m_rem >= 8 && n_rem >= 16) { - mc = 8; - nc = 16; - gemm<8,16>(m0, m, n0, n); - }*/ + int m_rem = MIN(m - m0, 16); + int n_rem = MIN(n - n0, 16); + + int mc = 0, nc = 0; + if (m_rem >= 8 && n_rem >= 8) { - mc = 8; - nc = 8; - gemm<8,8>(m0, m, n0, n); + mc = 8; + nc = 8; + gemm<8, 8>(m0, m, n0, n); } else if (m_rem >= 4 && n_rem >= 8) { mc = 4; nc = 8; - gemm<4,8>(m0, m, n0, n); + gemm<4, 8>(m0, m, n0, n); } else if (m_rem >= 8 && n_rem >= 4) { mc = 8; nc = 4; - gemm<8,4>(m0, m, n0, n); + gemm<8, 4>(m0, m, n0, n); } else if (m_rem >= 4 && n_rem >= 4) { mc = 4; nc = 4; - gemm_small<4, 4>(m0, m, n0, n); - } else if ((m_rem < 4) && (n_rem > 4)) { - nc = 4; - switch(m_rem) { - case 1: - mc = 1; - gemm_small<1, 4>(m0, m, n0, n); - break; - case 2: - mc = 2; - gemm_small<2, 4>(m0, m, n0, n); - break; - case 3: - mc = 3; - gemm_small<3, 4>(m0, m, n0, n); - break; - default: - return; - } - } else if ((m_rem > 4) && (n_rem < 4)) { - mc = 4; - switch(n_rem) { - case 1: - nc = 1; - gemm_small<4, 1>(m0, m, n0, n); - break; - case 2: - nc = 2; - gemm_small<4, 2>(m0, m, n0, n); - break; - case 3: - nc = 3; - gemm_small<4, 3>(m0, m, n0, n); - break; - default: - return; - } + gemm_small(m0, m, n0, n, mc, nc); } else { - switch((m_rem << 4) | n_rem) { - case 0x43: - mc = 4; - nc = 3; - gemm_small<4, 3>(m0, m, n0, n); - break; - case 0x42: - mc = 4; - nc = 2; - gemm_small<4, 2>(m0, m, n0, n); - break; - case 0x41: - mc = 4; - nc = 1; - gemm_small<4, 1>(m0, m, n0, n); - break; - case 0x34: - mc = 3; - nc = 4; - gemm_small<3, 4>(m0, m, n0, n); - break; - case 0x33: - mc = 3; - nc = 3; - gemm_small<3, 3>(m0, m, n0, n); - break; - case 0x32: - mc = 3; - nc = 2; - gemm_small<3, 2>(m0, m, n0, n); - break; - case 0x31: - mc = 3; - nc = 1; - gemm_small<3, 1>(m0, m, n0, n); - break; - case 0x24: - mc = 2; - nc = 4; - gemm_small<2, 4>(m0, m, n0, n); - break; - case 0x23: - mc = 2; - nc = 3; - gemm_small<2, 3>(m0, m, n0, n); - break; - case 0x22: - mc = 2; - nc = 2; - gemm_small<2, 2>(m0, m, n0, n); - break; - case 0x21: - mc = 2; - nc = 1; - gemm_small<2, 1>(m0, m, n0, n); - break; - case 0x14: - mc = 1; - nc = 4; - gemm_small<1, 4>(m0, m, n0, n); - break; - case 0x13: - mc = 1; - nc = 3; - gemm_small<1, 3>(m0, m, n0, n); - break; - case 0x12: - mc = 1; - nc = 2; - gemm_small<1, 2>(m0, m, n0, n); - break; - case 0x11: - mc = 1; - nc = 1; - gemm_small<1, 1>(m0, m, n0, n); - break; - default: - return; - } + mc = (m_rem >= 4) ? 4 : m_rem; + nc = (n_rem >= 4) ? 4 : n_rem; + if (mc == 0 || nc == 0) + return; + gemm_small(m0, m, n0, n, mc, nc); } - mp = m0 + (m - m0) / mc * mc; - np = n0 + (n - n0) / nc * nc; + + int64_t mp = m0 + ((m - m0) / mc) * mc; + int64_t np = n0 + ((n - n0) / nc) * nc; mnpack(mp, m, n0, np); mnpack(m0, m, np, n); } + void KERNEL_4x8(int64_t ii, int64_t jj) { vec_t vec_A[8], vec_B[16] = {0}; acc_t acc_0, acc_1; @@ -2445,9 +1926,9 @@ class tinyBLAS_Q0_PPC { __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); if (std::is_same_v) { - packNormalInt4((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray); + packNormalInt4<4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray); } else { - packNormal((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false); + packNormal((const block_q8_0*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false); } packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true); for(int x = 0; x < 8; x++) { @@ -2475,8 +1956,8 @@ class tinyBLAS_Q0_PPC { compute<4>(&acc_0, 0, 0, comparray, vs, fin_res); compute<4>(&acc_1, 0, 4, comparray, vs, fin_res); } - save_res<4, 4>(ii, jj, 0, fin_res); - save_res<4, 4>(ii, jj+4, 4, fin_res); + save_res(ii, jj, 0, fin_res); + save_res(ii, jj+4, 4, fin_res); } void KERNEL_8x4(int64_t ii, int64_t jj) { @@ -2490,9 +1971,9 @@ class tinyBLAS_Q0_PPC { __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); if (std::is_same_v) { - packNormalInt4((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray); + packNormalInt4<8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray); } else { - packNormal((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); + packNormal((const block_q8_0*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); } packNormal((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true); for(int x = 0; x < 8; x++) { @@ -2519,8 +2000,8 @@ class tinyBLAS_Q0_PPC { compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); } - save_res<4, 4>(ii, jj, 0, fin_res); - save_res<4, 4>(ii+4, jj, 4, fin_res); + save_res(ii, jj, 0, fin_res); + save_res(ii+4, jj, 4, fin_res); } void KERNEL_8x8(int64_t ii, int64_t jj) { @@ -2536,9 +2017,9 @@ class tinyBLAS_Q0_PPC { __builtin_mma_xxsetaccz(&acc_2); __builtin_mma_xxsetaccz(&acc_3); if (std::is_same_v) { - packNormalInt4((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray); + packNormalInt4<8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray); } else { - packNormal((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); + packNormal((const block_q8_0*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); } packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true); for(int x = 0; x < 8; x++) { @@ -2570,14 +2051,13 @@ class tinyBLAS_Q0_PPC { compute<8>(&acc_2, 0, 8, comparray, vs, fin_res); compute<8>(&acc_3, 4, 12, comparray, vs, fin_res); } - save_res<4, 4>(ii, jj, 0, fin_res); - save_res<4, 4>(ii+4, jj, 4, fin_res); - save_res<4, 4>(ii, jj+4, 8, fin_res); - save_res<4, 4>(ii+4, jj+4, 12, fin_res); + save_res(ii, jj, 0, fin_res); + save_res(ii+4, jj, 4, fin_res); + save_res(ii, jj+4, 8, fin_res); + save_res(ii+4, jj+4, 12, fin_res); } - template - void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) { + void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) { int64_t ytiles = (m - m0) / RM; int64_t xtiles = (n - n0) / RN; int64_t tiles = xtiles * ytiles; @@ -2606,9 +2086,9 @@ class tinyBLAS_Q0_PPC { __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead __builtin_mma_xxsetaccz(&acc_0); if (isAblock_q4) { - packNormalInt4((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray); + packNormalInt4<4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray); } else { - packNormal((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false); + packNormal((const block_q8_0*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false); } packNormal((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true); for(int x = 0; x < 8; x+=4) { @@ -2641,7 +2121,7 @@ class tinyBLAS_Q0_PPC { fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]); } } - save_res(ii, jj, 0, fin_res); + save_res(ii, jj, 0, fin_res, RM, RN); } } @@ -2654,7 +2134,7 @@ class tinyBLAS_Q0_PPC { } else if constexpr(RM == 8 && RN == 8) { KERNEL_8x8(ii,jj); } else { - static_assert(false, "RN/RM values not supported"); + assert(false && "RN/RM values not supported"); } } @@ -2676,10 +2156,8 @@ class tinyBLAS_Q0_PPC { } const TA *const A; - const TB *const B; - TC *C; - TA *At; - TB *Bt; + const block_q8_0 *const B; + float *C; const int64_t k; const int64_t lda; const int64_t ldb; @@ -2688,13 +2166,12 @@ class tinyBLAS_Q0_PPC { const int nth; }; -template class tinyBLAS_PPC { public: tinyBLAS_PPC(int64_t k, - const TA *A, int64_t lda, - const TB *B, int64_t ldb, - TC *C, int64_t ldc, + const float *A, int64_t lda, + const float *B, int64_t ldb, + float *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } @@ -2707,247 +2184,139 @@ class tinyBLAS_PPC { void (tinyBLAS_PPC::*kernel)(int64_t, int64_t); - template - void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) { + inline void vector_permute_store_4(vector float *src, float *vecOffset) { + vector float t1, t2, t3, t4, t5, t6, t7, t8; + t1 = vec_mergeh(src[0], src[1]); + t2 = vec_mergeh(src[2], src[3]); + t3 = vec_mergel(src[0], src[1]); + t4 = vec_mergel(src[2], src[3]); + + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t1, t2, 3); + t7 = vec_xxpermdi(t3, t4, 0); + t8 = vec_xxpermdi(t3, t4, 3); + + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset + 4); + vec_xst(t7, 0, vecOffset + 8); + vec_xst(t8, 0, vecOffset + 12); + } + + inline void vector_permute_store_8(vector float *src, float *vecOffset) { + vector float t1, t2, t3, t4, t5, t6, t7, t8; + t1 = vec_mergeh(src[0], src[1]); + t2 = vec_mergeh(src[2], src[3]); + t3 = vec_mergeh(src[4], src[5]); + t4 = vec_mergeh(src[6], src[7]); + + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset + 4); + vec_xst(t7, 0, vecOffset + 8); + vec_xst(t8, 0, vecOffset + 12); + + t1 = vec_mergel(src[0], src[1]); + t2 = vec_mergel(src[2], src[3]); + t3 = vec_mergel(src[4], src[5]); + t4 = vec_mergel(src[6], src[7]); + + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + + vec_xst(t5, 0, vecOffset + 16); + vec_xst(t6, 0, vecOffset + 20); + vec_xst(t7, 0, vecOffset + 24); + vec_xst(t8, 0, vecOffset + 28); + } + + void packTranspose(const float* a, int64_t lda, int rows, int cols, float* vec) { int64_t i, j; - TA *aoffset = NULL, *boffset = NULL; - TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; - TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; - __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; - VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0}; - VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0}; - VA t1, t2, t3, t4, t5, t6, t7, t8; - aoffset = const_cast(a); + float * aoffsets[8]; + float *aoffset = NULL, *boffset = NULL; + __vector_pair arr[8]; + vector float c[8][2] = {0}; + vector float c1[8] = {0}; + vector float c2[8] = {0}; + aoffset = const_cast(a); boffset = vec; j = (rows >> 3); if (j > 0) { do { - aoffset1 = aoffset; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; - aoffset4 = aoffset3 + lda; - aoffset5 = aoffset4 + lda; - aoffset6 = aoffset5 + lda; - aoffset7 = aoffset6 + lda; - aoffset8 = aoffset7 + lda; + aoffsets[0] = aoffset; + for (int it = 1; it< 8; it++) + aoffsets[it] = aoffsets[it-1] + lda; aoffset += 8 * lda; i = (cols >> 3); if (i > 0) { do { - C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); - C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); - C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3); - C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4); - C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5); - C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6); - C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7); - C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8); - __builtin_vsx_disassemble_pair(c1, &C1); - __builtin_vsx_disassemble_pair(c2, &C2); - __builtin_vsx_disassemble_pair(c3, &C3); - __builtin_vsx_disassemble_pair(c4, &C4); - __builtin_vsx_disassemble_pair(c5, &C5); - __builtin_vsx_disassemble_pair(c6, &C6); - __builtin_vsx_disassemble_pair(c7, &C7); - __builtin_vsx_disassemble_pair(c8, &C8); - - t1 = vec_mergeh(c1[0], c2[0]); - t2 = vec_mergeh(c3[0], c4[0]); - t3 = vec_mergeh(c5[0], c6[0]); - t4 = vec_mergeh(c7[0], c8[0]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t3, t4, 0); - t7 = vec_xxpermdi(t1, t2, 3); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset); - vec_xst(t6, 0, boffset+4); - vec_xst(t7, 0, boffset+8); - vec_xst(t8, 0, boffset+12); - - t1 = vec_mergel(c1[0], c2[0]); - t2 = vec_mergel(c3[0], c4[0]); - t3 = vec_mergel(c5[0], c6[0]); - t4 = vec_mergel(c7[0], c8[0]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t3, t4, 0); - t7 = vec_xxpermdi(t1, t2, 3); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset+16); - vec_xst(t6, 0, boffset+20); - vec_xst(t7, 0, boffset+24); - vec_xst(t8, 0, boffset+28); - - t1 = vec_mergeh(c1[1], c2[1]); - t2 = vec_mergeh(c3[1], c4[1]); - t3 = vec_mergeh(c5[1], c6[1]); - t4 = vec_mergeh(c7[1], c8[1]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t3, t4, 0); - t7 = vec_xxpermdi(t1, t2, 3); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset+32); - vec_xst(t6, 0, boffset+36); - vec_xst(t7, 0, boffset+40); - vec_xst(t8, 0, boffset+44); - - t1 = vec_mergel(c1[1], c2[1]); - t2 = vec_mergel(c3[1], c4[1]); - t3 = vec_mergel(c5[1], c6[1]); - t4 = vec_mergel(c7[1], c8[1]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t3, t4, 0); - t7 = vec_xxpermdi(t1, t2, 3); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset+48); - vec_xst(t6, 0, boffset+52); - vec_xst(t7, 0, boffset+56); - vec_xst(t8, 0, boffset+60); - - aoffset1 += 8*lda; - aoffset2 += 8*lda; - aoffset3 += 8*lda; - aoffset4 += 8*lda; + for (int it = 0; it< 8; it++) { + arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]); + __builtin_vsx_disassemble_pair(c[it], &arr[it]); + c1[it] = c[it][0]; + c2[it] = c[it][1]; + } + + vector_permute_store_8(c1, boffset); + vector_permute_store_8(c2, boffset+32); + for (int it = 0; it < 4; it++) + aoffsets[it] = aoffsets[it] + 8*lda; boffset += 64; i--; } while(i > 0); } if (cols & 4) { - c1[0] = vec_xl(0, aoffset1); - c2[0] = vec_xl(0, aoffset2); - c3[0] = vec_xl(0, aoffset3); - c4[0] = vec_xl(0, aoffset4); - c5[0] = vec_xl(0, aoffset5); - c6[0] = vec_xl(0, aoffset6); - c7[0] = vec_xl(0, aoffset7); - c8[0] = vec_xl(0, aoffset8); - - t1 = vec_mergeh(c1[0], c2[0]); - t2 = vec_mergeh(c3[0], c4[0]); - t3 = vec_mergeh(c5[0], c6[0]); - t4 = vec_mergeh(c7[0], c8[0]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t3, t4, 0); - t7 = vec_xxpermdi(t1, t2, 3); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset); - vec_xst(t6, 0, boffset+4); - vec_xst(t7, 0, boffset+8); - vec_xst(t8, 0, boffset+12); - - t1 = vec_mergel(c1[0], c2[0]); - t2 = vec_mergel(c3[0], c4[0]); - t3 = vec_mergel(c5[0], c6[0]); - t4 = vec_mergel(c7[0], c8[0]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t3, t4, 0); - t7 = vec_xxpermdi(t1, t2, 3); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset+16); - vec_xst(t6, 0, boffset+20); - vec_xst(t7, 0, boffset+24); - vec_xst(t8, 0, boffset+28); + for (int it = 0; it < 8 ; it++) + c1[it] = vec_xl(0, aoffsets[it]); + vector_permute_store_8(c1, boffset); } j--; } while(j > 0); } if (rows & 4) { - aoffset1 = aoffset; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; - aoffset4 = aoffset3 + lda; + aoffsets[0] = aoffset; + for (int it = 1; it < 4; it++) + aoffsets[it] = aoffsets[it-1] + lda; aoffset += 4 * lda; i = (cols >> 3); if (i > 0) { do { - C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); - C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); - C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3); - C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4); - __builtin_vsx_disassemble_pair(c1, &C1); - __builtin_vsx_disassemble_pair(c2, &C2); - __builtin_vsx_disassemble_pair(c3, &C3); - __builtin_vsx_disassemble_pair(c4, &C4); - - t1 = vec_mergeh(c1[0], c2[0]); - t2 = vec_mergeh(c3[0], c4[0]); - t3 = vec_mergel(c1[0], c2[0]); - t4 = vec_mergel(c3[0], c4[0]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t1, t2, 3); - t7 = vec_xxpermdi(t3, t4, 0); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset); - vec_xst(t6, 0, boffset+4); - vec_xst(t7, 0, boffset+8); - vec_xst(t8, 0, boffset+12); - - t1 = vec_mergeh(c1[1], c2[1]); - t2 = vec_mergeh(c3[1], c4[1]); - t3 = vec_mergel(c1[1], c2[1]); - t4 = vec_mergel(c3[1], c4[1]); - t5 = vec_xxpermdi(t1, t2, 0); - t6 = vec_xxpermdi(t1, t2, 3); - t7 = vec_xxpermdi(t3, t4, 0); - t8 = vec_xxpermdi(t3, t4, 3); - vec_xst(t5, 0, boffset+16); - vec_xst(t6, 0, boffset+20); - vec_xst(t7, 0, boffset+24); - vec_xst(t8, 0, boffset+28); - - aoffset1 += 8*lda; - aoffset2 += 8*lda; - aoffset3 += 8*lda; - aoffset4 += 8*lda; + for (int it = 0; it < 4; it++) { + arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)aoffsets[it]); + __builtin_vsx_disassemble_pair(c[it], &arr[it]); + c1[it] = c[it][0]; + c2[it] = c[it][1]; + } + vector_permute_store_4(c1, boffset); + vector_permute_store_4(c2, boffset+16); + for (int it = 0; it < 4; it++) + aoffsets[it] += 8*lda; boffset += 32; i--; } while(i > 0); } if (cols & 4) { - c1[0] = vec_xl(0, aoffset1); - c2[0] = vec_xl(0, aoffset2); - c3[0] = vec_xl(0, aoffset3); - c4[0] = vec_xl(0, aoffset4); - - t1 = vec_mergeh(c1[0], c2[0]); - t2 = vec_mergeh(c3[0], c4[0]); - t3 = vec_xxpermdi(t1, t2, 0); - t4 = vec_xxpermdi(t1, t2, 3); - vec_xst(t3, 0, boffset); - vec_xst(t4, 0, boffset+4); - - t1 = vec_mergel(c1[0], c2[0]); - t2 = vec_mergel(c3[0], c4[0]); - t3 = vec_xxpermdi(t1, t2, 0); - t4 = vec_xxpermdi(t1, t2, 3); - vec_xst(t3, 0, boffset+8); - vec_xst(t4, 0, boffset+12); + for (int it = 0; it < 4; it++) + c1[it] = vec_xl(0, aoffsets[it]); + vector_permute_store_4(c1, boffset); } } if (rows & 3) { - aoffset1 = aoffset; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; + aoffsets[0] = aoffset; + for (int it = 1; it < 3; it++) + aoffsets[it] = aoffsets[it-1] + lda; if (cols & 4) { - c1[0] = vec_xl(0, aoffset1); - c2[0] = vec_xl(0, aoffset2); - c3[0] = vec_xl(0, aoffset3); - - t1 = vec_mergeh(c1[0], c2[0]); - t2 = vec_mergeh(c3[0], c4[0]); - t3 = vec_xxpermdi(t1, t2, 0); - t4 = vec_xxpermdi(t1, t2, 3); - vec_xst(t3, 0, boffset); - vec_xst(t4, 0, boffset+4); - - t1 = vec_mergel(c1[0], c2[0]); - t2 = vec_mergel(c3[0], c4[0]); - t3 = vec_xxpermdi(t1, t2, 0); - t4 = vec_xxpermdi(t1, t2, 3); - vec_xst(t3, 0, boffset+8); - vec_xst(t4, 0, boffset+12); + for (int it = 0; it < 3; it++) + c1[it] = vec_xl(0, aoffsets[it]); + vector_permute_store_4(c1, boffset); } } } @@ -2957,8 +2326,8 @@ class tinyBLAS_PPC { acc_t acc_0; __builtin_mma_xxsetaccz(&acc_0); for (int l = 0; l < k; l+=4) { - packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A); - packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]); @@ -2973,8 +2342,8 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); for (int64_t l = 0; l < k; l+=4) { - packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A); - packTranspose(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]); __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]); @@ -2994,8 +2363,8 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); for (int64_t l = 0; l < k; l+=4) { - packTranspose(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A); - packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]); @@ -3017,8 +2386,8 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_2); __builtin_mma_xxsetaccz(&acc_3); for (int l = 0; l < k; l+=8) { - packTranspose(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A); - packTranspose(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B); for(int x = 0; x < 16; x+=2) { __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]); __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]); @@ -3033,155 +2402,37 @@ class tinyBLAS_PPC { } void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { - int64_t mc, nc, mp, np; - int m_rem = MIN(m - m0, 16); - int n_rem = MIN(n - n0, 16); - if (m_rem >= 16 && n_rem >= 8) { - mc = 8; - nc = 8; - gemm<8,8>(m0, m, n0, n); - } else if(m_rem >= 8 && n_rem >= 16) { - mc = 8; - nc = 8; - gemm<8,8>(m0, m, n0, n); - } else if (m_rem >= 8 && n_rem >= 8) { - mc = 8; - nc = 8; - gemm<8,8>(m0, m, n0, n); + int m_rem = MIN(m - m0, 8); + int n_rem = MIN(n - n0, 8); + int mc = 0, nc = 0; + if (m_rem >= 8 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8, 8>(m0, m, n0, n); } else if (m_rem >= 4 && n_rem >= 8) { - mc = 4; - nc = 8; - gemm<4,8>(m0, m, n0, n); + mc = 4; + nc = 8; + gemm<4, 8>(m0, m, n0, n); } else if (m_rem >= 8 && n_rem >= 4) { - mc = 8; - nc = 4; - gemm<8,4>(m0, m, n0, n); + mc = 8; + nc = 4; + gemm<8, 4>(m0, m, n0, n); } else if (m_rem >= 4 && n_rem >= 4) { - mc = 4; - nc = 4; - gemm<4,4>(m0, m, n0, n); - } else if ((m_rem < 4) && (n_rem > 4)) { - nc = 4; - switch(m_rem) { - case 1: - mc = 1; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 2: - mc = 2; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 3: - mc = 3; - gemm_small(m0, m, n0, n, mc, nc); - break; - default: - return; - } - } else if ((m_rem > 4) && (n_rem < 4)) { - mc = 4; - switch(n_rem) { - case 1: - nc = 1; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 2: - nc = 2; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 3: - nc = 3; - gemm_small(m0, m, n0, n, mc, nc); - break; - default: - return; - } + mc = 4; + nc = 4; + gemm<4, 4>(m0, m, n0, n); } else { - switch((m_rem << 4) | n_rem) { - case 0x43: - mc = 4; - nc = 3; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x42: - mc = 4; - nc = 2; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x41: - mc = 4; - nc = 1; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x34: - mc = 3; - nc = 4; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x33: - mc = 3; - nc = 3; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x32: - mc = 3; - nc = 2; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x31: - mc = 3; - nc = 1; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x24: - mc = 2; - nc = 4; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x23: - mc = 2; - nc = 3; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x22: - mc = 2; - nc = 2; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x21: - mc = 2; - nc = 1; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x14: - mc = 1; - nc = 4; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x13: - mc = 1; - nc = 3; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x12: - mc = 1; - nc = 2; - gemm_small(m0, m, n0, n, mc, nc); - break; - case 0x11: - mc = 1; - nc = 1; - gemm_small(m0, m, n0, n, mc, nc); - break; - default: - return; - } + mc = (m_rem >= 4) ? 4 : m_rem; + nc = (n_rem >= 4) ? 4 : n_rem; + if (mc == 0 || nc == 0) + return; + gemm_small(m0, m, n0, n, mc, nc); } - mp = m0 + (m - m0) / mc * mc; - np = n0 + (n - n0) / nc * nc; + int64_t mp = m0 + ((m - m0) / mc) * mc; + int64_t np = n0 + ((n - n0) / nc) * nc; mnpack(mp, m, n0, np); mnpack(m0, m, np, n); - } + } void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) { int64_t ytiles = (m - m0) / RM; @@ -3206,22 +2457,22 @@ class tinyBLAS_PPC { * matrix elements. */ if (RM == 1) { - TA* a = const_cast(A+(ii)*lda+l); - packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); + float* a = const_cast(A+(ii)*lda+l); + packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B); vec_A[0] = (vec_t)vec_xl(0,a); - vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1)); - vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2)); - vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3)); + vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1)); + vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2)); + vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3)); } else if (RN == 1) { - packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); - TB* b = const_cast(B+(jj)*ldb+l); + packTranspose(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A); + float* b = const_cast(B+(jj)*ldb+l); vec_B[0] = (vec_t)vec_xl(0,b); - vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1)); - vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2)); - vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3)); + vec_B[1] = (vec_t)vec_splats(*((float*)&vec_B+1)); + vec_B[2] = (vec_t)vec_splats(*((float*)&vec_B+2)); + vec_B[3] = (vec_t)vec_splats(*((float*)&vec_B+3)); } else { - packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); - packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); + packTranspose(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B); } __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); @@ -3231,7 +2482,7 @@ class tinyBLAS_PPC { __builtin_mma_disassemble_acc(vec_C, &acc_0); for (int I = 0; I < RM; I++) { for (int J = 0; J < RN; J++) { - *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J); + *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); } } } @@ -3263,11 +2514,9 @@ class tinyBLAS_PPC { } } - const TA *const A; - const TB *const B; - TC *C; - TA *At; - TB *Bt; + const float *const A; + const float *const B; + float *C; const int64_t k; const int64_t lda; const int64_t ldb; @@ -3366,7 +2615,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 #elif defined(__MMA__) if (k % 8) return false; - tinyBLAS_PPC tb{ + tinyBLAS_PPC tb{ k, (const float *)A, lda, (const float *)B, ldb, (float *)C, ldc, @@ -3493,7 +2742,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 return false; if (m < 8 && m != 4) return false; - tinyBLAS_Q0_PPC tb{ + tinyBLAS_Q0_PPC tb{ k, (const block_q8_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, @@ -3530,7 +2779,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 return false; if (m < 8 && m != 4) return false; - tinyBLAS_Q0_PPC tb{ + tinyBLAS_Q0_PPC tb{ k, (const block_q4_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, From a6b9271c2c6606b42ecf0c4e8894768a04ed7cdb Mon Sep 17 00:00:00 2001 From: Anton Mitkov Date: Mon, 14 Jul 2025 18:12:42 +0100 Subject: [PATCH 0109/1348] sycl: Hotfix for non dnnl codepath (llama/14677) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index cf46012be81..a6f9af0c86e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2875,12 +2875,20 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons } } +#if GGML_SYCL_DNNL + // oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1; src1_f16_alloc.alloc(ne_src1); - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst); GGML_ASSERT(to_fp16_sycl != nullptr); to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue); +# else + const int64_t ne_src1 = ggml_nelements(src1); + src1_f16_alloc.alloc(ne_src1); + const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type); + GGML_ASSERT(to_fp16_nc_sycl != nullptr); + to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue); +#endif src1_f16 = src1_f16_alloc.get(); s11 = ne10; From ab79c6c1186f3f4ddf16a0f2b0a9233a27c428c8 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Tue, 15 Jul 2025 15:28:53 +0800 Subject: [PATCH 0110/1348] cuda: fix build warnings in set-rows.cu (unused variable) (llama/14687) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/set-rows.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 3fade72b84e..58cee924401 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -3,7 +3,10 @@ typedef void (*set_rows_kernel_t)(const char * src, char * dst); template -__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {} +__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) { + GGML_UNUSED(src_f); + GGML_UNUSED(dst_f); +} template<> __device__ __forceinline__ void set_rows_1(const float * src_f, half * dst_h) { @@ -53,6 +56,9 @@ static __global__ void k_set_rows( const src_t* src_elem = src0_row + i00; dst_t* dst_elem = dst_row_ptr + i00; set_rows_1(src_elem, dst_elem); + + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); } template From b33841c453f3c7b14db36d4831ce0682328c140c Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 15 Jul 2025 14:32:11 -0500 Subject: [PATCH 0111/1348] vulkan: add RTE variants for glu/add/sub/mul/div (llama/14653) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 ++- .../vulkan-shaders/copy_to_quant.comp | 6 +- .../vulkan-shaders/generic_binary_head.comp | 2 + .../ggml-vulkan/vulkan-shaders/glu_head.comp | 2 + .../ggml-vulkan/vulkan-shaders/im2col.comp | 5 +- .../ggml-vulkan/vulkan-shaders/rope_head.comp | 5 +- ggml/src/ggml-vulkan/vulkan-shaders/rte.comp | 5 ++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 85 +++++++++++++++---- 8 files changed, 90 insertions(+), 32 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/rte.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 416ee3bd3f7..9f5646bf29d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2835,10 +2835,11 @@ static void ggml_vk_load_shaders(vk_device& device) { return s; }; + bool rte = device->float_controls_rte_fp16; #define CREATE_BINARY(name, namemod, spec) \ for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \ ggml_vk_create_pipeline(device, device->pipeline_ ## name ## namemod[s0][s1][d], \ - #name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \ + #name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d][rte], name ## _data[s0][s1][d][rte], \ "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1); CREATE_BINARY(add, , {0}) @@ -2890,8 +2891,13 @@ static void ggml_vk_load_shaders(vk_device& device) { #undef CREATE_UNARY #define CREATE_GLU(name) \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); + if (device->float_controls_rte_fp16) { \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + } else { \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + } CREATE_GLU(geglu) CREATE_GLU(reglu) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp index e06547e48f7..27d6b7464f6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp @@ -1,10 +1,6 @@ #version 450 -#if RTE16 -#extension GL_EXT_spirv_intrinsics : enable -spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits -#endif // RTE16 - +#include "rte.comp" #include "types.comp" #if defined(SET_ROWS) && QUANT_K == 1 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp index 062e2a4cdf2..4b4316cf3d9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp @@ -1,6 +1,8 @@ #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_control_flow_attributes : require +#include "rte.comp" + layout (push_constant) uniform parameter { uint ne; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp index 41a29889075..004a61fc162 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp @@ -1,5 +1,7 @@ #extension GL_EXT_shader_16bit_storage : require +#include "rte.comp" + layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index 09aa849e881..17c7ccb90d0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -1,12 +1,9 @@ #version 450 #extension GL_EXT_shader_16bit_storage : require -#extension GL_EXT_spirv_intrinsics: enable #extension GL_EXT_control_flow_attributes : require -#if RTE16 -spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits -#endif +#include "rte.comp" layout (push_constant) uniform parameter { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp index 96c9c4cbd30..00e203e73bd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp @@ -1,11 +1,8 @@ #include "types.comp" #extension GL_EXT_shader_16bit_storage : require -#extension GL_EXT_spirv_intrinsics: enable -#if RTE16 -spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits -#endif +#include "rte.comp" layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp new file mode 100644 index 00000000000..ad51c1e80b8 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp @@ -0,0 +1,5 @@ + +#if RTE16 +#extension GL_EXT_spirv_intrinsics : enable +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif // RTE16 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index d4a4e4c5290..809c0bd9bd3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -537,8 +537,10 @@ void process_shaders() { for (auto src0_f16 : {false, true}) { for (auto src1_f16 : {false, true}) { for (auto dst_f16 : {false, true}) { - auto name = op + get_suffix(src0_f16, src1_f16, dst_f16); - string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}}); + for (auto rte : {false, true}) { + auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : ""); + string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); + } } } } @@ -592,16 +594,19 @@ void process_shaders() { string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("reglu_f16", "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("geglu_erf_f16", "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("geglu_erf_f32", "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + for (auto rte : {false, true}) { + std::string suffix = rte ? "_rte" : ""; + string_to_spv("geglu_f16" + suffix, "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("geglu_f32" + suffix, "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("reglu_f16" + suffix, "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("reglu_f32" + suffix, "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("swiglu_f16" + suffix, "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("swiglu_f32" + suffix, "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("geglu_erf_f16" + suffix, "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("geglu_erf_f32" + suffix, "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("geglu_quick_f16" + suffix,"geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}}); + string_to_spv("geglu_quick_f32" + suffix,"geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); + } string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); @@ -709,11 +714,59 @@ void write_output_files() { std::remove(path.c_str()); } } + + std::string suffixes[2] = {"_f32", "_f16"}; for (const char *op : {"add", "sub", "mul", "div"}) { - fprintf(hdr, "extern unsigned char *%s_data[2][2][2];\n", op); - fprintf(hdr, "extern uint64_t %s_len[2][2][2];\n", op); - fprintf(src, "unsigned char *%s_data[2][2][2] = {{{%s_f32_f32_f32_data, %s_f32_f32_f16_data}, {%s_f32_f16_f32_data, %s_f32_f16_f16_data}}, {{%s_f16_f32_f32_data, %s_f16_f32_f16_data}, {%s_f16_f16_f32_data, %s_f16_f16_f16_data}}};\n", op, op, op, op, op, op, op, op, op); - fprintf(src, "uint64_t %s_len[2][2][2] = {{{%s_f32_f32_f32_len, %s_f32_f32_f16_len}, {%s_f32_f16_f32_len, %s_f32_f16_f16_len}}, {{%s_f16_f32_f32_len, %s_f16_f32_f16_len}, {%s_f16_f16_f32_len, %s_f16_f16_f16_len}}};\n", op, op, op, op, op, op, op, op, op); + fprintf(hdr, "extern unsigned char *%s_data[2][2][2][2];\n", op); + fprintf(hdr, "extern uint64_t %s_len[2][2][2][2];\n", op); + std::string data = "unsigned char *" + std::string(op) + "_data[2][2][2][2] = "; + std::string len = "uint64_t " + std::string(op) + "_len[2][2][2][2] = "; + for (uint32_t t0 = 0; t0 < 2; ++t0) { + if (t0 == 0) { + data += "{"; + len += "{"; + } + for (uint32_t t1 = 0; t1 < 2; ++t1) { + if (t1 == 0) { + data += "{"; + len += "{"; + } + for (uint32_t t2 = 0; t2 < 2; ++t2) { + if (t2 == 0) { + data += "{"; + len += "{"; + } + for (uint32_t rte = 0; rte < 2; ++rte) { + if (rte == 0) { + data += "{"; + len += "{"; + } + data += op + suffixes[t0] + suffixes[t1] + suffixes[t2] + ((rte != 0) ? "_rte" : ""); + len += op + suffixes[t0] + suffixes[t1] + suffixes[t2] + ((rte != 0) ? "_rte" : ""); + data += "_data,"; + len += "_len,"; + if (rte == 1) { + data += "}, "; + len += "}, "; + } + } + if (t2 == 1) { + data += "}, "; + len += "}, "; + } + } + if (t1 == 1) { + data += "}, "; + len += "}, "; + } + } + if (t0 == 1) { + data += "};\n"; + len += "};\n"; + } + } + fprintf(src, data.c_str()); + fprintf(src, len.c_str()); } fclose(hdr); fclose(src); From 8d1a0485f1a31ec1530700963053387c8c9569e1 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 15 Jul 2025 14:51:09 -0500 Subject: [PATCH 0112/1348] vulkan: fix noncontig check for mat_mul_id splitting (llama/14683) * vulkan: fix noncontig check for mat_mul_id splitting Remove supports_op check for > 4096 (splitting fixes this) * vulkan: fix batched matmul dequant for Q*_K --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 +----- ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp | 2 +- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 9f5646bf29d..3019a545d58 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4922,7 +4922,7 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { return tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && - tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; + (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]); } static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { @@ -10356,10 +10356,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm // If there's not enough shared memory for row_ids and the result tile, fallback to CPU return false; } - // Check against size of shared memory variable - if (op->src[2]->ne[0] > 4096) { - return false; - } } switch (src0_type) { case GGML_TYPE_F32: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp index 157154af3a3..d4e4e6bae63 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp @@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { const uint i = gl_WorkGroupID.x * 256 + wgy; - if (i >= p.M * p.K / QUANT_K) { + if (i >= p.nel / QUANT_K) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp index c17dd0d9991..3661f771c74 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp @@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { const uint i = uint(gl_WorkGroupID.x * 256 + wgy); - if (i >= p.M * p.K / QUANT_K) { + if (i >= p.nel / QUANT_K) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp index 987f113a35a..1370db3654d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp @@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { const uint ib = gl_WorkGroupID.x * 256 + wgy; - if (ib >= p.M * p.K / QUANT_K) { + if (ib >= p.nel / QUANT_K) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp index 6db5403b661..3f3b839e118 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp @@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { const uint ib = gl_WorkGroupID.x * 256 + wgy; - if (ib >= p.M * p.K / QUANT_K) { + if (ib >= p.nel / QUANT_K) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp index 0b91317550f..9cf34256e8c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp @@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { const uint i = gl_WorkGroupID.x * 256 + wgy; - if (i >= p.M * p.K / QUANT_K) { + if (i >= p.nel / QUANT_K) { return; } const uint tid = gl_LocalInvocationID.x; From 9cc645fec0c207925378d7eb70aaf0c1a3bd73d4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 16 Jul 2025 14:43:32 +0300 Subject: [PATCH 0113/1348] ggml : add asserts (llama/14720) * ggml : add asserts ggml-ci * cont : fix constant type Co-authored-by: Diego Devesa --------- Co-authored-by: Diego Devesa --- ggml/src/ggml-cpu/ops.cpp | 3 +++ ggml/src/ggml-cpu/vec.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index fd77e9a6aba..6581d27adde 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32( const float scale = 1.0f/sqrtf(mean + eps); + // if you hit this, likely you got an inf somewhere earlier + assert(scale > 0.0f); + ggml_vec_scale_f32(ne00, y, scale); } } diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index a8156011eba..07b377bdd82 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G for (int i = np; i < n; ++i) { sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } + + // if you hit this, you are likely running outside the FP range + assert(!isnan(sumf) && !isinf(sumf)); #else for (int i = 0; i < n; ++i) { sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); From ae1bb2c8ea1ccb96bf1293cee32a4193bab95726 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 16 Jul 2025 16:35:42 +0300 Subject: [PATCH 0114/1348] llama : add high-throughput mode (llama/14363) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * kv-cache : prepare K/V buffers for separation ggml-ci * batched-bench : fix oob write ggml-ci * llama : add "virtual sequences" ggml-ci * llama : use "stream" vs "virtual sequence" ggml-ci * graph : fix stream splitting when KV cache is not used ggml-ci * kv-cache : add multi-stream save/load support ggml-ci * llama : add "--attn-streams" flag ggml-ci * kv-cache : fix handling when find_slot fails ggml-ci * kv-cache : restore find_slot impl ggml-ci * kv-cache : add comments * kv-cache : add bounds checks for sequence id ggml-ci * cont : add n_seq_max to batch allocr ggml-ci * kv-cache : perform stream copies lazily after llama_synchronize ggml-ci * kv-cache : avoid throwing exceptions across the C boundary ggml-ci * CUDA: 4D FlashAttention support (llama/14628) * CUDA: 4D FlashAttention support * CUDA: fix WMMA FA kernel * llama : rename attn_streams -> kv_unified ggml-ci * common : rename kv_split -> kv_unified ggml-ci --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/fattn-common.cuh | 54 ++++++++++++++++++---------- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 40 +++++++++++---------- ggml/src/ggml-cuda/fattn-tile-f16.cu | 34 ++++++++++-------- ggml/src/ggml-cuda/fattn-tile-f32.cu | 30 +++++++++------- ggml/src/ggml-cuda/fattn-vec-f16.cuh | 23 ++++++------ ggml/src/ggml-cuda/fattn-vec-f32.cuh | 27 +++++++------- ggml/src/ggml-cuda/fattn-wmma-f16.cu | 24 ++++++++----- ggml/src/ggml-cuda/ggml-cuda.cu | 9 ++--- 8 files changed, 141 insertions(+), 100 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 075f14a49e9..9122fca6cf9 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -33,8 +33,10 @@ typedef void (* fattn_kernel_t)( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -521,7 +523,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) { template // D == head size __launch_bounds__(D, 1) static __global__ void flash_attn_stream_k_fixup( - float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) { + float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11) { constexpr int ncols = ncols1*ncols2; const int bidx0 = blockIdx.x; @@ -535,8 +537,8 @@ static __global__ void flash_attn_stream_k_fixup( const int iter_k = ne11 / FATTN_KQ_STRIDE; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - const int kbc0 = (bidx0 + 0)*iter_k*iter_j*(ne02/ncols2) / gridDim.x; - const int kbc0_stop = (bidx0 + 1)*iter_k*iter_j*(ne02/ncols2) / gridDim.x; + const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; @@ -545,14 +547,15 @@ static __global__ void flash_attn_stream_k_fixup( return; } - const int channel = kbc0 / (iter_k*iter_j); - const int jt = (kbc0 - channel*iter_k*iter_j) / iter_k; + const int sequence = kbc0 / (iter_k*iter_j*(ne02/ncols2)); + const int head = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); + const int jt = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. if (jt*ncols1 + j >= ne01) { return; } - dst += jt*ne02*(ncols1*D) + channel*(ncols2*D) + (j*ne02 + c)*D + tid; + dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + head*(ncols2*D) + (j*ne02 + c)*D + tid; // Load the partial result that needs a fixup: float dst_val = 0.0f; @@ -571,7 +574,7 @@ static __global__ void flash_attn_stream_k_fixup( int bidx = bidx0 - 1; int kbc_stop = kbc0; while(true) { - const int kbc = bidx*iter_k*iter_j*(ne02/ncols2) / gridDim.x; + const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; if (kbc == kbc_stop) { // Did not have any data. bidx--; kbc_stop = kbc; @@ -617,16 +620,31 @@ static __global__ void flash_attn_combine_results( const float2 * __restrict__ VKQ_meta, float * __restrict__ dst, const int parallel_blocks) { - VKQ_parts += parallel_blocks*D * gridDim.z*blockIdx.x; - VKQ_meta += parallel_blocks * gridDim.z*blockIdx.x; - dst += D * gridDim.z*blockIdx.x; + // Dimension 0: threadIdx.x + // Dimension 1: blockIdx.x + // Dimension 2: blockIdx.y + // Dimension 3: blockIdx.z + // Memory layout is permuted with [0, 2, 1, 3] + + const int ne01 = gridDim.x; + const int ne02 = gridDim.y; + + const int col = blockIdx.x; + const int head = blockIdx.y; + const int sequence = blockIdx.z; + + const int j_dst_unrolled = (sequence*ne01 + col)*ne02 + head; + + VKQ_parts += j_dst_unrolled * parallel_blocks*D; + VKQ_meta += j_dst_unrolled * parallel_blocks; + dst += j_dst_unrolled * D; const int tid = threadIdx.x; __builtin_assume(tid < D); extern __shared__ float2 meta[]; for (int i = tid; i < 2*parallel_blocks; i += D) { - ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i]; + ((float *) meta)[i] = ((const float *)VKQ_meta) [i]; } __syncthreads(); @@ -644,11 +662,11 @@ static __global__ void flash_attn_combine_results( const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD); *((uint32_t *) &KQ_max_scale) &= ftz_mask; - VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.z*D + blockIdx.z*D + tid]; + VKQ_numerator += KQ_max_scale * VKQ_parts[l*D + tid]; VKQ_denominator += KQ_max_scale * meta[l].y; } - dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator; + dst[tid] = VKQ_numerator / VKQ_denominator; } [[noreturn]] @@ -705,8 +723,6 @@ void launch_fattn( GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding."); - GGML_ASSERT(Q->ne[3] == 1); - ggml_cuda_pool & pool = ctx.pool(); cudaStream_t main_stream = ctx.stream(); const int id = ggml_cuda_get_device(); @@ -853,8 +869,8 @@ void launch_fattn( scale, max_bias, m0, m1, n_head_log2, logit_softcap, Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], K->ne[0], K->ne[1], K->ne[2], K->ne[3], - mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, - mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, + mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0, + mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0, Q->nb[1], Q->nb[2], Q->nb[3], nb11, nb12, nb13, nb21, nb22, nb23, @@ -869,11 +885,11 @@ void launch_fattn( flash_attn_stream_k_fixup <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]); + ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]); } } else if (parallel_blocks > 1) { const dim3 block_dim_combine(DV, 1, 1); - const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z); + const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]); const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2); flash_attn_combine_results diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 709589854f0..6fa2e77299e 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1224,8 +1224,10 @@ static __global__ void flash_attn_ext_f16( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -1274,8 +1276,8 @@ static __global__ void flash_attn_ext_f16( constexpr int kb_niter = FATTN_KQ_STRIDE / c::nbatch_fa; // Number of kernel iterations per assigned KQ slice. // kbc == k block continuous, current index in continuous ijk space. - int kbc = (blockIdx.x + 0)*iter_k*iter_j*(ne02/ncols2) / gridDim.x; - const int kbc_stop = (blockIdx.x + 1)*iter_k*iter_j*(ne02/ncols2) / gridDim.x; + int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). @@ -1285,18 +1287,19 @@ static __global__ void flash_attn_ext_f16( int kb0_start = kbc % iter_k; int kb0_stop = min(iter_k, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == iter_k) { - const int channel = kbc / (iter_k*iter_j); - const int jt = (kbc - channel*iter_k*iter_j) / iter_k; // j index of current tile. + const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2)); + const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); + const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. - const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); - const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); + const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2)); + const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio)); const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr : - (const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1); - float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); + (const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1); + float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2); - const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); + const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio)); - const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f; + const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f; const int kb0_start_kernel = kb0_start * kb_niter; const int kb0_stop_kernel = kb0_stop * kb_niter; @@ -1325,18 +1328,19 @@ static __global__ void flash_attn_ext_f16( return; } - const int channel = kbc / (iter_k*iter_j); - const int jt = (kbc - channel*iter_k*iter_j) / iter_k; // j index of current tile. + const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2)); + const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); + const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. - const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); - const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); + const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2)); + const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio)); const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr : - (const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1); - float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); + (const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1); + float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2); - const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); + const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio)); - const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f; + const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f; const int kb0_start_kernel = kb0_start * kb_niter; const int kb0_stop_kernel = kb0_stop * kb_niter; diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 0c967f178e7..1f141328845 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -31,8 +31,10 @@ static __global__ void flash_attn_tile_ext_f16( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -62,15 +64,17 @@ static __global__ void flash_attn_tile_ext_f16( const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on. + const int sequence = blockIdx.z / ne02; + const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); + const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio)); + const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); const int stride_KV2 = nb11 / sizeof(half2); - const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); + const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); const half slopeh = __float2half(slopef); static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); @@ -255,6 +259,8 @@ static __global__ void flash_attn_tile_ext_f16( __syncthreads(); } + float2 * dst2 = (float2 *) dst; + #pragma unroll for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { const int j_VKQ = j_VKQ_0 + threadIdx.y; @@ -266,21 +272,21 @@ static __global__ void flash_attn_tile_ext_f16( half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]); kqsum_j = warp_reduce_sum((float)kqsum_j); + const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y; + #pragma unroll - for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { - const int i0 = i00 + 2*threadIdx.x; + for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) { + const int i0 = i00 + threadIdx.x; - half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)]; + half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE]; if (gridDim.y == 1) { dst_val /= __half2half2(kqsum_j); } - const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y; - dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] = __low2float(dst_val); - dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = __high2float(dst_val); + dst2[j_dst_unrolled*(D/2) + i0] = __half22float2(dst_val); } if (gridDim.y != 1 && threadIdx.x == 0) { - dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); } } #else @@ -290,8 +296,8 @@ static __global__ void flash_attn_tile_ext_f16( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 908c76dbdd2..a4965583cef 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -31,8 +31,10 @@ static __global__ void flash_attn_tile_ext_f32( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -74,15 +76,17 @@ static __global__ void flash_attn_tile_ext_f32( const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on. + const int sequence = blockIdx.z / ne02; + const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); + const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio)); + const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); const int stride_KV2 = nb11 / sizeof(half2); - const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); + const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); @@ -265,6 +269,8 @@ static __global__ void flash_attn_tile_ext_f32( __syncthreads(); } + float2 * dst2 = (float2 *) dst; + #pragma unroll for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { const int j_VKQ = j_VKQ_0 + threadIdx.y; @@ -276,22 +282,22 @@ static __global__ void flash_attn_tile_ext_f32( float kqsum_j = kqsum[j_VKQ_0/nwarps]; kqsum_j = warp_reduce_sum(kqsum_j); + const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y; + #pragma unroll - for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { - const int i0 = i00 + 2*threadIdx.x; + for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) { + const int i0 = i00 + threadIdx.x; - float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)]; + float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE]; if (gridDim.y == 1) { dst_val.x /= kqsum_j; dst_val.y /= kqsum_j; } - const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y; - dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] = dst_val.x; - dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = dst_val.y; + dst2[j_dst_unrolled*(D/2) + i0] = dst_val; } if (gridDim.y != 1 && threadIdx.x == 0) { - dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); } } #else diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index e78fb181919..b2d469938ab 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -28,8 +28,10 @@ static __global__ void flash_attn_vec_ext_f16( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -65,14 +67,16 @@ static __global__ void flash_attn_vec_ext_f16( const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on. + const int sequence = blockIdx.z / ne02; + const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - Q += nb02* blockIdx.z + nb01*ic0; - K += nb12*(blockIdx.z / gqa_ratio); - V += nb22*(blockIdx.z / gqa_ratio); + Q += nb03*sequence + nb02* head + nb01*ic0; + K += nb13*sequence + nb12*(head / gqa_ratio); + V += nb23*sequence + nb22*(head / gqa_ratio); - const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); - const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); + const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); const half slopeh = __float2half(slopef); static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); @@ -330,12 +334,11 @@ static __global__ void flash_attn_vec_ext_f16( if (gridDim.y == 1) { dst_val /= kqsum[j_VKQ]; } - const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y; - dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val; + dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val; } if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { - dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); + dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); } #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); @@ -344,8 +347,8 @@ static __global__ void flash_attn_vec_ext_f16( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne32); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index b2f1724c955..405b6f5106e 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -28,8 +28,10 @@ static __global__ void flash_attn_vec_ext_f32( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -53,8 +55,8 @@ static __global__ void flash_attn_vec_ext_f32( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); @@ -77,14 +79,16 @@ static __global__ void flash_attn_vec_ext_f32( const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on. + const int sequence = blockIdx.z / ne02; + const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - Q += nb02* blockIdx.z + nb01*ic0; - K += nb12*(blockIdx.z / gqa_ratio); - V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape + Q += nb03*sequence + nb02* head + nb01*ic0; + K += nb13*sequence + nb12*(head / gqa_ratio); + V += nb23*sequence + nb22*(head / gqa_ratio); - const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); - const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); + const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); constexpr int nwarps = D / WARP_SIZE; @@ -326,12 +330,11 @@ static __global__ void flash_attn_vec_ext_f32( if (gridDim.y == 1) { dst_val /= kqsum[j_VKQ]; } - const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y; - dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val; + dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val; } if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { - dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); + dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); } #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); @@ -340,8 +343,8 @@ static __global__ void flash_attn_vec_ext_f32( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index c95ca7b1f28..741b8781d29 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -47,8 +47,10 @@ static __global__ void flash_attn_ext_f16( const int ne13, const int ne31, const int ne32, + const int ne33, const int nb31, const int nb32, + const int nb33, const int nb01, const int nb02, const int nb03, @@ -95,17 +97,19 @@ static __global__ void flash_attn_ext_f16( constexpr int kqs_padded = FATTN_KQ_STRIDE + 8; constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); + const int sequence = blockIdx.z / ne02; + const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0); - const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio)); - const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0); + const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0); + const half * K_h = (const half *) (K + nb13* sequence + nb12*(head / gqa_ratio)); + const half * V_h = (const half *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); const half2 * mask2 = (const half2 *) maskh; const int stride_Q = nb01 / sizeof(float); const int stride_KV = nb11 / sizeof(half); - const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1); + const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); const half slopeh = __float2half(slopef); const half2 slope2 = make_half2(slopef, slopef); @@ -400,7 +404,6 @@ static __global__ void flash_attn_ext_f16( if (ic0 + j_VKQ >= ne01) { return; } - const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y; float KQ_rowsum_j; if (std::is_same::value) { @@ -409,6 +412,8 @@ static __global__ void flash_attn_ext_f16( KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]); } + const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y; + #pragma unroll for (int i0 = 0; i0 < D; i0 += warp_size) { const int i = i0 + threadIdx.x; @@ -419,7 +424,7 @@ static __global__ void flash_attn_ext_f16( if (gridDim.y == 1) { dst_val /= KQ_rowsum_j; } - dst[j_dst*gridDim.z*D + blockIdx.z*D + i] = dst_val; + dst[j_dst_unrolled*D + i] = dst_val; } if (gridDim.y == 1 || threadIdx.x != 0) { @@ -433,7 +438,7 @@ static __global__ void flash_attn_ext_f16( dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]); } dst_meta_val.y = KQ_rowsum_j; - dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val; + dst_meta[j_dst_unrolled] = dst_meta_val; } #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); @@ -442,7 +447,8 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); GGML_UNUSED(nb31); + GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8015b0d4e8d..778d5a48bd9 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3413,12 +3413,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g if (op->src[0]->ne[0] == 192) { return false; } - // TODO: support broadcast - // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but - // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505 - if (op->src[0]->ne[3] != 1) { - return false; - } if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) { return false; } @@ -3431,6 +3425,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) { return true; } + if (op->src[3] && op->src[3]->ne[2] != 1) { + return false; + } return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; } From 17c541119518fb1b92a27bd26fa82606cb090178 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 16 Jul 2025 08:18:51 -0700 Subject: [PATCH 0115/1348] ggml: Add initial WebGPU backend (llama/14521) * Minimal setup of webgpu backend with dawn. Just prints out the adapter and segfaults * Initialize webgpu device * Making progress on setting up the backend * Finish more boilerplate/utility functions * Organize file and work on alloc buffer * Add webgpu_context to prepare for actually running some shaders * Work on memset and add shader loading * Work on memset polyfill * Implement set_tensor as webgpu WriteBuffer, remove host_buffer stubs since webgpu doesn't support it * Implement get_tensor and buffer_clear * Finish rest of setup * Start work on compute graph * Basic mat mul working * Work on emscripten build * Basic WebGPU backend instructions * Use EMSCRIPTEN flag * Work on passing ci, implement 4d tensor multiplication * Pass thread safety test * Implement permuting for mul_mat and cpy * minor cleanups * Address feedback * Remove division by type size in cpy op * Fix formatting and add github action workflows for vulkan and metal (m-series) webgpu backends * Fix name * Fix macos dawn prefix path --- ggml/CMakeLists.txt | 3 +++ ggml/include/ggml-webgpu.h | 19 +++++++++++++++++++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 +++++++ 4 files changed, 30 insertions(+) create mode 100644 ggml/include/ggml-webgpu.h diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index eaba9c70469..de6d789c98a 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -181,6 +181,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) +option(GGML_WEBGPU "ggml: use WebGPU" OFF) +option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) @@ -270,6 +272,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-webgpu.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h new file mode 100644 index 00000000000..65b8ed9bb66 --- /dev/null +++ b/ggml/include/ggml-webgpu.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_WEBGPU_NAME "WebGPU" + +// Needed for examples in ggml +GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8760c2d35ec..0425fd60a94 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -370,6 +370,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(WebGPU) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 042ea77aca7..f0cdac31eae 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -45,6 +45,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_WEBGPU +#include "ggml-webgpu.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -173,6 +177,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_VULKAN register_backend(ggml_backend_vk_reg()); #endif +#ifdef GGML_USE_WEBGPU + register_backend(ggml_backend_webgpu_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif From fed20b06824c8502cf534c4466389116bc935d1a Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Fri, 18 Jul 2025 10:23:14 +0800 Subject: [PATCH 0116/1348] use max work group size for device to replace the magic number (llama/14732) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index a6f9af0c86e..872eb4b052d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3530,8 +3530,11 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, SYCL_CHECK(CHECK_TRY_ERROR( stream->memset(dev_cur_src1_row.get(), 0, sizeof(int)))); + const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device]; + assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); + { - sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u)); + sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size)); sycl::range<3> grid_dims(1, n_ids, ids->ne[1]); sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor src1_row_acc(cgh); @@ -3575,7 +3578,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row); { - sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u)); + sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size)); sycl::range<3> grid_dims(1, 1, num_src1_rows); sycl_launch(stream, [&](sycl::handler & cgh) { const char *__restrict dst_contiguous_get = From 9a07cb064aa1f5b94b0868ee294b98a2e5b01b9a Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 18 Jul 2025 14:54:18 +0800 Subject: [PATCH 0117/1348] CUDA: set_rows + cpy.cu refactor (llama/14712) --- ggml/src/ggml-cuda/cpy-utils.cuh | 251 +++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/cpy.cu | 239 +---------------------------- ggml/src/ggml-cuda/ggml-cuda.cu | 5 +- ggml/src/ggml-cuda/set-rows.cu | 145 +++++++++++++++++- 4 files changed, 396 insertions(+), 244 deletions(-) create mode 100644 ggml/src/ggml-cuda/cpy-utils.cuh diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh new file mode 100644 index 00000000000..e7a0bd2f1a0 --- /dev/null +++ b/ggml/src/ggml-cuda/cpy-utils.cuh @@ -0,0 +1,251 @@ +#pragma once + +#include "ggml-common.h" + +static __device__ __forceinline__ void convert_f32_f32(const float * src, float * dst) { + *dst = *src; +} + +static __device__ __forceinline__ void convert_f32_f16(const float * src, half * dst) { + *dst = __float2half(*src); +} + +static __device__ __forceinline__ void convert_f32_bf16(const float * src, nv_bfloat16 * dst) { + *dst = *src; +} + +static __device__ __forceinline__ void convert_f16_f16(const half * src, half * dst) { + *dst = *src; +} + +static __device__ __forceinline__ void convert_f16_f32(const half * src, float * dst) { + *dst = *src; +} + +static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + +static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) { + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = x[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f/d : 0.0f; + + y->d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = x[0 + j]*id; + const float x1 = x[QK4_0/2 + j]*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f)); + + y->qs[j] = xi0; + y->qs[j] |= xi1 << 4; + } +} + +static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) { + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = x[j]; + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y->dm.x = d; + y->dm.y = vmin; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (x[0 + j] - vmin)*id; + const float x1 = (x[QK4_1/2 + j] - vmin)*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f)); + + y->qs[j] = xi0; + y->qs[j] |= xi1 << 4; + } +} + +static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) { + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK5_0; ++j) { + const float v = x[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + const float d = vmax / -16; + const float id = d ? 1.0f/d : 0.0f; + + y->d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0/2; ++j) { + const float x0 = x[0 + j]*id; + const float x1 = x[QK5_0/2 + j]*id; + + const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f)); + + y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + memcpy(y->qh, &qh, sizeof(qh)); +} + +static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) { + float min = x[0]; + float max = x[0]; + + for (int j = 1; j < QK5_1; ++j) { + const float v = x[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 1.0f/d : 0.0f; + + y->dm.x = d; + y->dm.y = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1/2; ++j) { + const float x0 = (x[0 + j] - min)*id; + const float x1 = (x[QK5_1/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); + } + memcpy(y->qh, &qh, sizeof(qh)); +} + +static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[j]; + amax = fmaxf(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[j]*id; + y->qs[j] = roundf(x0); + } +} + +static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) { + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_NL; ++j) { + const float v = x[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + float d = vmax / kvalues_iq4nl[0]; + const float id = d ? 1.0f/d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + const float x0 = x[0 + j]*id; + const float x1 = x[QK4_NL/2 + j]*id; + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); + y->qs[j] = xi0 | (xi1 << 4); + const float v0 = kvalues_iq4nl[xi0]; + const float v1 = kvalues_iq4nl[xi1]; + const float w0 = x[0 + j]*x[0 + j]; + const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j]; + sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j]; + sumq2 += w0*v0*v0 + w1*v1*v1; + } + + y->d = sumq2 > 0 ? sumqx/sumq2 : d; +} + +// Wrapper functions for cpy.cu compatibility +static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti); +} + +static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti); +} + +static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { + quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti); +} + +static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { + quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti); +} + +static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti); +} + +static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { + quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti); +} + +static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { + convert_f32_f32((const float *)cxi, (float *)cdsti); +} + +static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { + convert_f32_f16((const float *)cxi, (half *)cdsti); +} + +static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) { + convert_f32_bf16((const float *)cxi, (nv_bfloat16 *)cdsti); +} + +static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { + convert_f16_f16((const half *)cxi, (half *)cdsti); +} + +static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { + convert_f16_f32((const half *)cxi, (float *)cdsti); +} diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 2c55d2149b2..e7d0da08705 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -1,46 +1,12 @@ #include "cpy.cuh" #include "dequantize.cuh" +#include "cpy-utils.cuh" #ifdef GGML_USE_MUSA #include "ggml-musa/mudnn.cuh" #endif // GGML_USE_MUSA typedef void (*cpy_kernel_t)(const char * cx, char * cdst); -static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti; - - *dsti = *xi; -} - -static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - half * dsti = (half *) cdsti; - - *dsti = __float2half(*xi); -} - -static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { - const half * xi = (const half *) cxi; - half * dsti = (half *) cdsti; - - *dsti = *xi; -} - -static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { - const half * xi = (const half *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - template static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, @@ -71,29 +37,6 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in cpy_1(cx + x_offset, cdst + dst_offset); } -static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = fmaxf(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j]*id; - - dsti->qs[j] = roundf(x0); - } -} - static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { float * cdstf = (float *)(cdsti); @@ -106,139 +49,6 @@ static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { } } -static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_0/2 + j]*id; - - const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - if (v < vmin) vmin = v; - if (v > vmax) vmax = v; - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->dm.x = d; - dsti->dm.y = vmin; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (xi[0 + j] - vmin)*id; - const float x1 = (xi[QK4_1/2 + j] - vmin)*id; - - const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_0 * dsti = (block_q5_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK5_0; ++j) { - const float v = xi[j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - vmax = v; - } - } - - const float d = vmax / -16; - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK5_0/2 + j]*id; - - const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f)); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_1 * dsti = (block_q5_1 *) cdsti; - - float min = xi[0]; - float max = xi[0]; - - for (int j = 1; j < QK5_1; ++j) { - const float v = xi[j]; - min = v < min ? v : min; - max = v > max ? v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f/d : 0.0f; - - dsti->dm.x = d; - dsti->dm.y = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1/2; ++j) { - const float x0 = (xi[0 + j] - min)*id; - const float x1 = (xi[QK5_1/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - template static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) { float * cdstf = (float *)(cdsti); @@ -252,53 +62,6 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) { } } -static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) { - if (x <= val[0]) return 0; - if (x >= val[n-1]) return n-1; - int ml = 0, mu = n-1; - while (mu-ml > 1) { - int mav = (ml+mu)/2; - if (x < val[mav]) mu = mav; else ml = mav; - } - return x - val[mu-1] < val[mu] - x ? mu-1 : mu; -} - -static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_iq4_nl * dsti = (block_iq4_nl *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_NL; ++j) { - const float v = xi[j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - vmax = v; - } - } - - float d = vmax / kvalues_iq4nl[0]; - const float id = d ? 1.0f/d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_NL/2 + j]*id; - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); - dsti->qs[j] = xi0 | (xi1 << 4); - const float v0 = kvalues_iq4nl[xi0]; - const float v1 = kvalues_iq4nl[xi1]; - const float w0 = xi[0 + j]*xi[0 + j]; - const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j]; - sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j]; - sumq2 += w0*v0*v0 + w1*v1*v1; - } - - dsti->d = sumq2 > 0 ? sumqx/sumq2 : d; -} - template static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 778d5a48bd9..50a977c3076 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3226,8 +3226,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_SET_ROWS: { -#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)") - return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) && + return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || + op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 || + op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64; } break; diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 58cee924401..560604d095f 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -1,4 +1,5 @@ #include "set-rows.cuh" +#include "cpy-utils.cuh" typedef void (*set_rows_kernel_t)(const char * src, char * dst); @@ -10,17 +11,93 @@ __device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) { template<> __device__ __forceinline__ void set_rows_1(const float * src_f, half * dst_h) { - *dst_h = __float2half(*src_f); + convert_f32_f16(src_f, dst_h); } template<> __device__ __forceinline__ void set_rows_1(const float * src_f, nv_bfloat16 * dst_b) { - *dst_b = *src_f; + convert_f32_bf16(src_f, dst_b); } template<> __device__ __forceinline__ void set_rows_1(const float * src_f, float * dst_f) { - *dst_f = *src_f; + convert_f32_f32(src_f, dst_f); +} + +// Generic quantized set_rows kernel template +template +static __global__ void k_set_rows_quant( + const float * __restrict__ src0, const int64_t * __restrict__ src1, block_type * __restrict__ dst, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t s10, const int64_t s11, const int64_t s12, + const int64_t s1, const int64_t s2, const int64_t s3) { + + const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; + const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk; + + if (i >= ne_total) { + return; + } + + const int64_t i_base = i * qk; + const int64_t i03 = i_base / (ne00 * ne01 * ne02); + const int64_t i02 = (i_base - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int64_t i01 = (i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00; + const int64_t i00 = i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00; + + const int64_t i12 = i03 % ne12; + const int64_t i11 = i02 % ne11; + const int64_t i10 = i01; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + + const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; + block_type * dst_row_ptr = dst + (dst_row*s1 + i02*s2 + i03*s3) / sizeof(block_type); + + const float * src_block = src0_row + i00; + block_type * dst_block = dst_row_ptr + i00 / qk; + + quantize_func(src_block, dst_block); +} + +// Template dispatch function for quantized set_rows +template +static void set_rows_cuda_quant( + const float * src0_d, const int64_t * src1_d, block_type * dst_d, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + + GGML_ASSERT(ne00 % qk == 0); + const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk; + const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE; + const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE); + const dim3 grid_size(num_blocks); + + const int64_t s01 = nb01/sizeof(float); + const int64_t s02 = nb02/sizeof(float); + const int64_t s03 = nb03/sizeof(float); + const int64_t s10 = nb10/sizeof(int64_t); + const int64_t s11 = nb11/sizeof(int64_t); + const int64_t s12 = nb12/sizeof(int64_t); + const int64_t s1 = nb1; + const int64_t s2 = nb2; + const int64_t s3 = nb3; + + if (ne_total > 0) { + k_set_rows_quant<<>>( + src0_d, src1_d, dst_d, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + s01, s02, s03, + s10, s11, s12, + s1, s2, s3); + } } template @@ -145,7 +222,67 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { nb1, nb2, nb3, stream ); + } else if (dst->type == GGML_TYPE_Q4_0) { + set_rows_cuda_quant( + src0_d, src1_d, (block_q4_0*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_Q4_1) { + set_rows_cuda_quant( + src0_d, src1_d, (block_q4_1*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_Q5_0) { + set_rows_cuda_quant( + src0_d, src1_d, (block_q5_0*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_Q5_1) { + set_rows_cuda_quant( + src0_d, src1_d, (block_q5_1*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_Q8_0) { + set_rows_cuda_quant( + src0_d, src1_d, (block_q8_0*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_IQ4_NL) { + set_rows_cuda_quant( + src0_d, src1_d, (block_iq4_nl*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); } else { - GGML_ABORT("unsupported type"); + GGML_ABORT("unsupported type %s", ggml_type_name(dst->type)); } } From d4a7ea1634b0b07b52be9fb889d9f437177cc021 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Fri, 18 Jul 2025 13:35:32 +0200 Subject: [PATCH 0118/1348] cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs (llama/14741) * Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs Gemma3n uses Matrix-Matrix addition as part of their input processing, wrongly triggering CUDA_GRAPH disablement on NVGPUs even when batch-size of 1 is used. * Exclude `project_per_layer_input` by matching node names This ensures that all other graphs which don't exhibit this pattern do not have their behavior changed. * Revert unnecessary formatting changes --- ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 50a977c3076..dfc50ef0daf 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2590,6 +2590,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; + const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2611,9 +2614,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. + if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) { + // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation + // by means of matching node names. See + // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and + // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. use_cuda_graph = false; #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); From 0ed687c6f1a52f009d925396ed7d86c581feee7f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Jul 2025 20:37:26 +0300 Subject: [PATCH 0119/1348] metal : fuse add, mul + add tests (llama/14596) ggml-ci --- ggml/src/ggml-alloc.c | 15 -- ggml/src/ggml-backend.cpp | 15 -- ggml/src/ggml-impl.h | 16 ++ ggml/src/ggml-metal/ggml-metal-impl.h | 15 +- ggml/src/ggml-metal/ggml-metal.m | 364 +++++++++++++++++++++----- ggml/src/ggml-metal/ggml-metal.metal | 236 ++++++++++++++--- 6 files changed, 518 insertions(+), 143 deletions(-) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 5fd379f6a94..fcc552da519 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) { return t->view_src != NULL; } -static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != b->nb[i]) { - return false; - } - } - return true; -} - // ops that return true for this function must not use restrict pointers for their backend implementations static bool ggml_op_can_inplace(enum ggml_op op) { switch (op) { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 788861a365f..b7498b8d402 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -352,21 +352,6 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { // backend copy -static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != b->nb[i]) { - return false; - } - } - return true; -} - void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 4972558c98b..a2e30994c46 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -73,6 +73,22 @@ static inline int ggml_up(int n, int m) { return (n + m - 1) & ~(m - 1); } +// TODO: move to ggml.h? +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + // // logging // diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 752d55c2166..b7b3fc49af3 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -126,6 +126,7 @@ typedef struct { uint64_t nb2; uint64_t nb3; uint64_t offs; + uint64_t o1[8]; } ggml_metal_kargs_bin; typedef struct { @@ -240,7 +241,7 @@ typedef struct { float max_bias; float m0; float m1; - uint16_t n_head_log2; + int32_t n_head_log2; float logit_softcap; } ggml_metal_kargs_flash_attn_ext; @@ -377,8 +378,16 @@ typedef struct { typedef struct { int32_t ne00; int32_t ne00_4; - uint64_t nb01; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; float eps; + int32_t nef1[3]; + int32_t nef2[3]; + int32_t nef3[3]; + uint64_t nbf1[3]; + uint64_t nbf2[3]; + uint64_t nbf3[3]; } ggml_metal_kargs_rms_norm; typedef struct { @@ -484,7 +493,7 @@ typedef struct { float max_bias; float m0; float m1; - uint32_t n_head_log2; + int32_t n_head_log2; } ggml_metal_kargs_soft_max; typedef struct { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 44ddc69d08f..dc391a0d4d5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -55,6 +55,12 @@ bool has_residency_sets; bool has_bfloat; bool use_bfloat; + bool use_fusion; + + int debug_fusion; + + // how many times a given op was fused + uint64_t fuse_cnt[GGML_OP_COUNT]; size_t max_size; @@ -69,6 +75,9 @@ /*.has_residency_sets =*/ false, /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, + /*.use_fusion =*/ true, + /*.debug_fusion =*/ 0, + /*.fuse_cnt =*/ { 0 }, /*.max_size =*/ 0, /*.name =*/ "", }; @@ -83,16 +92,14 @@ if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); - } - if (ctx->mtl_device) { ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; #if defined(GGML_METAL_HAS_RESIDENCY_SETS) - ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL; + ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil; #endif ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -103,6 +110,14 @@ #else ctx->use_bfloat = false; #endif + ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil; + + { + const char * val = getenv("GGML_METAL_FUSION_DEBUG"); + ctx->debug_fusion = val ? atoi(val) : 0; + } + + memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt)); ctx->max_size = ctx->mtl_device.maxBufferLength; @@ -122,6 +137,18 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_device_ref_count--; if (ctx->mtl_device_ref_count == 0) { + if (ctx->debug_fusion > 0) { + fprintf(stderr, "%s: fusion stats:\n", __func__); + for (int i = 0; i < GGML_OP_COUNT; i++) { + if (ctx->fuse_cnt[i] == 0) { + continue; + } + + // note: cannot use ggml_log here + fprintf(stderr, "%s: - %s: %" PRIu64 "\n", __func__, ggml_op_name((enum ggml_op) i), ctx->fuse_cnt[i]); + } + } + if (ctx->mtl_lock) { [ctx->mtl_lock release]; ctx->mtl_lock = nil; @@ -147,13 +174,27 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_ADD, - GGML_METAL_KERNEL_TYPE_ADD_ROW, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_2, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_3, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_4, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_5, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_6, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_7, + GGML_METAL_KERNEL_TYPE_ADD_FUSE_8, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7, + GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8, GGML_METAL_KERNEL_TYPE_SUB, - GGML_METAL_KERNEL_TYPE_SUB_ROW, + GGML_METAL_KERNEL_TYPE_SUB_ROW_C4, GGML_METAL_KERNEL_TYPE_MUL, - GGML_METAL_KERNEL_TYPE_MUL_ROW, + GGML_METAL_KERNEL_TYPE_MUL_ROW_C4, GGML_METAL_KERNEL_TYPE_DIV, - GGML_METAL_KERNEL_TYPE_DIV_ROW, + GGML_METAL_KERNEL_TYPE_DIV_ROW_C4, GGML_METAL_KERNEL_TYPE_REPEAT_F32, GGML_METAL_KERNEL_TYPE_REPEAT_F16, GGML_METAL_KERNEL_TYPE_REPEAT_I32, @@ -218,6 +259,8 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_RMS_NORM, + GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL, + GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD, GGML_METAL_KERNEL_TYPE_L2_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, GGML_METAL_KERNEL_TYPE_NORM, @@ -1135,13 +1178,27 @@ @implementation GGMLMetalClass // simd_sum and simd_max requires MTLGPUFamilyApple7 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_2, add_fuse_2, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_3, add_fuse_3, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_4, add_fuse_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_5, add_fuse_5, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_6, add_fuse_6, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_7, add_fuse_7, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_8, add_fuse_8, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4, add_row_c4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2, add_row_c4_fuse_2, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3, add_row_c4_fuse_3, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4, add_row_c4_fuse_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5, add_row_c4_fuse_5, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6, add_row_c4_fuse_6, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7, add_row_c4_fuse_7, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8, add_row_c4_fuse_8, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW_C4, sub_row_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW_C4, mul_row_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW_C4, div_row_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true); @@ -1206,6 +1263,8 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, set_rows_q5_1, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL, rms_norm_mul, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD, rms_norm_mul_add, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); @@ -1893,7 +1952,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static bool ggml_metal_encode_node( +static int ggml_metal_encode_node( ggml_backend_t backend, int idx, id encoder, @@ -1903,7 +1962,10 @@ static bool ggml_metal_encode_node( struct ggml_cgraph * gf = ctx->gf; - struct ggml_tensor * node = ggml_graph_node(gf, idx); + enum ggml_op ops[8]; + + struct ggml_tensor ** nodes = ggml_graph_nodes(gf) + idx; + struct ggml_tensor * node = nodes[0]; //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op)); @@ -1913,7 +1975,7 @@ static bool ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return true; + return 1; } switch (dst->op) { @@ -1924,7 +1986,7 @@ static bool ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return true; + } return 1; default: { } break; @@ -1991,6 +2053,8 @@ static bool ggml_metal_encode_node( id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; id id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil; + int n_fuse = 1; + #if 0 GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); if (src0) { @@ -2062,37 +2126,15 @@ static bool ggml_metal_encode_node( GGML_ASSERT(src0t == GGML_TYPE_F32); GGML_ASSERT(src1t == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous_rows(src0)); + GGML_ASSERT(ggml_is_contiguous_rows(src1)); + const size_t offs = 0; bool bcast_row = false; id pipeline = nil; - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { - GGML_ASSERT(ggml_is_contiguous(src0)); - - // src1 is a row - GGML_ASSERT(ne11 == 1); - - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; - case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - bcast_row = true; - } else { - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; - case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ABORT("fatal error"); - } - } - ggml_metal_kargs_bin args = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, @@ -2119,12 +2161,117 @@ static bool ggml_metal_encode_node( /*.nb2 =*/ nb2, /*.nb3 =*/ nb3, /*.offs =*/ offs, + /*.o1 =*/ { offs_src1 }, }; + // c[0] = add(a, b[0]) + // c[1] = add(c[0], b[1]) + // c[2] = add(c[1], b[2]) + // ... + if (ctx_dev->use_fusion) { + ops[0] = GGML_OP_ADD; + ops[1] = GGML_OP_ADD; + ops[2] = GGML_OP_ADD; + ops[3] = GGML_OP_ADD; + ops[4] = GGML_OP_ADD; + ops[5] = GGML_OP_ADD; + ops[6] = GGML_OP_ADD; + ops[7] = GGML_OP_ADD; + + size_t offs_fuse; + id id_fuse; + + for (n_fuse = 0; n_fuse <= 6; ++n_fuse) { + if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) { + break; + } + + if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) { + break; + } + + // b[0] === b[1] === ... + if (!ggml_are_same_layout(nodes[n_fuse]->src[1], nodes[n_fuse + 1]->src[1])) { + break; + } + + // only fuse nodes if src1 is in the same Metal buffer + id_fuse = ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse); + if (id_fuse != id_src1) { + break; + } + + ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++; + + args.o1[n_fuse + 1] = offs_fuse; + } + + ++n_fuse; + + if (ctx_dev->debug_fusion > 1 && n_fuse > 1) { + GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse); + } + } + + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // src1 is a row + GGML_ASSERT(ne11 == 1); + + switch (dst->op) { + case GGML_OP_ADD: + { + switch (n_fuse) { + case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4 ].pipeline; break; + case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2].pipeline; break; + case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3].pipeline; break; + case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4].pipeline; break; + case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5].pipeline; break; + case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6].pipeline; break; + case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7].pipeline; break; + case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8].pipeline; break; + default: GGML_ABORT("fatal error"); + } + } break; + case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW_C4].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW_C4].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW_C4].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + bcast_row = true; + } else { + switch (dst->op) { + case GGML_OP_ADD: + { + switch (n_fuse) { + case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD ].pipeline; break; + case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_2].pipeline; break; + case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_3].pipeline; break; + case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_4].pipeline; break; + case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_5].pipeline; break; + case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_6].pipeline; break; + case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_7].pipeline; break; + case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_8].pipeline; break; + default: GGML_ABORT("fatal error"); + } + } break; + case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; + default: GGML_ABORT("fatal error"); + } + } + + if (n_fuse > 1) { + id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst); + } + [encoder setComputePipelineState:pipeline]; [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_src1 offset:0 atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; if (bcast_row) { @@ -2132,7 +2279,11 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } else { - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + int nth = 32; + + while (16*nth < ne0 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } @@ -2257,12 +2408,13 @@ static bool ggml_metal_encode_node( /*.nb2 =*/ pnb2, /*.nb3 =*/ pnb3, /*.offs =*/ offs, + /*.o1 =*/ { offs_src1}, }; [encoder setComputePipelineState:pipeline]; [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_src1 offset:0 atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); @@ -2764,7 +2916,7 @@ static bool ggml_metal_encode_node( id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); if (!h_src0) { GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); - return false; + return 0; } offs_src0 = 0; @@ -3640,7 +3792,7 @@ static bool ggml_metal_encode_node( id h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1); if (!h_src1) { GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1); - return false; + return 0; } const int64_t neh0 = ne0; @@ -3656,7 +3808,7 @@ static bool ggml_metal_encode_node( id h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst); if (!h_dst) { GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst); - return false; + return 0; } // tokens per expert @@ -3664,7 +3816,7 @@ static bool ggml_metal_encode_node( id h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe); if (!h_tpe) { GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe); - return false; + return 0; } // id map @@ -3673,7 +3825,7 @@ static bool ggml_metal_encode_node( id h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids); if (!h_ids) { GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids); - return false; + return 0; } { @@ -4105,12 +4257,95 @@ static bool ggml_metal_encode_node( case GGML_OP_RMS_NORM: { GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_rows(src0)); float eps; memcpy(&eps, dst->op_params, sizeof(float)); - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; + ggml_metal_kargs_rms_norm args = { + /*.ne00 =*/ ne00, + /*.ne00_4 =*/ ne00/4, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + /*.eps =*/ eps, + /*.nef1 =*/ { ne01 }, + /*.nef2 =*/ { ne02 }, + /*.nef3 =*/ { ne03 }, + /*.nbf1 =*/ { nb01 }, + /*.nbf2 =*/ { nb02 }, + /*.nbf3 =*/ { nb03 }, + }; + + size_t offs_fuse[2] = { 0, 0 }; + id id_fuse[2] = { id_src0, id_src0 }; + + // d[0] = rms_norm(a) + // d[1] = mul(d[0], b) + // d[2] = add(d[1], c) + if (ctx_dev->use_fusion) { + ops[0] = GGML_OP_RMS_NORM; + ops[1] = GGML_OP_MUL; + ops[2] = GGML_OP_ADD; + + for (n_fuse = 0; n_fuse <= 1; ++n_fuse) { + if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) { + break; + } + + if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) { + break; + } + + if (nodes[n_fuse + 1]->src[1]->ne[0] != node->ne[0]) { + break; + } + + if (!ggml_is_contiguous_rows(nodes[n_fuse + 1]->src[1])) { + break; + } + + if (nodes[n_fuse + 1]->type != GGML_TYPE_F32) { + break; + } + + ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++; + + id_fuse[n_fuse] = ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse[n_fuse]); + + args.nef1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[1]; + args.nef2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[2]; + args.nef3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[3]; + + args.nbf1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[1]; + args.nbf2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[2]; + args.nbf3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[3]; + } + + ++n_fuse; + + if (ctx_dev->debug_fusion > 1 && n_fuse > 1) { + if (n_fuse == 2) { + GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__); + } + if (n_fuse == 3) { + GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__); + } + } + } + + if (n_fuse > 1) { + id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst); + } + + id pipeline; + + switch (n_fuse) { + case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM ].pipeline; break; + case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL ].pipeline; break; + case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD].pipeline; break; + default: GGML_ABORT("unsupported n_fuse = %d\n", n_fuse); + } int nth = 32; // SIMD width @@ -4121,23 +4356,16 @@ static bool ggml_metal_encode_node( nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); - ggml_metal_kargs_rms_norm args = { - /*.ne00 =*/ ne00, - /*.ne00_4 =*/ ne00/4, - /*.nb01 =*/ nb01, - /*.eps =*/ eps, - }; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_fuse[0] offset:offs_fuse[0] atIndex:2]; + [encoder setBuffer:id_fuse[1] offset:offs_fuse[1] atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - const int64_t nrows = ggml_nrows(src0); - - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_L2_NORM: { @@ -5532,7 +5760,7 @@ static bool ggml_metal_encode_node( } } - return true; + return n_fuse; } static enum ggml_status ggml_metal_graph_compute( @@ -6038,20 +6266,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; ggml_metal_mem_pool_reset(mem_pool); - for (int idx = node_start; idx < node_end; ++idx) { + for (int idx = node_start; idx < node_end;) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); + const int res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } - if (!res) { + if (res == 0) { break; } + + idx += res; } [encoder endEncoding]; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 13235e28852..f62b9ad548e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -832,7 +832,8 @@ enum ggml_sort_order { // general-purpose kernel for addition, subtraction, multiplication and division of two tensors // pros: works for non-contiguous tensors, supports broadcast across all dims // cons: not very efficient -kernel void kernel_add( +template +kernel void kernel_add_fuse_impl( constant ggml_metal_kargs_bin & args, device const char * src0, device const char * src1, @@ -848,16 +849,39 @@ kernel void kernel_add( const int i12 = i02%args.ne12; const int i11 = i01%args.ne11; - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; + device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); + device float * dst_ptr = (device float *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); + + device const float * src1_ptr[F]; + for (short j = 0; j < F; ++j) { + src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + } for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) + *((device float *)(src1_ptr + i10*args.nb10)); + + float res = src0_ptr[i0]; + +#pragma unroll + for (short j = 0; j < F; ++j) { + res += src1_ptr[j][i10]; + } + + dst_ptr[i0] = res; } } +typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t; + +template [[host_name("kernel_add")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>; +template [[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>; +template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>; +template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>; +template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>; +template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>; +template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>; +template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>; + kernel void kernel_sub( constant ggml_metal_kargs_bin & args, device const char * src0, @@ -875,7 +899,7 @@ kernel void kernel_sub( const int i11 = i01%args.ne11; device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11; + device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { @@ -900,9 +924,9 @@ kernel void kernel_mul( const int i12 = i02%args.ne12; const int i11 = i01%args.ne11; - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1; + device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; + device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; + device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { const int i10 = i0%args.ne10; @@ -926,9 +950,9 @@ kernel void kernel_div( const int i12 = i02%args.ne12; const int i11 = i01%args.ne11; - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1; + device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; + device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; + device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { const int i10 = i0%args.ne10; @@ -970,46 +994,145 @@ template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat // assumption: src1 is a row // broadcast src1 into src0 -kernel void kernel_add_row( +template +kernel void kernel_add_row_c4_fuse_impl( constant ggml_metal_kargs_bin & args, - device const float4 * src0, - device const float4 * src1, - device float4 * dst, + device const char * src0, + device const char * src1, + device char * dst, uint tpig[[thread_position_in_grid]]) { + const uint nb = args.ne00/4; - dst[tpig] = src0[tpig] + src1[tpig % nb]; + const uint i = tpig % nb; + + device const float4 * src0_row = (device const float4 *) (src0); + device float4 * dst_row = (device float4 *) (dst); + + device const float4 * src1_row[F]; + for (short j = 0; j < F; ++j) { + src1_row[j] = (device const float4 *) (src1 + args.o1[j]); + } + + float4 res = src0_row[tpig]; + +#pragma unroll(F) + for (short j = 0; j < F; ++j) { + res += src1_row[j][i]; + } + + dst_row[tpig] = res; } -kernel void kernel_sub_row( +typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t; + +template [[host_name("kernel_add_row_c4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>; +template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>; +template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>; +template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>; +template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>; +template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>; +template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>; +template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>; + +template +kernel void kernel_sub_row_c4_fuse_impl( constant ggml_metal_kargs_bin & args, - device const float4 * src0, - device const float4 * src1, - device float4 * dst, + device const char * src0, + device const char * src1, + device char * dst, uint tpig[[thread_position_in_grid]]) { + const uint nb = args.ne00/4; - dst[tpig] = src0[tpig] - src1[tpig % nb]; + const uint i = tpig % nb; + + device const float4 * src0_row = (device const float4 *) (src0); + device float4 * dst_row = (device float4 *) (dst); + + device const float4 * src1_row[F]; + for (short j = 0; j < F; ++j) { + src1_row[j] = (device const float4 *) (src1 + args.o1[j]); + } + + float4 res = src0_row[tpig]; + +#pragma unroll(F) + for (short j = 0; j < F; ++j) { + res -= src1_row[j][i]; + } + + dst_row[tpig] = res; } -kernel void kernel_mul_row( +typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t; + +template [[host_name("kernel_sub_row_c4")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>; + +template +kernel void kernel_mul_row_c4_fuse_impl( constant ggml_metal_kargs_bin & args, - device const float4 * src0, - device const float4 * src1, - device float4 * dst, + device const char * src0, + device const char * src1, + device char * dst, uint tpig[[thread_position_in_grid]]) { + const uint nb = args.ne00/4; - dst[tpig] = src0[tpig] * src1[tpig % nb]; + const uint i = tpig % nb; + + device const float4 * src0_row = (device const float4 *) (src0); + device float4 * dst_row = (device float4 *) (dst); + + device const float4 * src1_row[F]; + for (short j = 0; j < F; ++j) { + src1_row[j] = (device const float4 *) (src1 + args.o1[j]); + } + + float4 res = src0_row[tpig]; + +#pragma unroll(F) + for (short j = 0; j < F; ++j) { + res *= src1_row[j][i]; + } + + dst_row[tpig] = res; } -kernel void kernel_div_row( +typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t; + +template [[host_name("kernel_mul_row_c4")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>; + +template +kernel void kernel_div_row_c4_fuse_impl( constant ggml_metal_kargs_bin & args, - device const float4 * src0, - device const float4 * src1, - device float4 * dst, + device const char * src0, + device const char * src1, + device char * dst, uint tpig[[thread_position_in_grid]]) { + const uint nb = args.ne00/4; - dst[tpig] = src0[tpig] / src1[tpig % nb]; + const uint i = tpig % nb; + + device const float4 * src0_row = (device const float4 *) (src0); + device float4 * dst_row = (device float4 *) (dst); + + device const float4 * src1_row[F]; + for (short j = 0; j < F; ++j) { + src1_row[j] = (device const float4 *) (src1 + args.o1[j]); + } + + float4 res = src0_row[tpig]; + +#pragma unroll(F) + for (short j = 0; j < F; ++j) { + res /= src1_row[j][i]; + } + + dst_row[tpig] = res; } +typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t; + +template [[host_name("kernel_div_row_c4")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>; + kernel void kernel_scale( device const float * src0, device float * dst, @@ -2116,26 +2239,39 @@ kernel void kernel_norm( } } -kernel void kernel_rms_norm( +// F == 1 : rms_norm (no fuse) +// F == 2 : rms_norm + mul +// F == 3 : rms_norm + mul + add +template +kernel void kernel_rms_norm_fuse_impl( constant ggml_metal_kargs_rms_norm & args, device const char * src0, + device const char * src1_0, + device const char * src1_1, device char * dst, threadgroup float * shmem_f32 [[threadgroup(0)]], - uint tgpig[[threadgroup_position_in_grid]], - ushort tpitg[[thread_position_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort ntg[[threads_per_threadgroup]]) { + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { if (sgitg == 0) { shmem_f32[tiisg] = 0.0f; } - device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01); + const int i01 = tgpig.x; + const int i02 = tgpig.y; + const int i03 = tgpig.z; + + device const float4 * x = (device const float4 *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]); + + device const float4 * f0 = (device const float4 *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]); + device const float4 * f1 = (device const float4 *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]); float sumf = 0.0f; // parallel sum - for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) { + for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) { sumf += dot(x[i00], x[i00]); } sumf = simd_sum(sumf); @@ -2154,12 +2290,26 @@ kernel void kernel_rms_norm( const float mean = sumf/args.ne00; const float scale = 1.0f/sqrt(mean + args.eps); - device float4 * y = (device float4 *) dst + tgpig*args.ne00_4; - for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) { - y[i00] = x[i00] * scale; + device float4 * y = (device float4 *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1); + for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) { + if (F == 1) { + y[i00] = (x[i00]*scale); + } + if (F == 2) { + y[i00] = (x[i00]*scale)*f0[i00]; + } + if (F == 3) { + y[i00] = (x[i00]*scale)*f0[i00] + f1[i00]; + } } } +typedef decltype(kernel_rms_norm_fuse_impl<1>) kernel_rms_norm_fuse_t; + +template [[host_name("kernel_rms_norm")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<1>; +template [[host_name("kernel_rms_norm_mul")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<2>; +template [[host_name("kernel_rms_norm_mul_add")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<3>; + kernel void kernel_l2_norm( constant ggml_metal_kargs_l2_norm & args, device const char * src0, From c0dc39134904610778ac3c36554a5ba94531498a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 19 Jul 2025 17:48:07 +0300 Subject: [PATCH 0120/1348] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index ca009adb83b..9b223827afb 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -d62df60a07ba3deeb85e5cfc9b1ee07645ff35e2 +a0361ace408ba2c30820deb39e793ad9ed787a85 From 2e6be2f38086e6c0930e94731b8a964c6579e465 Mon Sep 17 00:00:00 2001 From: BVK Chaitanya Date: Mon, 21 Jul 2025 01:47:35 -0500 Subject: [PATCH 0121/1348] go: fix Mac OS X builds (#3310) This commit fixes Go bindings build failure for Mac OS X (15.1) which is currently failing. Co-authored-by: Chaitanya Bayapuneni --- bindings/go/Makefile | 5 +++-- bindings/go/whisper.go | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bindings/go/Makefile b/bindings/go/Makefile index edcc0166b74..e4436a6a291 100644 --- a/bindings/go/Makefile +++ b/bindings/go/Makefile @@ -15,7 +15,7 @@ BUILD_DIR := build_go MODELS_DIR := models EXAMPLES_DIR := $(wildcard examples/*) INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include) -LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src:$(abspath ../../${BUILD_DIR}/ggml/src)) +LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src):$(abspath ../../${BUILD_DIR}/ggml/src) ifeq ($(GGML_CUDA),1) LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/ @@ -23,7 +23,8 @@ ifeq ($(GGML_CUDA),1) endif ifeq ($(UNAME_S),Darwin) - EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit + LIBRARY_PATH := $(LIBRARY_PATH):$(abspath ../../${BUILD_DIR}/ggml/src/ggml-blas):$(abspath ../../${BUILD_DIR}/ggml/src/ggml-metal) + EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit -lggml-metal -lggml-blas endif all: clean whisper examples diff --git a/bindings/go/whisper.go b/bindings/go/whisper.go index 525b72d2318..3ef73414d90 100644 --- a/bindings/go/whisper.go +++ b/bindings/go/whisper.go @@ -9,7 +9,9 @@ import ( // CGO /* -#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ -fopenmp +#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ +#cgo linux LDFLAGS: -fopenmp +#cgo darwin LDFLAGS: -lggml-metal -lggml-blas #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics #include #include From 1f5cf0b2888402d57bb17b2029b2caa97e5f3baf Mon Sep 17 00:00:00 2001 From: Sacha Arbonel Date: Mon, 21 Jul 2025 13:03:54 +0200 Subject: [PATCH 0122/1348] server : hide language probabilities option behind flag (#3328) * examples/server: hide language probabilities option behind flag * code review * fix --- examples/server/server.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 643d08a799a..901f65f6c35 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -104,6 +104,7 @@ struct whisper_params { bool flash_attn = false; bool suppress_nst = false; bool no_context = false; + bool no_language_probabilities = false; std::string language = "en"; std::string prompt = ""; @@ -178,6 +179,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false"); fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false"); + fprintf(stderr, " -nlp, --no-language-probabilities [%-7s] exclude language probabilities from verbose_json output\n", params.no_language_probabilities ? "true" : "false"); // Voice Activity Detection (VAD) parameters fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n"); fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false"); @@ -237,6 +239,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; } else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(argv[++i]); } else if (arg == "-nc" || arg == "--no-context") { params.no_context = true; } + else if (arg == "-nlp" || arg == "--no-language-probabilities") { params.no_language_probabilities = true; } // server params else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); } @@ -599,6 +602,10 @@ void get_req_parameters(const Request & req, whisper_params & params) { params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content); } + if (req.has_file("no_language_probabilities")) + { + params.no_language_probabilities = parse_str_to_bool(req.get_file_value("no_language_probabilities").content); + } } } // namespace @@ -1024,23 +1031,25 @@ int main(int argc, char ** argv) { } else if (params.response_format == vjson_format) { /* try to match openai/whisper's Python format */ std::string results = output_str(ctx, params, pcmf32s); - // Get language probabilities - std::vector lang_probs(whisper_lang_max_id() + 1, 0.0f); - const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data()); json jres = json{ {"task", params.translate ? "translate" : "transcribe"}, {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))}, {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE}, {"text", results}, - {"segments", json::array()}, - {"detected_language", whisper_lang_str_full(detected_lang_id)}, - {"detected_language_probability", lang_probs[detected_lang_id]}, - {"language_probabilities", json::object()} + {"segments", json::array()} }; - // Add all language probabilities - for (int i = 0; i <= whisper_lang_max_id(); ++i) { - if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities - jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i]; + // Only compute language probabilities if requested (expensive operation) + if (!params.no_language_probabilities) { + std::vector lang_probs(whisper_lang_max_id() + 1, 0.0f); + const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data()); + jres["detected_language"] = whisper_lang_str_full(detected_lang_id); + jres["detected_language_probability"] = lang_probs[detected_lang_id]; + jres["language_probabilities"] = json::object(); + // Add all language probabilities + for (int i = 0; i <= whisper_lang_max_id(); ++i) { + if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities + jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i]; + } } } const int n_segments = whisper_full_n_segments(ctx); From 210bbbe4d578f3bd100403a19d1b464b0f50effc Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Thu, 24 Jul 2025 18:19:57 +0800 Subject: [PATCH 0123/1348] musa: upgrade musa sdk to rc4.2.0 (#3324) * musa: upgrade musa sdk to 4.2.0 Signed-off-by: Xiaodong Ye * musa: restore rc in docker image tag Signed-off-by: Xiaodong Ye --------- Signed-off-by: Xiaodong Ye --- .devops/main-musa.Dockerfile | 6 +++--- README.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.devops/main-musa.Dockerfile b/.devops/main-musa.Dockerfile index 0c9be7c3e6e..026791e3f89 100644 --- a/.devops/main-musa.Dockerfile +++ b/.devops/main-musa.Dockerfile @@ -1,10 +1,10 @@ ARG UBUNTU_VERSION=22.04 # This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc4.0.1 +ARG MUSA_VERSION=rc4.2.0 # Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION} +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 # Target the MUSA runtime image -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION} +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 FROM ${BASE_MUSA_DEV_CONTAINER} AS build WORKDIR /app diff --git a/README.md b/README.md index 6b81a54f7d8..e6c07bbcb00 100644 --- a/README.md +++ b/README.md @@ -386,7 +386,7 @@ Run the inference examples as usual, for example: ## Moore Threads GPU support With Moore Threads cards the processing of the models is done efficiently on the GPU via muBLAS and custom MUSA kernels. -First, make sure you have installed `MUSA SDK rc4.0.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=4.0.1 +First, make sure you have installed `MUSA SDK rc4.2.0`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=4.2.0 Now build `whisper.cpp` with MUSA support: From 85e474fd557aae2c151773d6d84228c912c3d295 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 24 Jul 2025 16:04:21 +0200 Subject: [PATCH 0124/1348] ci : add paths to build.yml (#3333) This commit adds specific paths to the GitHub Actions workflow file `.github/workflows/build.yml`. The motivation for this to avoid unnecessary builds when unrelated files are changed, which can save resources and time during the CI process. Refs: https://github.com/ggml-org/whisper.cpp/issues/3285 --- .github/workflows/build.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0e80bdfae78..565f1985dd7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,6 +6,25 @@ on: - master tags: - 'v*' + paths: ['.github/workflows/build.yml', + '**/CMakeLists.txt', + '**/Makefile', + '**/*.mk', + '**/*.cmake', + '**/*.in', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.cl', + '**/*.swift', + '**/*.m', + '**/*.mm', + '**/*.metal', + '**/*.comp', + '**/*.java'] pull_request: types: [opened, synchronize, reopened] workflow_dispatch: From 7de8dd783f7b2eab56bff6bbc5d3369e34f0e77f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 24 Jul 2025 16:06:48 +0200 Subject: [PATCH 0125/1348] examples : add note about WHISPER_WASM_SINGLE_FILE [no ci] (#3332) This commit adds a note to the README files of the WASM examples about the `WHISPER_WASM_SINGLE_FILE` option. The motivation for this is that currently this option is not documented and might be surprising to users who expect a separate .wasm file to be generated. Refs: https://github.com/ggml-org/whisper.cpp/issues/3290 --- examples/bench.wasm/README.md | 10 ++++++++++ examples/command.wasm/README.md | 10 ++++++++++ examples/stream.wasm/README.md | 10 ++++++++++ examples/whisper.wasm/README.md | 10 ++++++++++ 4 files changed, 40 insertions(+) diff --git a/examples/bench.wasm/README.md b/examples/bench.wasm/README.md index 1a16a11018f..0ebf0a21f2d 100644 --- a/examples/bench.wasm/README.md +++ b/examples/bench.wasm/README.md @@ -32,6 +32,16 @@ cp bin/libbench.js /path/to/html/ cp bin/libbench.worker.js /path/to/html/ ``` +> 📝 **Note:** By default this example is built with `WHISPER_WASM_SINGLE_FILE=ON` +> which means that that a separate .wasm file will not be generated. Instead, the +> WASM module is embedded in the main JS file as a base64 encoded string. To +> generate a separate .wasm file, you need to disable this option by passing +> `-DWHISPER_WASM_SINGLE_FILE=OFF`: +> ```console +> emcmake cmake .. -DWHISPER_WASM_SINGLE_FILE=OFF +> ``` +> This will generate a `libbench.wasm` file in the build/bin directory. + > 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no > longer generated and the worker is embedded in the main JS file. So the worker > file will not be geneated for versions later than `3.1.58`. diff --git a/examples/command.wasm/README.md b/examples/command.wasm/README.md index b46b89e2043..c50c2b43ed5 100644 --- a/examples/command.wasm/README.md +++ b/examples/command.wasm/README.md @@ -32,6 +32,16 @@ cp bin/libcommand.js /path/to/html/ cp bin/libcommand.worker.js /path/to/html/ ``` +> 📝 **Note:** By default this example is built with `WHISPER_WASM_SINGLE_FILE=ON` +> which means that that a separate .wasm file will not be generated. Instead, the +> WASM module is embedded in the main JS file as a base64 encoded string. To +> generate a separate .wasm file, you need to disable this option by passing +> `-DWHISPER_WASM_SINGLE_FILE=OFF`: +> ```console +> emcmake cmake .. -DWHISPER_WASM_SINGLE_FILE=OFF +> ``` +> This will generate a `libcommand.wasm` file in the build/bin directory. + > 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no > longer generated and the worker is embedded in the main JS file. So the worker > file will not be geneated for versions later than `3.1.58`. diff --git a/examples/stream.wasm/README.md b/examples/stream.wasm/README.md index 29ff982d617..431555655ac 100644 --- a/examples/stream.wasm/README.md +++ b/examples/stream.wasm/README.md @@ -30,6 +30,16 @@ cp bin/libstream.js /path/to/html/ cp bin/libstream.worker.js /path/to/html/ ``` +> 📝 **Note:** By default this example is built with `WHISPER_WASM_SINGLE_FILE=ON` +> which means that that a separate .wasm file will not be generated. Instead, the +> WASM module is embedded in the main JS file as a base64 encoded string. To +> generate a separate .wasm file, you need to disable this option by passing +> `-DWHISPER_WASM_SINGLE_FILE=OFF`: +> ```console +> emcmake cmake .. -DWHISPER_WASM_SINGLE_FILE=OFF +> ``` +> This will generate a `libstream.wasm` file in the build/bin directory. + > 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no > longer generated and the worker is embedded in the main JS file. So the worker > file will not be geneated for versions later than `3.1.58`. diff --git a/examples/whisper.wasm/README.md b/examples/whisper.wasm/README.md index b267d3d242b..da629c5f3eb 100644 --- a/examples/whisper.wasm/README.md +++ b/examples/whisper.wasm/README.md @@ -52,6 +52,16 @@ cp bin/libmain.js /path/to/html/ cp bin/libmain.worker.js /path/to/html/ ``` +> 📝 **Note:** By default this example is built with `WHISPER_WASM_SINGLE_FILE=ON` +> which means that that a separate .wasm file will not be generated. Instead, the +> WASM module is embedded in the main JS file as a base64 encoded string. To +> generate a separate .wasm file, you need to disable this option by passing +> `-DWHISPER_WASM_SINGLE_FILE=OFF`: +> ```console +> emcmake cmake .. -DWHISPER_WASM_SINGLE_FILE=OFF +> ``` +> This will generate a `libmain.wasm` file in the build/bin directory. + > 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no > longer generated and the worker is embedded in the main JS file. So the worker > file will not be geneated for versions later than `3.1.58`. From e7bf0294ec9099b5fc21f5ba969805dfb2108cea Mon Sep 17 00:00:00 2001 From: Rich Waters Date: Sat, 26 Jul 2025 03:25:44 -0700 Subject: [PATCH 0126/1348] Support static xcframework packaging in build-xcframework.sh (#3322) * This commit allows for the building of a static xcframework by adding a BUILD_STATIC_XCFRAMEWORK option. When enabled, the build-xcframework.sh script builds a self-contained static whisper.xcframework. The motivation for this change is so that command line binaries can link whisper.cpp without forcing users to install the whisper.xcframework separately. * Update build-xcframework.sh Co-authored-by: Daniel Bevenius * Address reviewer feedback: remove extra indentation around static xcframework creation. * squash! Address reviewer feedback: remove extra indentation around static xcframework creation. Fix whitespaces. --------- Co-authored-by: Daniel Bevenius --- build-xcframework.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/build-xcframework.sh b/build-xcframework.sh index 337e1936a56..bbf2764d729 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -15,6 +15,7 @@ GGML_METAL_EMBED_LIBRARY=ON GGML_BLAS_DEFAULT=ON GGML_METAL_USE_BF16=ON GGML_OPENMP=OFF +BUILD_STATIC_XCFRAMEWORK=${BUILD_STATIC_XCFRAMEWORK:-OFF} COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" @@ -327,6 +328,15 @@ combine_static_libraries() { arch_flags+=" -arch $arch" done + + if [[ "${BUILD_STATIC_XCFRAMEWORK}" == "ON" ]]; then + echo "Packaging static framework for ${platform}." + mkdir -p "$(dirname "${base_dir}/${output_lib}")" + cp "${temp_dir}/combined.a" "${base_dir}/${output_lib}" + rm -rf "${temp_dir}" + return + fi + # Create dynamic library echo "Creating dynamic library for ${platform}." xcrun -sdk $sdk clang++ -dynamiclib \ @@ -529,6 +539,20 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" # Create XCFramework with correct debug symbols paths echo "Creating XCFramework..." + +if [[ "${BUILD_STATIC_XCFRAMEWORK}" == "ON" ]]; then + xcodebuild -create-xcframework \ + -framework $(pwd)/build-ios-sim/framework/whisper.framework \ + -framework $(pwd)/build-ios-device/framework/whisper.framework \ + -framework $(pwd)/build-macos/framework/whisper.framework \ + -framework $(pwd)/build-visionos/framework/whisper.framework \ + -framework $(pwd)/build-visionos-sim/framework/whisper.framework \ + -framework $(pwd)/build-tvos-device/framework/whisper.framework \ + -framework $(pwd)/build-tvos-sim/framework/whisper.framework \ + -output $(pwd)/build-apple/whisper.xcframework + exit 0 +fi + xcodebuild -create-xcframework \ -framework $(pwd)/build-ios-sim/framework/whisper.framework \ -debug-symbols $(pwd)/build-ios-sim/dSYMs/whisper.dSYM \ From e238dc1bdd07cef9fd1069ae2f585beb205d7b43 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 21 Jul 2025 15:53:12 +0200 Subject: [PATCH 0127/1348] ggml-cpu : remove stdlib include from repack.cpp (ggml/1276) This commit removes the inclusion of ``. The motivation for this change is that this source file does not seem to use any functions from this header and the comment about `qsort` is a little misleading/confusing. --- ggml/src/ggml-cpu/repack.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 72ee93a5abc..74c1c029b94 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -14,7 +14,6 @@ #include #include #include -#include // for qsort #include // for GGML_ASSERT #include "repack.h" From 5c3b794c51c8de94af8fe0d54517d65a4eb2f64c Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Tue, 22 Jul 2025 20:13:21 +0200 Subject: [PATCH 0128/1348] cmake : fix usage issues (ggml/1257) * CMake config: Create target only once Fix error on repeated find_package(ggml). For simplicity, check only for the top-level ggml::ggml. * CMake config: Add CUDA link libs * CMake config: Add OpenCL link libs * CMake config: Use canonical find_dependency Use set and append to control link lib variables. Apply more $. * CMake config: Wire OpenMP dependency --- ggml/cmake/ggml-config.cmake.in | 132 ++++++++++++++++++++----------- ggml/src/ggml-cpu/CMakeLists.txt | 2 + 2 files changed, 87 insertions(+), 47 deletions(-) diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index 8c2dc31c6da..48704352cf4 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -1,94 +1,130 @@ - -@GGML_VARIABLES_EXPANDED@ - @PACKAGE_INIT@ -set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@") -set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@") -#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@") - -find_package(Threads REQUIRED) - -find_library(GGML_LIBRARY ggml - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - -add_library(ggml::ggml UNKNOWN IMPORTED) -set_target_properties(ggml::ggml - PROPERTIES - IMPORTED_LOCATION "${GGML_LIBRARY}") - -find_library(GGML_BASE_LIBRARY ggml-base - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - -add_library(ggml::ggml-base UNKNOWN IMPORTED) -set_target_properties(ggml::ggml-base - PROPERTIES - IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") +@GGML_VARIABLES_EXPANDED@ +# Find all dependencies before creating any target. +include(CMakeFindDependencyMacro) +find_dependency(Threads) if (NOT GGML_SHARED_LIB) + set(GGML_CPU_INTERFACE_LINK_LIBRARIES "") + set(GGML_CPU_INTERFACE_LINK_OPTIONS "") + if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if(NOT ACCELERATE_FRAMEWORK) + set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0) + return() + endif() list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK}) endif() - if (GGML_OPENMP) - find_package(OpenMP REQUIRED) + if (GGML_OPENMP_ENABLED) + find_dependency(OpenMP) list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX) endif() if (GGML_CPU_HBM) - find_library(memkind memkind REQUIRED) + find_library(memkind memkind) + if(NOT memkind) + set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0) + return() + endif() list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind) endif() if (GGML_BLAS) - find_package(BLAS REQUIRED) + find_dependency(BLAS) list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES}) list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS}) endif() if (GGML_CUDA) - find_package(CUDAToolkit REQUIRED) + set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "") + find_dependency(CUDAToolkit) + if (GGML_STATIC) + list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $) + if (WIN32) + list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $ $) + else() + list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $ $) + endif() + endif() + if (NOT GGML_CUDA_NO_VMM) + list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $) + endif() endif() if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(FOUNDATION_LIBRARY Foundation) + find_library(METAL_FRAMEWORK Metal) + find_library(METALKIT_FRAMEWORK MetalKit) + if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK) + set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0) + return() + endif() + set(GGML_METAL_INTERFACE_LINK_LIBRARIES + ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + endif() - list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES - ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + if (GGML_OPENCL) + find_dependency(OpenCL) + set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $) endif() if (GGML_VULKAN) - find_package(Vulkan REQUIRED) - list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan) + find_dependency(Vulkan) + set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $) endif() if (GGML_HIP) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas) + find_dependency(hip) + find_dependency(hipblas) + find_dependency(rocblas) + set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas) endif() if (GGML_SYCL) + set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "") find_package(DNNL) if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl) endif() if (WIN32) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) + find_dependency(IntelSYCL) + find_dependency(MKL) list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) endif() endif() endif() +set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@") +set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@") +#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@") + +if(NOT TARGET ggml::ggml) + +find_package(Threads REQUIRED) + +find_library(GGML_LIBRARY ggml + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) + +add_library(ggml::ggml UNKNOWN IMPORTED) +set_target_properties(ggml::ggml + PROPERTIES + IMPORTED_LOCATION "${GGML_LIBRARY}") + +find_library(GGML_BASE_LIBRARY ggml-base + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) + +add_library(ggml::ggml-base UNKNOWN IMPORTED) +set_target_properties(ggml::ggml-base + PROPERTIES + IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") + set(_ggml_all_targets "") foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") @@ -149,4 +185,6 @@ set_target_properties(ggml::all PROPERTIES INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}") +endif() # TARGET ggml::ggml + check_required_components(ggml) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 66a5ad8d2ed..13f745b2062 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -70,10 +70,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_OPENMP) find_package(OpenMP) if (OpenMP_FOUND) + set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "") target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) else() + set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") message(WARNING "OpenMP not found") endif() endif() From b06f314667ec497becf3e9be6ac996b8282e7d83 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 19 Jul 2025 17:47:53 +0200 Subject: [PATCH 0129/1348] Vulkan: Fix fprintf format-security warning (llama/14770) --- ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 809c0bd9bd3..b1457583a4b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -765,8 +765,8 @@ void write_output_files() { len += "};\n"; } } - fprintf(src, data.c_str()); - fprintf(src, len.c_str()); + fputs(data.c_str(), src); + fputs(len.c_str(), src); } fclose(hdr); fclose(src); From 50f983a17e646d9e333bd3ae13e175abe66decdc Mon Sep 17 00:00:00 2001 From: Peter0x44 Date: Sat, 19 Jul 2025 16:58:03 +0100 Subject: [PATCH 0130/1348] vulkan: Add logging for bf16 features to ggml_vk_print_gpu_info (#13274) (llama/14707) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 32 ++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 3019a545d58..0707d71bb6c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -328,6 +328,7 @@ struct vk_device_struct { uint64_t max_memory_allocation_size; uint64_t suballocation_block_size; bool fp16; + bool bf16; bool pipeline_robustness; vk::Device device; uint32_t vendor_id; @@ -3273,6 +3274,12 @@ static vk_device ggml_vk_get_device(size_t idx) { device->fp16 = device->fp16 && vk12_features.shaderFloat16; +#if defined(VK_KHR_shader_bfloat16) + device->bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; +#else + device->bf16 = false; +#endif + device->pipeline_robustness = pl_robustness_features.pipelineRobustness; if (device->subgroup_size_control) { @@ -3615,6 +3622,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool coopmat_support = false; bool coopmat2_support = false; bool integer_dot_product = false; + bool bfloat16_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { @@ -3635,6 +3643,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { integer_dot_product = true; +#endif +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_BFLOAT16")) { + bfloat16_support = true; #endif } } @@ -3701,10 +3714,25 @@ static void ggml_vk_print_gpu_info(size_t idx) { last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; } +#if defined(VK_KHR_shader_bfloat16) + VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; + bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; + if (bfloat16_support) { + last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; + last_struct = (VkBaseOutStructure *)&bfloat16_features; + } +#endif + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); fp16 = fp16 && vk12_features.shaderFloat16; +#if defined(VK_KHR_shader_bfloat16) + bool bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; +#else + bool bf16 = false; +#endif + uint32_t default_subgroup_size = get_subgroup_size("", device_architecture); const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; @@ -3722,8 +3750,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none"; std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, bf16, subgroup_size, props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str()); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { From 92a9e85d8bcd432ae5753784e6a325c5bcd4a07b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ervin=20=C3=81ron=20Tasn=C3=A1di?= Date: Sat, 19 Jul 2025 21:59:08 +0200 Subject: [PATCH 0131/1348] ggml: adds CONV_2D op and direct GEMM Vulkan implementation (llama/14316) * ggml/ggml-vulkan/test-backend-ops: adds CONV_2D for Vulkan * ggml-vulkan: adds f32 scalar shader to compute 2D convolution directly with gemm (no need for im2col), * test-backend-ops: adds test_case_ref to check the validity/performance of ops against reference implementations having different graphs, adds tests * * Performance fixes: minimized branch divergence, uses collectives to eliminate redundant calculation, macros removed. * Kernel shared memory size check * Updates test-backend-ops to support graphs for performance measurement. * * Apple/Win32 compile errors fixed * Subgroup size used to determine tile size -> fixes llvmpipe errors. * Collectives disabled by default. * Intel support is disabled as the performance is poor. * Conv2d enabled for Intel with disabled collectives, disabled for Apple * test-backend-ops modifications are reverted * Trailing spaces and missing override fixed. * Triggering pipeline relaunch. * Code formatted with .clang-format. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 257 ++++++++++++++++- .../ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 265 ++++++++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 + 3 files changed, 513 insertions(+), 11 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0707d71bb6c..c3f1369b663 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -483,6 +483,7 @@ struct vk_device_struct { vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; vk_pipeline pipeline_opt_step_adamw_f32; + vk_pipeline pipeline_conv2d_f32; vk_pipeline pipeline_conv2d_dw_whcn_f32; vk_pipeline pipeline_conv2d_dw_cwhn_f32; @@ -876,6 +877,38 @@ struct vk_op_rwkv_wkv7_push_constants { uint32_t H; }; +struct vk_op_conv2d_push_constants { + uint32_t Cout; + uint32_t Cin; + uint32_t N; + + uint32_t KW; + uint32_t KH; + uint32_t W; + uint32_t H; + uint32_t OW; + uint32_t OH; + + uint32_t s0; + uint32_t s1; + uint32_t p0; + uint32_t p1; + uint32_t d0; + uint32_t d1; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; +}; + struct vk_op_conv2d_dw_push_constants { uint32_t ne; uint32_t batches; @@ -975,18 +1008,45 @@ class vk_memory_logger { #endif // GGML_VULKAN_MEMORY_DEBUG class vk_perf_logger { -public: + public: void print_timings() { + if (timings.empty()) { + return; + } + uint64_t total_all_op_times = 0; std::cerr << "----------------\nVulkan Timings:" << std::endl; - for (const auto& t : timings) { - uint64_t total = 0; - for (const auto& time : t.second) { - total += time; + for (const auto & t : timings) { + uint64_t total_op_times = 0; + for (const auto & time : t.second) { + total_op_times += time; + } + std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) + << " us"; + + // If we have as many flops entries as timing entries for the op, then compute and log the flops/S. + auto it = flops.find(t.first); + if (it != flops.end() && (it->second).size() == t.second.size()) { + uint64_t total_op_flops = 0; + for (const auto & elem : it->second) { + total_op_flops += elem; + } + std::cerr << " (" + << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) / + (double(total_op_times) / (1000.0 * 1000.0 * 1000.0)) + << " GFLOPS/s)"; } - std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl; + + total_all_op_times += total_op_times; + + std::cerr << std::endl; + } + + if (timings.size() > 0) { + std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl; } timings.clear(); + flops.clear(); } void log_timing(const ggml_tensor * node, uint64_t time) { @@ -995,22 +1055,45 @@ class vk_perf_logger { return; } if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { - const uint64_t m = node->src[0]->ne[1]; - const uint64_t n = node->src[1]->ne[1]; - const uint64_t k = node->src[1]->ne[0]; - std::string name = ggml_op_name(node->op); + const uint64_t m = node->src[0]->ne[1]; + const uint64_t n = node->src[1]->ne[1]; + const uint64_t k = node->src[1]->ne[0]; + std::string name = ggml_op_name(node->op); if (n == 1) { name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k); } else { name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); } timings[name].push_back(time); + flops[name].push_back(m * n * (k + (k - 1))); + return; + } + if (node->op == GGML_OP_CONV_2D) { + std::string name = ggml_op_name(node->op); + ggml_tensor * knl = node->src[0]; + uint64_t OW = node->ne[0]; + uint64_t OH = node->ne[1]; + uint64_t N = node->ne[3]; + uint64_t Cout = node->ne[2]; + uint64_t KW = knl->ne[0]; + uint64_t KH = knl->ne[1]; + uint64_t Cin = knl->ne[2]; + // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ + uint64_t size_M = Cout; + uint64_t size_K = Cin * KW * KH; + uint64_t size_N = N * OW * OH; + uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1)); + name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + + ", N=N*OW*OH=" + std::to_string(size_N); + flops[name].push_back(n_flops); + timings[name].push_back(time); return; } timings[ggml_op_name(node->op)].push_back(time); } -private: + private: std::map> timings; + std::map> flops; }; struct ggml_backend_vk_context { @@ -2113,6 +2196,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } compile_count++; } + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); }; @@ -2962,6 +3046,42 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + // conv2d + uint32_t conv2d_WG_SIZE = 256; + uint32_t conv2d_BS_K = 128; + uint32_t conv2d_BS_CRS = 16; + uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices. + if (device->subgroup_shuffle && + device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316 + use_collectives = 1; + conv2d_BS_CRS = std::min( + device->subgroup_size, + conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used. + } + uint32_t conv2d_BS_NPQ = 128; + uint32_t conv2d_TS_K = 8; + uint32_t conv2d_shmem_req = + (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float); + if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) { + conv2d_BS_CRS = 8; + if (use_collectives) { + conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS); + } + } + + if (use_collectives) { + ggml_vk_create_pipeline( + device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, + sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, + { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true); + } else { + ggml_vk_create_pipeline( + device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, + sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, + { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, + false); + } + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); @@ -6837,6 +6957,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_leaky_relu_f32; } return nullptr; + case GGML_OP_CONV_2D: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && + ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + return ctx->device->pipeline_conv2d_f32; + } + return nullptr; case GGML_OP_CONV_2D_DW: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(src1)) { @@ -7159,6 +7285,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint32_t OW = dst->ne[0]; elements = { N * OC * OH * OW, 1, 1}; } break; + case GGML_OP_CONV_2D: + { + // src0 - kernel: [KW, KH, Cin, Cout] + // src1 - input: [W, H, Cin, N] + // dst - result: [OW, OH, Cout, N] + + // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + // parallelize in {OW/BS_K, OH/BS_NPQ, 1} + int64_t W = src1->ne[0]; + int64_t H = src1->ne[1]; + int64_t KW = src0->ne[0]; + int64_t KH = src0->ne[1]; + int64_t Cout = src0->ne[3]; + int64_t N = src1->ne[3]; + int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]); + int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]); + int64_t NPQ = N * OW * OH; + + // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups + elements = { static_cast(Cout), static_cast(NPQ), 1 }; + } + break; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_DIV: @@ -8025,6 +8176,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c }, dryrun); } +static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + vk_op_conv2d_push_constants p{}; + p.Cout = static_cast(ne03); + p.Cin = static_cast(ne02); + p.N = static_cast(ne13); + + p.KW = static_cast(ne00); + p.KH = static_cast(ne01); + p.W = static_cast(ne10); + p.H = static_cast(ne11); + p.OW = static_cast(ne0); + p.OH = static_cast(ne1); + + p.s0 = static_cast(dst->op_params[0]); + p.s1 = static_cast(dst->op_params[1]); + p.p0 = static_cast(dst->op_params[2]); + p.p1 = static_cast(dst->op_params[3]); + p.d0 = static_cast(dst->op_params[4]); + p.d1 = static_cast(dst->op_params[5]); + + p.nb01 = static_cast(nb01 / nb00); + p.nb02 = static_cast(nb02 / nb00); + p.nb03 = static_cast(nb03 / nb00); + + p.nb11 = static_cast(nb11 / nb10); + p.nb12 = static_cast(nb12 / nb10); + p.nb13 = static_cast(nb13 / nb10); + + p.nb1 = static_cast(nb1 / nb0); + p.nb2 = static_cast(nb2 / nb0); + p.nb3 = static_cast(nb3 / nb0); + + GGML_ASSERT(ne03 == ne2); + GGML_ASSERT(ne02 == ne12); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun); +} + static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { vk_op_conv2d_dw_push_constants p{}; p.ne = ggml_nelements(dst); @@ -9087,6 +9287,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: @@ -9154,6 +9355,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: { @@ -9360,6 +9562,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_POOL_2D: ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_CONV_2D: + ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun); + break; case GGML_OP_CONV_2D_DW: ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun); @@ -9490,6 +9696,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: @@ -10071,6 +10278,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); + } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D) { + // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode. + auto CRS_size = + cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[0]->ne[2]; + auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3]; + total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type); } i += ctx->num_additional_fused_ops; ctx->num_additional_fused_ops = 0; @@ -10647,6 +10860,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return true; case GGML_OP_CONV_TRANSPOSE_1D: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_CONV_2D: + { + // Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + const vk_device& device = ggml_vk_get_device(ctx->device); + bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE; + // Channel-contiguous format is not supported yet. + return (op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]) && + ggml_is_contiguous(op)) && !is_Apple; + } default: return false; } @@ -11205,6 +11432,14 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * const int32_t p1 = tensor->op_params[6]; tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1); + } else if (tensor->op == GGML_OP_CONV_2D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); } else if (tensor->op == GGML_OP_LEAKY_RELU) { const float * op_params = (const float *)tensor->op_params; tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp new file mode 100644 index 00000000000..481940a52b3 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -0,0 +1,265 @@ +#version 450 + +#ifdef USE_COLLECTIVES +# extension GL_KHR_shader_subgroup_shuffle : enable +#endif + +#include "types.comp" + +// Make spec constant +#define SHMEM_PAD 0 + +// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j +layout(binding = 0) readonly buffer A { + A_TYPE knl_data[]; +}; // src0 - kernel: [KW, KH, Cin, Cout] + +layout(binding = 1) readonly buffer B { + B_TYPE src_data[]; +}; // src1 - input: [W, H, Cin, N] -- channel_first format + +layout(binding = 2) writeonly buffer D { + D_TYPE dst_data[]; +}; // dst - result: [OW, OH, Cout, N] + +layout(push_constant) uniform parameter { + // I/O channels, batch size + uint32_t Cout; + uint32_t Cin; + uint32_t N; + + // Tensor spatial sizes: kernel, input, output + uint32_t KW; + uint32_t KH; + uint32_t W; + uint32_t H; + uint32_t OW; + uint32_t OH; + + // Parameters: stride, padding, dilation - 0=y, 1=x + uint32_t s0; + uint32_t s1; + uint32_t p0; + uint32_t p1; + uint32_t d0; + uint32_t d1; + + // Strides in elements + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; +} + +p; + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +// Blocktile sizes +layout(constant_id = 1) const uint BS_K = 128; +layout(constant_id = 2) const uint BS_CRS = 16; +layout(constant_id = 3) const uint BS_NPQ = 128; +// Thread-tile sizes +layout(constant_id = 4) const uint TS_K = 8; +layout(constant_id = 5) const uint use_collectives = 1; + +uint32_t tid = gl_LocalInvocationID.x; +const uint32_t WG_SIZE = gl_WorkGroupSize.x; + +uint splitWork(uint work_size, uint block_size) { + return (block_size + work_size - 1) / block_size; +} + +uint32_t K = p.Cout; +uint32_t CRS = p.Cin * p.KH * p.KW; +uint32_t NPQ = p.N * p.OH * p.OW; + +uint32_t n_elems_out = K * NPQ; + +// Number of blocktiles per input +uint32_t NB_CRS = splitWork(CRS, BS_CRS); + +const uint32_t Ash_stride = BS_CRS + SHMEM_PAD; +const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD; + +const uint32_t Ash_numel = BS_K * BS_CRS; +const uint32_t Bsh_numel = BS_CRS * BS_NPQ; + +const uint32_t Ash_len = BS_K * Ash_stride; +const uint32_t Bsh_len = BS_CRS * Bsh_stride; + +shared float Ash[Ash_len]; // K x CRS +shared float Bsh[Bsh_len]; // CRS x NPQ + +// Threadtile sizes +const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K; + +// Number of threadtiles per blocktile +const uint32_t NT_K = BS_K / TS_K; +const uint32_t NT_NPQ = BS_NPQ / TS_NPQ; + +float regA[TS_K]; +float regB[TS_NPQ]; +float regC[TS_K][TS_NPQ]; + +/* +Compute +KxCRS @ CRSxNPQ = K x NPQ +K=Cout +C=Cin +R,S=KH,KW +P,Q=OH,OW +*/ + +uint32_t B_idx_K = gl_WorkGroupID.x; +uint32_t B_idx_NPQ = gl_WorkGroupID.y; + +uint32_t T_y = tid / NT_NPQ; +uint32_t T_x = tid % NT_NPQ; + +uint32_t Ar = tid / BS_CRS; +uint32_t Ac = tid % BS_CRS; +const uint32_t ArpWg = WG_SIZE / BS_CRS; + +uint32_t Br = tid / BS_NPQ; +uint32_t Bc = tid % BS_NPQ; +const uint32_t BrpWg = WG_SIZE / BS_NPQ; + +void main() { + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regC[T_ly][T_lx] = 0.0; + } + } + /* Advance block in CRS dim */ + for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) { + uint32_t CRS_idx_a; + uint32_t Cin_idx_a; + uint32_t KH_idx_a; + uint32_t KW_idx_a; + +#ifdef USE_COLLECTIVES + uint32_t cached_CRS_idx; + uint32_t cached_Cin_idx; + uint32_t cached_KH_idx; + uint32_t cached_KW_idx; + if (use_collectives == 1) { + cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID; + cached_Cin_idx = cached_CRS_idx / (p.KW * p.KH); + uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH); + cached_KH_idx = cached_CRS_remainder / p.KW; + cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW; + + CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac); + Cin_idx_a = subgroupShuffle(cached_Cin_idx, Ac); + KH_idx_a = subgroupShuffle(cached_KH_idx, Ac); + KW_idx_a = subgroupShuffle(cached_KW_idx, Ac); + } else { + CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A) + Cin_idx_a = CRS_idx_a / (p.KW * p.KH); + uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH; + KH_idx_a = CRS_remainder / p.KW; + KW_idx_a = CRS_remainder - KH_idx_a * p.KW; + } +#else + CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A) + Cin_idx_a = CRS_idx_a / (p.KW * p.KH); + CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH; + KH_idx_a = CRS_remainder / p.KW; + KW_idx_a = CRS_remainder - KH_idx_a * p.KW; +#endif + + /* Load kernel to A_block: (BS_K x BS_CRS)*/ + for (uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg) { + uint32_t B_ly = r_offset + Ar; + uint32_t B_lx = Ac; + uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/ + uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1); + float val = knl_data[knl_idx]; + if (K_idx >= K || CRS_idx_a >= CRS) { + val = 0.0; + } + Ash[B_ly * Ash_stride + B_lx] = val; + } + /* Load input to B_block: (BS_CRS x BS_NPQ) */ + for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) { + uint32_t B_ly = r_offset + Br; /* Row index of B block */ + uint32_t B_lx = Bc; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */ + uint32_t N_idx = NPQ_idx / (p.OH * p.OW); + uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW; + uint32_t OH_idx = NPQ_remainder / p.OW; + uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW; + + uint32_t CRS_idx_b; + uint32_t Cin_idx_b; + uint32_t KH_idx_b; + uint32_t KW_idx_b; +#ifdef USE_COLLECTIVES + if (use_collectives == 1) { + CRS_idx_b = subgroupShuffle(cached_CRS_idx, r_offset + Br); + Cin_idx_b = subgroupShuffle(cached_Cin_idx, r_offset + Br); + KH_idx_b = subgroupShuffle(cached_KH_idx, r_offset + Br); + KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br); + } else { + CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */ + Cin_idx_b = CRS_idx_b / (p.KW * p.KH); + uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH; + KH_idx_b = CRS_remainder / p.KW; + KW_idx_b = CRS_remainder - KH_idx_b * p.KW; + } +#else + CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */ + Cin_idx_b = CRS_idx_b / (p.KW * p.KH); + uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH; + KH_idx_b = CRS_remainder / p.KW; + KW_idx_b = CRS_remainder - KH_idx_b * p.KW; +#endif + + uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1; + uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0; + uint32_t src_idx = + min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + float val = src_data[src_idx]; + if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) { + val = 0.0; + } + Bsh[B_ly * Bsh_stride + B_lx] = val; + } + barrier(); + for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) { + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx]; + } + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx]; + } + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]); + } + } + } + barrier(); + } + /* Save C* */ + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx; + uint32_t N_idx = NPQ_idx / (p.OH * p.OW); + uint32_t OH_idx = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW; + uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW; + uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3; + if (K_idx < K && NPQ_idx < NPQ) { + dst_data[dst_idx] = regC[T_ly][T_lx]; + } + } + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index b1457583a4b..598f0370fb8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -655,6 +655,8 @@ void process_shaders() { string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); + string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}}); + string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); From e3f4162a06db61c105ea0d0614043dcb864a8fa3 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 21 Jul 2025 06:35:40 -0500 Subject: [PATCH 0132/1348] vulkan/cuda: Fix im2col when KW!=KH (llama/14789) The tid is decomposed into "ow + ky*OW + kx*OW*KH". Change "ksize" to match. --- ggml/src/ggml-cuda/im2col.cu | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 86a54e42bb7..5bb85b4807b 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -10,7 +10,7 @@ static __global__ void im2col_kernel( return; } - const int64_t ksize = OW * (KH > 1 ? KW : 1); + const int64_t ksize = OW * KH; const int64_t kx = i / ksize; const int64_t kd = kx * ksize; const int64_t ky = (i - kd) / OW; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index 17c7ccb90d0..fdbcf7eba0f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -40,12 +40,10 @@ void main() { const uint src_base = ic * p.offset_delta + batch * p.batch_offset; const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH); const int oh_s1 = int(oh) * p.s1; - const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1); + const uint ksize = p.OW * p.KH; const uint base_linear_idx = gidx * NUM_ITER; - const uint max_ky = ksize / p.OW; - uint current_kx = base_linear_idx / ksize; const uint rem = base_linear_idx - (current_kx * ksize); uint current_ky = rem / p.OW; @@ -76,7 +74,7 @@ void main() { if (++current_ix == p.OW) { current_ix = 0; - if (++current_ky == max_ky) { + if (++current_ky == p.KH) { current_ky = 0; current_kx++; } From fc2ff438fd8c75f0190148cfe4bb3ec6a1c1a046 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Mon, 21 Jul 2025 15:49:52 +0200 Subject: [PATCH 0133/1348] kleidiai: add support for get_rows (llama/14676) * kleidiai: add support for get_rows * apply fixes based on code review * apply more fixes based on code review --- ggml/src/ggml-cpu/CMakeLists.txt | 4 +- ggml/src/ggml-cpu/kleidiai/kernels.cpp | 121 +++++++++++++++++++++--- ggml/src/ggml-cpu/kleidiai/kernels.h | 3 + ggml/src/ggml-cpu/kleidiai/kleidiai.cpp | 98 +++++++++++++++++-- 4 files changed, 202 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 13f745b2062..2cc42d4b02a 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -496,9 +496,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.9.0") + set(KLEIDIAI_COMMIT_TAG "v1.11.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017") + set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index 910fd0ee4e7..ddd29d002d1 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -22,9 +22,94 @@ #include "kai_common.h" +#include "simd-mappings.h" + #include "kernels.h" #define NELEMS(x) sizeof(x) / sizeof(*x) + +static const size_t INT4_PER_BYTE = 2; +static const size_t INT4_BITS = 4; +static const int Q4_0_ZERO_POINT = 8; +const size_t INT4_PER_UINT16 = 4; + +static void dequantize_row_qsi4c32pscalef16( + const void *packed_data, + int32_t row_idx, + int64_t nc, + float *out, + size_t nr_pack, + size_t packed_row_stride, + size_t kr, + size_t bl, + size_t num_bytes_multiplier +) { + size_t group_idx = row_idx / nr_pack; + size_t row_in_group = row_idx % nr_pack; + const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride; + size_t num_blocks = nc / bl; + const uint8_t *block_ptr = packed_group; + + for (size_t b = 0; b < num_blocks; ++b) { + uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier)); + float scale = GGML_CPU_FP16_TO_FP32(scale_f16); + + const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier; + size_t num_segments = bl / kr; + size_t num_bytes_per_segment = kr / INT4_PER_BYTE; + + for (size_t s = 0; s < num_segments; ++s) { + const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment; + const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment; + for (size_t k = 0; k < num_bytes_per_segment; ++k) { + uint8_t byte = qbytes[k] ^ 0x88; + int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT; + int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT; + out[b * bl + s * num_bytes_per_segment + k] = x0 * scale; + out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale; + } + } + block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment; + } +} + +static void dequantize_row_qsi4c32ps1s0scalef16( + const void *packed_data, + int32_t row_idx, + int64_t k, + float *out, + size_t nr, + size_t packed_row_stride, + size_t kr, + size_t bl, + size_t num_bytes_multiplier +) { + const size_t num_blocks = k / bl; + const size_t bl4 = bl / INT4_PER_UINT16; + + size_t group_idx = row_idx / nr; + size_t row_in_group = row_idx % nr; + + const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride; + const uint16_t *qdata = (const uint16_t *)packed_group; + const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier)); + + for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) { + uint16_t scale_f16 = scales[row_in_group + block_idx * nr]; + float scale = GGML_CPU_FP16_TO_FP32(scale_f16); + + for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) { + uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group]; + + for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) { + int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT; + out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale; + } + } + } + GGML_UNUSED(kr); +} + static ggml_kleidiai_kernels gemm_gemv_kernels[] = { #if defined(__ARM_FEATURE_SME) { @@ -63,8 +148,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon, + /* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16, }, /* .required_cpu = */ CPU_FEATURE_SME, /* .lhs_type = */ GGML_TYPE_F32, @@ -107,8 +194,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, - /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, + /* .packed_stride = */ NULL, + /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme, + /* .to_float = */ NULL, }, /* .required_cpu = */ CPU_FEATURE_SME, /* .lhs_type = */ GGML_TYPE_F32, @@ -154,8 +243,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD, /* .lhs_type = */ GGML_TYPE_F32, @@ -200,8 +291,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, /* .lhs_type = */ GGML_TYPE_F32, @@ -247,8 +340,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, /* .lhs_type = */ GGML_TYPE_F32, @@ -293,8 +388,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, }, /* .rhs_info = */ { - /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, - /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD, /* .lhs_type = */ GGML_TYPE_F32, diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.h b/ggml/src/ggml-cpu/kleidiai/kernels.h index 3b268d4a22a..bc8f33405d1 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.h +++ b/ggml/src/ggml-cpu/kleidiai/kernels.h @@ -71,12 +71,15 @@ struct rhs_packing_info { std::function, std::function > packed_size; + size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl); std::variant< std::function, std::function > pack_func; + void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out, size_t nr_pack, size_t packed_row_stride, + size_t kr, size_t bl, size_t num_bytes_multiplier); }; struct ggml_kleidiai_kernels { diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index fafe45e6c5c..3a513a55d76 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -40,6 +40,17 @@ struct ggml_kleidiai_context { ggml_kleidiai_kernels * kernels; } static ctx = { CPU_FEATURE_NONE, NULL }; +static const char* cpu_feature_to_string(cpu_feature f) { + switch (f) { + case CPU_FEATURE_NONE: return "NONE"; + case CPU_FEATURE_DOTPROD: return "DOTPROD"; + case CPU_FEATURE_I8MM: return "I8MM"; + case CPU_FEATURE_SVE: return "SVE"; + case CPU_FEATURE_SME: return "SME"; + default: return "UNKNOWN"; + } +} + static void init_kleidiai_context(void) { ggml_critical_section_start(); @@ -62,6 +73,11 @@ static void init_kleidiai_context(void) { ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE; } ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features); +#ifndef NDEBUG + if (ctx.kernels) { + GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu)); + } +#endif } ggml_critical_section_end(); } @@ -102,6 +118,9 @@ static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint1 class tensor_traits : public ggml::cpu::tensor_traits { bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + if (op->op != GGML_OP_MUL_MAT) { + return false; + } ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op); GGML_ASSERT(kernels); kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm; @@ -135,6 +154,10 @@ class tensor_traits : public ggml::cpu::tensor_traits { } else if (dst->src[0]->type == GGML_TYPE_F16) { return compute_forward_kv_cache(params, dst); } + } else if (dst->op == GGML_OP_GET_ROWS) { + if (dst->src[0]->type == GGML_TYPE_Q4_0) { + return compute_forward_get_rows(params, dst); + } } return false; } @@ -270,6 +293,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { } bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0); + const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -342,8 +367,49 @@ class tensor_traits : public ggml::cpu::tensor_traits { return true; } + bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0); + GGML_ASSERT(ctx.kernels); + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + rhs_packing_info * rhs_info = &ctx.kernels->rhs_info; + kernel_info * kernel = &ctx.kernels->gemm; + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + const size_t block_rows = kernel->get_nr(); + const size_t kr = kernel->get_kr(); + + const size_t num_bytes_multiplier = sizeof(uint16_t); + const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0); + + const int ith = params->ith; + const int nth = params->nth; + + const int dr = (nr + nth - 1) / nth; + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + int64_t row_idx = ((const int32_t *)src1->data)[i]; + GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]); + + float *out = (float *)((char *)dst->data + i * nb1); + rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier); + } + + return true; + } + public: int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) { + GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0); GGML_ASSERT(ctx.kernels); const size_t n = tensor->ne[1]; const size_t k = tensor->ne[0]; @@ -351,17 +417,12 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t kr = ctx.kernels->gemm.get_kr(); size_t sr = ctx.kernels->gemm.get_sr(); -#ifndef NDEBUG - const size_t repacked_size = variant_call(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0); - GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!"); -#endif struct kai_rhs_pack_qs4cxs1s0_param params; params.lhs_zero_point = 1; params.rhs_zero_point = 8; variant_call(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, ¶ms); return 0; - GGML_UNUSED(data_size); } }; @@ -375,8 +436,8 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor); - GGML_UNUSED(buffer); return GGML_STATUS_SUCCESS; + GGML_UNUSED(buffer); } static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, @@ -418,18 +479,35 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b GGML_UNUSED(buft); } +static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0); + GGML_ASSERT(ctx.kernels); + + const size_t n = tensor->ne[1]; + const size_t k = tensor->ne[0]; + const size_t nr = ctx.kernels->gemm.get_nr(); + const size_t kr = ctx.kernels->gemm.get_kr(); + + return variant_call(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0); + + GGML_UNUSED(buft); +} + namespace ggml::cpu::kleidiai { class extra_buffer_type : ggml::cpu::extra_buffer_type { bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT && + if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) && op->src[0]->type == GGML_TYPE_Q4_0 && op->src[0]->buffer && (ggml_n_dims(op->src[0]) == 2) && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) { + if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) { + return false; + } if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { return false; } - if (op->src[1]->type == GGML_TYPE_F32 && + if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) && ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) { return true; } @@ -438,7 +516,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT) { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } @@ -469,7 +547,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) { /* .alloc_buffer = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment, /* .get_max_size = */ nullptr, // defaults to SIZE_MAX - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size, /* .is_host = */ nullptr, }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), From 52ad451c8a0231eeb9d230bd7746fd16527fec86 Mon Sep 17 00:00:00 2001 From: Romain Biessy Date: Mon, 21 Jul 2025 18:39:29 +0200 Subject: [PATCH 0134/1348] sycl: Fix im2col (llama/14797) --- ggml/src/ggml-sycl/im2col.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index 52737cc746d..7adcb3d9d9c 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -26,7 +26,7 @@ static void im2col_kernel(const float * x, T * dst, int64_t batch_offset, int64_ // make each work-item deal with more elements since sycl global range can not exceed max int for (int64_t i = global_id; i < pelements; i += (work_group_size * item_ct1.get_group_range(2))) { - const int64_t ksize = OW * (KH > 1 ? KW : 1); + const int64_t ksize = OW * KH; const int64_t kx = i / ksize; const int64_t kd = kx * ksize; const int64_t ky = (i - kd) / OW; From a2a5612402c0940a6d41404da1f36bb8ec970df8 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 21 Jul 2025 19:03:19 +0200 Subject: [PATCH 0135/1348] opencl: add conv2d kernel (llama/14403) * add conv2d kernel * fix trailing whitespace * whitespace fixe * handle f16 input and f16 kernel, more opt * resolve conflicts * use enqueue_ndrange_kernel --- ggml/src/ggml-opencl/CMakeLists.txt | 2 + ggml/src/ggml-opencl/ggml-opencl.cpp | 134 +++++++++++++ ggml/src/ggml-opencl/kernels/conv2d.cl | 185 ++++++++++++++++++ .../src/ggml-opencl/kernels/conv2d_f16_f32.cl | 176 +++++++++++++++++ 4 files changed, 497 insertions(+) create mode 100644 ggml/src/ggml-opencl/kernels/conv2d.cl create mode 100644 ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index ec5d8cf5955..015fa8f0682 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -105,6 +105,8 @@ set(GGML_OPENCL_KERNELS pad repeat mul_mat_f16_f32 + conv2d + conv2d_f16_f32 ) foreach (K ${GGML_OPENCL_KERNELS}) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3388259152b..a31483b6108 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -390,6 +390,9 @@ struct ggml_backend_opencl_context { cl_program program_tanh; cl_program program_upscale; cl_program program_concat; + cl_program program_conv_2d_f16; + cl_program program_conv_2d_f32; + cl_program program_conv_2d_f16_f32; cl_program program_tsembd; cl_program program_mul_mv_id_q4_0_f32_8x_flat; @@ -441,6 +444,9 @@ struct ggml_backend_opencl_context { cl_kernel kernel_upscale_bilinear; cl_kernel kernel_concat_f32_contiguous; cl_kernel kernel_concat_f32_non_contiguous; + cl_kernel kernel_conv_2d_f16; + cl_kernel kernel_conv_2d_f32; + cl_kernel kernel_conv_2d_f16_f32; cl_kernel kernel_timestep_embedding; cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat; @@ -1478,6 +1484,47 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // conv2d + { + #ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "conv2d.cl.h" + }; + const std::string kernel_src_f16_f32 { + #include "conv2d_f16_f32.cl.h" + }; + #else + const std::string kernel_src = read_file("conv2d.cl"); + const std::string kernel_src_f16_f32 = read_file("conv2d_f16_f32.cl"); + #endif + if (!kernel_src.empty()) { + backend_ctx->program_conv_2d_f16 = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), (std::string(compile_opts) + " -DUSE_FP16=1").c_str()); + CL_CHECK((backend_ctx->kernel_conv_2d_f16 = clCreateKernel(backend_ctx->program_conv_2d_f16, "kernel_conv_2d", &err), err)); + GGML_LOG_CONT("."); + backend_ctx->program_conv_2d_f32 = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_conv_2d_f32 = clCreateKernel(backend_ctx->program_conv_2d_f32, "kernel_conv_2d", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: conv2d kernel source not found or empty. This op will not be available.\n"); + backend_ctx->program_conv_2d_f16 = nullptr; + backend_ctx->kernel_conv_2d_f16 = nullptr; + backend_ctx->program_conv_2d_f32 = nullptr; + backend_ctx->kernel_conv_2d_f32 = nullptr; + } + if (!kernel_src_f16_f32.empty()) { + backend_ctx->program_conv_2d_f16_f32 = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16_f32.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_conv_2d_f16_f32 = clCreateKernel(backend_ctx->program_conv_2d_f16_f32, "kernel_conv_2d", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: conv2d_f16_f32 kernel source not found or empty. This op will not be available.\n"); + backend_ctx->program_conv_2d_f16_f32 = nullptr; + backend_ctx->kernel_conv_2d_f16_f32 = nullptr; + } + } + // mul_mv_id_q4_0_f32_8x_flat { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2361,6 +2408,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te op->src[0]->ne[3] == 1 && op->ne[3] == 1; case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + case GGML_OP_CONV_2D: + return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) || + (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) || + (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32); case GGML_OP_CONCAT: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_TIMESTEP_EMBEDDING: @@ -4998,6 +5049,83 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); } +static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_TENSOR_BINARY_OP_LOCALS; + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const cl_uint Cout = ne03; const cl_uint Cin = ne02; const cl_uint N = ne13; + const cl_uint KW = ne00; const cl_uint KH = ne01; const cl_uint W = ne10; const cl_uint H = ne11; const cl_uint OW = ne0; const cl_uint OH = ne1; + + const cl_uint s0 = dst->op_params[0]; const cl_uint s1 = dst->op_params[1]; + const cl_uint p0 = dst->op_params[2]; const cl_uint p1 = dst->op_params[3]; + const cl_uint d0 = dst->op_params[4]; const cl_uint d1 = dst->op_params[5]; + + const cl_uint cl_nb01 = nb01/ggml_type_size(src0->type); const cl_uint cl_nb02 = nb02/ggml_type_size(src0->type); const cl_uint cl_nb03 = nb03/ggml_type_size(src0->type); + const cl_uint cl_nb11 = nb11/ggml_type_size(src1->type); const cl_uint cl_nb12 = nb12/ggml_type_size(src1->type); const cl_uint cl_nb13 = nb13/ggml_type_size(src1->type); + const cl_uint cl_nb1 = nb1/ggml_type_size(dst->type); const cl_uint cl_nb2 = nb2/ggml_type_size(dst->type); const cl_uint cl_nb3 = nb3/ggml_type_size(dst->type); + + const int64_t NPQ = (int64_t)N * OW * OH; + + const uint32_t BS_K = 64; + const uint32_t BS_NPQ = 64; + const uint32_t BS_CRS = 16; + const uint32_t VEC_SIZE = 4; + + const uint32_t TS_K = 4; + const uint32_t TS_NPQ = 8; + + const uint32_t WG_K = BS_K / TS_K; + const uint32_t WG_NPQ = BS_NPQ / TS_NPQ; + + auto splitWork = [](uint32_t work_size, uint32_t block_size) { return (block_size + work_size - 1) / block_size; }; + const uint32_t NB_K = splitWork(Cout, BS_K); + const uint32_t NB_NPQ = splitWork(NPQ, BS_NPQ); + + cl_kernel kernel; + size_t shmem_size; + + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + kernel = backend_ctx->kernel_conv_2d_f16; + shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_half4)); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_conv_2d_f32; + shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_float) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4)); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_conv_2d_f16_f32; + shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4)); + } else { + GGML_ASSERT(false && "Unsupported data type combination for conv2d"); + return; + } + + cl_uint idx = 0; + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra1->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, idx++, shmem_size, NULL)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cout)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cin)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &N)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KH)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &W)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OH)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb01)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb02)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb03)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb11)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb12)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb13)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb2)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb3)); + + size_t global_work_size[] = { (size_t)NB_K * WG_K, (size_t)NB_NPQ * WG_NPQ, 1 }; + size_t local_work_size[] = { (size_t)WG_K, (size_t)WG_NPQ, 1 }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); +} + static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -6752,6 +6880,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } ggml_cl_upscale(backend, tensor->src[0], tensor); return true; + case GGML_OP_CONV_2D: + if (!any_on_device) { + return false; + } + func = ggml_cl_conv_2d; + break; case GGML_OP_CONCAT: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/conv2d.cl b/ggml/src/ggml-opencl/kernels/conv2d.cl new file mode 100644 index 00000000000..e339c90cff5 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/conv2d.cl @@ -0,0 +1,185 @@ +#ifdef USE_FP16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define T_FLOAT half +#define T_FLOAT4 half4 +#define VSTORE_T_FLOAT4(data, offset, p) vstore_half4_rte(data, offset, p) +#else +#define T_FLOAT float +#define T_FLOAT4 float4 +#define VSTORE_T_FLOAT4(data, offset, p) vstore4(data, offset, p) +#endif + +#if defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#else +#define REQD_SUBGROUP_SIZE_128 +#endif + +#define T_ACCUM float4 +#define VEC_SIZE 4 + +#define BS_K 64 +#define BS_NPQ 64 +#define BS_CRS 16 + +#define TS_K 4 +#define TS_NPQ 8 + +#define WG_K (BS_K / TS_K) +#define WG_NPQ (BS_NPQ / TS_NPQ) + +#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE) +#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE) + +static inline uint splitWork(uint work_size, uint block_size){ + return (work_size + block_size - 1) / block_size; +} + +REQD_SUBGROUP_SIZE_128 +kernel void kernel_conv_2d( + global void* p_knl, + ulong off_knl, + global void* p_src, + ulong off_src, + global void* p_dst, + ulong off_dst, + local void* shared, + uint Cout, uint Cin, uint N, + uint KW, uint KH, uint W, uint H, uint OW, uint OH, + uint s0, uint s1, uint p0, uint p1, uint d0, uint d1, + uint nb01, uint nb02, uint nb03, + uint nb11, uint nb12, uint nb13, + uint nb1, uint nb2, uint nb3 +) { + global T_FLOAT* knl_data = (global T_FLOAT*) ((global char*)p_knl + off_knl); + global T_FLOAT* src_data = (global T_FLOAT*) ((global char*)p_src + off_src); + global T_FLOAT* dst_data = (global T_FLOAT*) ((global char*)p_dst + off_dst); + + const uint K = Cout; + const uint CRS = Cin*KH*KW; + const uint NPQ = N*OH*OW; + + const uint lid_k = get_local_id(0); + const uint lid_npq = get_local_id(1); + const uint tid = lid_npq * WG_K + lid_k; + + const uint B_idx_K = get_group_id(0); + const uint B_idx_NPQ = get_group_id(1); + + const uint offset_k = B_idx_K * BS_K; + const uint offset_npq = B_idx_NPQ * BS_NPQ; + + local T_FLOAT* Ash = (local T_FLOAT*)shared; + local T_FLOAT4* Bsh = (local T_FLOAT4*) &Ash[BS_K * BS_CRS]; + + T_ACCUM regC[TS_K][TS_NPQ_VEC]; + for (int i = 0; i < TS_K; ++i) { + for (int j = 0; j < TS_NPQ_VEC; ++j) { + regC[i][j] = (T_ACCUM)(0.0f); + } + } + + const uint NB_CRS = splitWork(CRS, BS_CRS); + + for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) { + const uint offset_crs = B_idx_CRS * BS_CRS; + + for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) { + const uint k_l = i / BS_CRS; + const uint crs_l = i % BS_CRS; + const uint k_g = offset_k + k_l; + const uint crs_g = offset_crs + crs_l; + + if (k_g < K && crs_g < CRS) { + const uint Cin_idx = crs_g / (KW*KH); + const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW; + const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW; + const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03; + Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx]; + } else { + Ash[k_l * BS_CRS + crs_l] = (T_FLOAT)0.0f; + } + } + + for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) { + const uint crs_l = i / BS_NPQ_VEC; + const uint npq_l_vec = i % BS_NPQ_VEC; + const uint crs_g = offset_crs + crs_l; + + T_FLOAT4 val = (T_FLOAT4)(0.0f); + if (crs_g < CRS) { + const uint Cin_idx = crs_g / (KW * KH); + const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW; + const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW; + for (int v = 0; v < VEC_SIZE; ++v) { + const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v; + if (npq_g < NPQ) { + const uint N_idx = npq_g / (OH * OW); + const uint pq_idx = npq_g % (OH * OW); + const uint OH_idx = pq_idx / OW; + const uint OW_idx = pq_idx % OW; + const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1); + const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0); + + if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) { + const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13; + ((T_FLOAT*)&val)[v] = src_data[src_idx]; + } + } + } + } + Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #pragma unroll + for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) { + T_FLOAT regA[TS_K]; + for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) { + regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l]; + } + + for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) { + T_FLOAT4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg]; + for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) { + regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), convert_float4(regB), regC[k_l_reg][npq_l_vec_reg]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) { + const uint k_g = offset_k + lid_k * TS_K + k_l_reg; + if (k_g >= K) continue; + + for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) { + const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE; + + const uint N_idx = npq_g_base / (OH * OW); + const uint pq_idx = npq_g_base % (OH * OW); + const uint OH_idx = pq_idx / OW; + const uint OW_idx = pq_idx % OW; + + if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) { + const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3; + VSTORE_T_FLOAT4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]); + } else { + T_ACCUM res = regC[k_l_reg][npq_l_vec_reg]; + for (int v = 0; v < VEC_SIZE; ++v) { + const uint npq_g = npq_g_base + v; + if (npq_g < NPQ) { + const uint N_idx_s = npq_g / (OH*OW); + const uint pq_idx_s = npq_g % (OH*OW); + const uint OH_idx_s = pq_idx_s / OW; + const uint OW_idx_s = pq_idx_s % OW; + const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3; + dst_data[dst_idx_s] = (T_FLOAT)(((float*)&res)[v]); + } + } + } + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl b/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl new file mode 100644 index 00000000000..cb05637f33a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl @@ -0,0 +1,176 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#if defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#else +#define REQD_SUBGROUP_SIZE_128 +#endif + +#define T_ACCUM float4 +#define VEC_SIZE 4 + +#define BS_K 64 +#define BS_NPQ 64 +#define BS_CRS 16 + +#define TS_K 4 +#define TS_NPQ 8 + +#define WG_K (BS_K / TS_K) +#define WG_NPQ (BS_NPQ / TS_NPQ) + +#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE) +#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE) + +static inline uint splitWork(uint work_size, uint block_size){ + return (work_size + block_size - 1) / block_size; +} + +REQD_SUBGROUP_SIZE_128 +kernel void kernel_conv_2d( + global void* p_knl, + ulong off_knl, + global void* p_src, + ulong off_src, + global void* p_dst, + ulong off_dst, + local void* shared, + uint Cout, uint Cin, uint N, + uint KW, uint KH, uint W, uint H, uint OW, uint OH, + uint s0, uint s1, uint p0, uint p1, uint d0, uint d1, + uint nb01, uint nb02, uint nb03, + uint nb11, uint nb12, uint nb13, + uint nb1, uint nb2, uint nb3 +) { + global half* knl_data = (global half*) ((global char*)p_knl + off_knl); + global float* src_data = (global float*) ((global char*)p_src + off_src); + global float* dst_data = (global float*) ((global char*)p_dst + off_dst); + + const uint K = Cout; + const uint CRS = Cin*KH*KW; + const uint NPQ = N*OH*OW; + + const uint lid_k = get_local_id(0); + const uint lid_npq = get_local_id(1); + const uint tid = lid_npq * WG_K + lid_k; + + const uint B_idx_K = get_group_id(0); + const uint B_idx_NPQ = get_group_id(1); + + const uint offset_k = B_idx_K * BS_K; + const uint offset_npq = B_idx_NPQ * BS_NPQ; + + local half* Ash = (local half*)shared; + local float4* Bsh = (local float4*) &Ash[BS_K * BS_CRS]; + + T_ACCUM regC[TS_K][TS_NPQ_VEC]; + for (int i = 0; i < TS_K; ++i) { + for (int j = 0; j < TS_NPQ_VEC; ++j) { + regC[i][j] = (T_ACCUM)(0.0f); + } + } + + const uint NB_CRS = splitWork(CRS, BS_CRS); + + for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) { + const uint offset_crs = B_idx_CRS * BS_CRS; + + for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) { + const uint k_l = i / BS_CRS; + const uint crs_l = i % BS_CRS; + const uint k_g = offset_k + k_l; + const uint crs_g = offset_crs + crs_l; + + if (k_g < K && crs_g < CRS) { + const uint Cin_idx = crs_g / (KW*KH); + const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW; + const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW; + const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03; + Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx]; + } else { + Ash[k_l * BS_CRS + crs_l] = (half)0.0f; + } + } + + for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) { + const uint crs_l = i / BS_NPQ_VEC; + const uint npq_l_vec = i % BS_NPQ_VEC; + const uint crs_g = offset_crs + crs_l; + + float4 val = (float4)(0.0f); + if (crs_g < CRS) { + const uint Cin_idx = crs_g / (KW * KH); + const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW; + const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW; + for (int v = 0; v < VEC_SIZE; ++v) { + const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v; + if (npq_g < NPQ) { + const uint N_idx = npq_g / (OH * OW); + const uint pq_idx = npq_g % (OH * OW); + const uint OH_idx = pq_idx / OW; + const uint OW_idx = pq_idx % OW; + const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1); + const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0); + + if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) { + const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13; + ((float*)&val)[v] = src_data[src_idx]; + } + } + } + } + Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #pragma unroll + for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) { + half regA[TS_K]; + for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) { + regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l]; + } + + for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) { + float4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg]; + for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) { + regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), regB, regC[k_l_reg][npq_l_vec_reg]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) { + const uint k_g = offset_k + lid_k * TS_K + k_l_reg; + if (k_g >= K) continue; + + for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) { + const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE; + + const uint N_idx = npq_g_base / (OH * OW); + const uint pq_idx = npq_g_base % (OH * OW); + const uint OH_idx = pq_idx / OW; + const uint OW_idx = pq_idx % OW; + + if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) { + const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3; + vstore4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]); + } else { + T_ACCUM res = regC[k_l_reg][npq_l_vec_reg]; + for (int v = 0; v < VEC_SIZE; ++v) { + const uint npq_g = npq_g_base + v; + if (npq_g < NPQ) { + const uint N_idx_s = npq_g / (OH*OW); + const uint pq_idx_s = npq_g % (OH*OW); + const uint OH_idx_s = pq_idx_s / OW; + const uint OW_idx_s = pq_idx_s % OW; + const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3; + dst_data[dst_idx_s] = ((float*)&res)[v]; + } + } + } + } + } +} From e81e17b048669cb135244c8d7413962121f3143d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 21 Jul 2025 22:55:10 +0200 Subject: [PATCH 0136/1348] opencl: fix `im2col` when `KW!=KH` (llama/14803) --- ggml/src/ggml-opencl/kernels/im2col_f16.cl | 2 +- ggml/src/ggml-opencl/kernels/im2col_f32.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-opencl/kernels/im2col_f16.cl b/ggml/src/ggml-opencl/kernels/im2col_f16.cl index b84c8984653..cf6cdaa4ce5 100644 --- a/ggml/src/ggml-opencl/kernels/im2col_f16.cl +++ b/ggml/src/ggml-opencl/kernels/im2col_f16.cl @@ -31,7 +31,7 @@ kernel void kernel_im2col_f16( src1 = (global float*)((global char*)src1 + offset1); dst = (global half*)((global char*)dst + offsetd); - long ksize = OW * (KH > 1 ? KW : 1); + long ksize = OW * KH; long kx = i / ksize; long kd = kx * ksize; long ky = (i - kd) / OW; diff --git a/ggml/src/ggml-opencl/kernels/im2col_f32.cl b/ggml/src/ggml-opencl/kernels/im2col_f32.cl index 4bf65e4eaaf..1ecdb2344ad 100644 --- a/ggml/src/ggml-opencl/kernels/im2col_f32.cl +++ b/ggml/src/ggml-opencl/kernels/im2col_f32.cl @@ -31,7 +31,7 @@ kernel void kernel_im2col_f32( src1 = (global float*)((global char*)src1 + offset1); dst = (global float*)((global char*)dst + offsetd); - long ksize = OW * (KH > 1 ? KW : 1); + long ksize = OW * KH; long kx = i / ksize; long kd = kx * ksize; long ky = (i - kd) / OW; From 9008410087d0dc7f71f2debf7b1f8ab0d631b6ba Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Tue, 22 Jul 2025 07:45:26 +0800 Subject: [PATCH 0137/1348] cuda: remove linking to cublasLt (llama/14790) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index c9ff4aa321b..98ed29bc9c1 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -102,12 +102,12 @@ if (CUDAToolkit_FOUND) if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas) else () - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static) endif() else() - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas) endif() if (GGML_CUDA_NO_VMM) From de49384ab3d34cc5368aa7429d76e9ec2718ed67 Mon Sep 17 00:00:00 2001 From: lhez Date: Mon, 21 Jul 2025 23:53:30 -0700 Subject: [PATCH 0138/1348] opencl: remove unreachable `return` (llama/14806) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index a31483b6108..63ac4a989b0 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -5103,7 +5103,6 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4)); } else { GGML_ASSERT(false && "Unsupported data type combination for conv2d"); - return; } cl_uint idx = 0; From 810018a63a61087414c6a8d379debf0a36722892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 22 Jul 2025 12:33:10 +0200 Subject: [PATCH 0139/1348] cuda : implement bf16 cpy ops and enable bf16 cont (llama/14763) * implement bf16 cpy ops and enable bf16 cont * deduplicate copy functions * deduplicate checks --- ggml/src/ggml-cuda/cpy-utils.cuh | 46 ++++------------- ggml/src/ggml-cuda/cpy.cu | 89 ++++++++++++-------------------- ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++----- ggml/src/ggml-cuda/set-rows.cu | 20 +------ 4 files changed, 49 insertions(+), 124 deletions(-) diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh index e7a0bd2f1a0..410c12b7ba5 100644 --- a/ggml/src/ggml-cuda/cpy-utils.cuh +++ b/ggml/src/ggml-cuda/cpy-utils.cuh @@ -2,24 +2,13 @@ #include "ggml-common.h" -static __device__ __forceinline__ void convert_f32_f32(const float * src, float * dst) { - *dst = *src; -} - -static __device__ __forceinline__ void convert_f32_f16(const float * src, half * dst) { - *dst = __float2half(*src); -} - -static __device__ __forceinline__ void convert_f32_bf16(const float * src, nv_bfloat16 * dst) { - *dst = *src; -} - -static __device__ __forceinline__ void convert_f16_f16(const half * src, half * dst) { - *dst = *src; -} - -static __device__ __forceinline__ void convert_f16_f32(const half * src, float * dst) { - *dst = *src; +template +static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) { + if constexpr (std::is_same_v) { + *dst = *src; + } else { + *dst = float(*src); + } } static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) { @@ -230,22 +219,7 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti); } -static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { - convert_f32_f32((const float *)cxi, (float *)cdsti); -} - -static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { - convert_f32_f16((const float *)cxi, (half *)cdsti); -} - -static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) { - convert_f32_bf16((const float *)cxi, (nv_bfloat16 *)cdsti); -} - -static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { - convert_f16_f16((const half *)cxi, (half *)cdsti); -} - -static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { - convert_f16_f32((const half *)cxi, (float *)cdsti); +template +static __device__ void cpy_1_flt(const char * cxi, char * cdsti) { + convert_flt((const src_t *)cxi, (dst_t *)cdsti); } diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index e7d0da08705..0e5964907e1 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -8,10 +8,10 @@ typedef void (*cpy_kernel_t)(const char * cx, char * cdst); template -static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { +static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { @@ -139,43 +139,14 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des #endif } -static void ggml_cpy_f16_f32_cuda( +template +static void ggml_cpy_flt_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); -} - -static void ggml_cpy_f32_f32_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); -} - -static void ggml_cpy_f32_bf16_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); -} - -static void ggml_cpy_f32_f16_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> + cpy_flt><<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } @@ -307,16 +278,6 @@ static void ggml_cpy_f32_iq4_nl_cuda( (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); } -static void ggml_cpy_f16_f16_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); -} - void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -372,11 +333,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { - ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) { @@ -403,9 +364,17 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) { + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) { + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_flt_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -430,11 +399,11 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { return nullptr; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - return (void*) cpy_f32_f16; + return (void*) cpy_flt>; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { - return (void*) cpy_f32_f16; + return (void*) cpy_flt>; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - return (void*) cpy_f32_f16; + return (void*) cpy_flt>; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { return (void*) cpy_f32_q; } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) { @@ -458,9 +427,17 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { return (void*) cpy_q_f32, QK5_1>; } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - return (void*) cpy_f32_f16; + return (void*) cpy_flt>; + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) { + return (void*) cpy_flt>; } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - return (void*) cpy_f32_f16; + return (void*) cpy_flt>; + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) { + return (void*) cpy_flt>; + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) { + return (void*) cpy_flt>; + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) { + return (void*) cpy_flt>; } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dfc50ef0daf..548bc31ce21 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3242,13 +3242,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g { ggml_type src0_type = op->src[0]->type; ggml_type src1_type = op->src[1]->type; - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { - return true; - } - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) { - return true; - } - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + if ((src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_F16) && + (src1_type == GGML_TYPE_F32 || src1_type == GGML_TYPE_BF16 || src1_type == GGML_TYPE_F16) + ) { return true; } if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { @@ -3284,12 +3280,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) { return true; } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return true; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return true; - } if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) { return true; } @@ -3370,7 +3360,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->ne[1] % 128 == 0; } case GGML_OP_CONT: - return op->src[0]->type != GGML_TYPE_BF16; + return true; case GGML_OP_DIAG_MASK_INF: return true; case GGML_OP_SOFT_MAX: diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 560604d095f..b2acdf855e9 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -4,24 +4,8 @@ typedef void (*set_rows_kernel_t)(const char * src, char * dst); template -__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) { - GGML_UNUSED(src_f); - GGML_UNUSED(dst_f); -} - -template<> -__device__ __forceinline__ void set_rows_1(const float * src_f, half * dst_h) { - convert_f32_f16(src_f, dst_h); -} - -template<> -__device__ __forceinline__ void set_rows_1(const float * src_f, nv_bfloat16 * dst_b) { - convert_f32_bf16(src_f, dst_b); -} - -template<> -__device__ __forceinline__ void set_rows_1(const float * src_f, float * dst_f) { - convert_f32_f32(src_f, dst_f); +__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) { + convert_flt(src_f, dst_f); } // Generic quantized set_rows kernel template From c91361379a4f376494f1c3678e97fd091e4eabda Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 22 Jul 2025 10:35:21 -0500 Subject: [PATCH 0140/1348] vulkan: fix rms_norm_mul to handle broadcasting dim0 (llama/14817) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 +- ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c3f1369b663..1a7a381ce59 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -10248,7 +10248,7 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st } // if rms_norm is the B operand, then we don't handle broadcast if (rms_norm == mul->src[1] && - mul->src[0]->ne[1] != rms_norm->ne[1]) { + !ggml_are_same_shape(mul->src[0], rms_norm)) { return false; } // rms_norm shader assumes contiguous rows diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp index 6428ca7ba33..bdd7db2d698 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp @@ -50,8 +50,14 @@ void main() { const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); if (do_multiply) { - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { - data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); + if (ncols > p.ne10) { + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)])); + } + } else { + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); + } } } else { [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { From f8402d0a95f1bb080a7d0f99047d71f97b197759 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 23 Jul 2025 09:25:42 +0800 Subject: [PATCH 0141/1348] CUDA: add fused rms norm (llama/14800) --- ggml/src/ggml-cuda/ggml-cuda.cu | 41 ++++++++++++++ ggml/src/ggml-cuda/norm.cu | 97 +++++++++++++++++++++++++++++++-- ggml/src/ggml-cuda/norm.cuh | 2 + 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 548bc31ce21..03c380897cd 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -2765,6 +2766,39 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { } #endif +static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } + + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) { + const ggml_tensor *rms_norm = cgraph->nodes[node_idx]; + const ggml_tensor *mul = cgraph->nodes[node_idx+1]; + + GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); + + //rms norm only supports F32 + if (mul->src[0]->type != GGML_TYPE_F32 || + mul->src[1]->type != GGML_TYPE_F32 || + mul->type != GGML_TYPE_F32) { + return false; + } + + //if rms norm is the B operand, then we don't handle broadcast + if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm->src[1])) { + return false; + } + + //rms_norm kernel assumes contigous rows + if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { + return false; + } + } + + return true; +} + static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { // flag used to determine whether it is an integrated_gpu @@ -2774,6 +2808,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. // With the use of CUDA graphs, the execution will be performed by the graph launch. if (!use_cuda_graph || cuda_graph_update_required) { + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2781,6 +2816,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } + static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); + if (!disable_fusion && ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); + i++; + continue; + } #ifndef NDEBUG assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); for (int j = 0; j < GGML_MAX_SRC; j++) { diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 0020dbcec5f..bddcca51b7b 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -104,10 +104,12 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr } } -template +template static __global__ void rms_norm_f32( const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, - const int64_t stride_sample, const float eps) { + const int64_t stride_sample, const float eps, const float * mul = nullptr, const int64_t mul_stride_row = 0, + const int64_t mul_stride_channel = 0, const int64_t mul_stride_sample = 0, const int mul_ncols = 0, + const int mul_nrows = 0, const int mul_nchannels = 0, const int mul_nsamples = 0) { const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -119,6 +121,13 @@ static __global__ void rms_norm_f32( x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; + if constexpr (do_multiply) { + const int mul_row = row % mul_nrows; + const int mul_channel = channel % mul_nchannels; + const int mul_sample = sample % mul_nsamples; + mul += mul_sample*mul_stride_sample + mul_channel*mul_stride_channel + mul_row*mul_stride_row; + } + float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { @@ -145,7 +154,12 @@ static __global__ void rms_norm_f32( const float scale = rsqrtf(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[col] = scale * x[col]; + if constexpr (do_multiply) { + const int mul_col = col % mul_ncols; + dst[col] = scale * x[col] * mul[mul_col]; + } else { + dst[col] = scale * x[col]; + } } } @@ -310,10 +324,30 @@ static void rms_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); - rms_norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + rms_norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + } else { + const dim3 block_dims(1024, 1, 1); + rms_norm_f32<1024, false><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + } +} + +static void rms_norm_mul_f32_cuda( + const float * x, const float * mul, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, + const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample, + const int mul_ncols, const int mul_nrows, const int mul_nchannels, const int mul_nsamples, + const float eps, cudaStream_t stream) { + const dim3 blocks_num(nrows, nchannels, nsamples); + if (mul == nullptr) { + rms_norm_f32_cuda(x, dst, ncols, nrows, nchannels, nsamples, stride_row, stride_channel, stride_sample, eps, stream); + return; + } + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + rms_norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols, mul_nrows, mul_nchannels, mul_nsamples); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + rms_norm_f32<1024, true><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols, mul_nrows, mul_nchannels, mul_nsamples); } } @@ -407,6 +441,59 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { rms_norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream); } +void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * mul_tensor) { + const ggml_tensor * rms_norm_src = (ggml_tensor *) dst->src[0]; + float eps = 0.0f; + + memcpy(&eps, dst->op_params, sizeof(float)); + + const float * src0_d = (const float *) rms_norm_src->data; + const float * mul_d = nullptr; + const ggml_tensor * mul_src = nullptr; + + if (mul_tensor->src[0] == dst) { + mul_d = (float *) mul_tensor->src[1]->data; + mul_src = mul_tensor->src[1]; + } else if(mul_tensor->src[1] == dst) { + mul_d = (float *) mul_tensor->src[0]->data; + mul_src = mul_tensor->src[0]; + } else { + GGML_ASSERT(false); + } + + float * dst_d = (float *) mul_tensor->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(rms_norm_src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32); + GGML_ASSERT(eps >= 0.0f); + + const int64_t ne00 = rms_norm_src->ne[0]; + const int64_t ne01 = rms_norm_src->ne[1]; + const int64_t ne02 = rms_norm_src->ne[2]; + const int64_t ne03 = rms_norm_src->ne[3]; + + const size_t ts0 = ggml_type_size(rms_norm_src->type); + GGML_ASSERT(rms_norm_src->nb[0] == ts0); + const int64_t s01 = rms_norm_src->nb[1] / ts0; + const int64_t s02 = rms_norm_src->nb[2] / ts0; + const int64_t s03 = rms_norm_src->nb[3] / ts0; + + const size_t ts_mul = ggml_type_size(mul_src->type); + GGML_ASSERT(mul_src->nb[0] == ts_mul); + const int64_t mul_s01 = mul_src->nb[1] / ts_mul; + const int64_t mul_s02 = mul_src->nb[2] / ts_mul; + const int64_t mul_s03 = mul_src->nb[3] / ts_mul; + + const int mul_ncols = mul_src->ne[0]; + const int mul_nrows = mul_src->ne[1]; + const int mul_nchannels = mul_src->ne[2]; + const int mul_nsamples = mul_src->ne[3]; + + rms_norm_mul_f32_cuda(src0_d, mul_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, mul_s01, mul_s02, mul_s03, mul_ncols, mul_nrows, mul_nchannels, mul_nsamples, eps, stream); +} + void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * grad = dst->src[0]; // gradients const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh index 706a5660a68..7ea7bd4df3c 100644 --- a/ggml/src/ggml-cuda/norm.cuh +++ b/ggml/src/ggml-cuda/norm.cuh @@ -6,6 +6,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * mul_tensor); + void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); From 49d5540206a1651db99f405b7a9d748496e66b79 Mon Sep 17 00:00:00 2001 From: chen fan <350211548@qq.com> Date: Wed, 23 Jul 2025 11:58:00 +0800 Subject: [PATCH 0142/1348] CANN: weight format to NZ for Ascend310P3 (llama/14407) * weight format to nz for 310p * remove quant weight format to nz * clean code * fix * make the conditions for converting weights to NZ format consistent * clean code --- ggml/src/ggml-cann/aclnn_ops.cpp | 23 ++++++++++- ggml/src/ggml-cann/aclnn_ops.h | 32 ++++++++++++++++ ggml/src/ggml-cann/ggml-cann.cpp | 65 ++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 4d5c2c18252..76bed4e8cd0 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2], bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5]}; - aclTensor* acl_weight_tensor = - ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); + aclTensor* acl_weight_tensor; + + bool weightToNZ = false; +#ifdef ASCEND_310P + weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr); +#endif + if (weightToNZ && is_matmul_weight(weight)) { + int64_t acl_stride[2] = {1, transpose_ne[1]}; + + // Reverse ne. + std::reverse(transpose_ne, transpose_ne + n_dims); + + std::vector storageDims = {transpose_ne[0], transpose_ne[1]}; + + acl_weight_tensor = aclCreateTensor( + transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride, + 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data); + } else { + acl_weight_tensor = + ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND); + } aclTensor* acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 80ce80baea0..924da66ed68 100755 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -23,6 +23,7 @@ #ifndef CANN_ACLNN_OPS #define CANN_ACLNN_OPS +#include #include #include #include @@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe */ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst); +/** + * @brief Check whether a tensor is a weight tensor for matrix multiplication. + * + * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations, + * typically within neural network layers. The function maintains a static set of canonical weight + * naming suffixes from Transformer-based architectures. Uses substring matching to identify weight + * tensors even with hierarchical naming patterns. + * + * @param tensor Pointer to the target ggml_tensor object (const-qualified). + */ +static bool is_matmul_weight(const ggml_tensor* tensor) { + std::string name = ggml_get_name(tensor); + static const std::unordered_set weight_suffixes{ + "output.weight", + "attn_q.weight", + "attn_k.weight", + "attn_v.weight", + "attn_output.weight", + "ffn_gate.weight", + "ffn_up.weight", + "ffn_down.weight" + }; + + for (const auto& suffix : weight_suffixes) { + if (name.find(suffix) != std::string::npos) { + return true; + } + } + return false; +} + /** * @brief Applies a element-wise operation to two input tensors using the CANN * backend. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index e5e11d4cdce..f30241aca40 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( return GGML_STATUS_SUCCESS; } +static int CreateAclTensorWeight(const void *hostData, const std::vector &shape, void **deviceAddr, + aclDataType dataType, aclTensor **tensor) +{ + uint64_t size = 1; + for (auto i : shape) { + size *= i; + } + + const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size()); + ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size)); + + size *= sizeof(int16_t); + + ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE); + + std::vector strides(shape.size(), 1); + for (int64_t i = shape.size() - 2; i >= 0; i--) { + strides[i] = shape[i + 1] * strides[i + 1]; + } + + *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND, + shape.data(), shape.size(), *deviceAddr); + return 0; +} + +static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) { + aclrtStream stream; + ACL_CHECK(aclrtCreateStream(&stream)); + + std::vector weightTransposedShape = {tensor->ne[1], tensor->ne[0]}; + void *weightTransposedDeviceAddr = nullptr; + aclTensor *weightTransposed = nullptr; + CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr, + ggml_cann_type_mapping(tensor->type), &weightTransposed); + + uint64_t workspaceSize = 0; + aclOpExecutor *executor; + void *workspaceAddr = nullptr; + + // TransMatmulWeight + ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); + std::unique_ptr workspaceAddrPtrTrans(nullptr, aclrtFree); + if (workspaceSize > 0) { + ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + workspaceAddrPtrTrans.reset(workspaceAddr); + } + ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream)); + + size_t size = ggml_nelements(tensor) * ggml_element_size(tensor); + + aclrtMemcpy((char *)tensor->data + offset, size, + weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE); + ACL_CHECK(aclDestroyTensor(weightTransposed)); + aclrtFree(weightTransposedDeviceAddr); +} + // TODO: need handle tensor which has paddings. /** * @brief Set tensor data in a CANN buffer. @@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor( // For acl, synchronous functions use this default stream. // Why aclrtSynchronizeDevice? + bool weightToNZ = false; +#ifdef ASCEND_310P + weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr); +#endif if (!need_transform(tensor->type)) { ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE)); + if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) { + weight_format_to_nz(tensor, data, offset); + } } else { void *transform_buffer = malloc(size); ggml_backend_cann_transform(tensor, data, transform_buffer); From 026d8a0c6e2da9f5f9079f7e99dd1df086715eb7 Mon Sep 17 00:00:00 2001 From: lixing-star <104126818+lixing-star@users.noreply.github.com> Date: Wed, 23 Jul 2025 14:39:51 +0800 Subject: [PATCH 0143/1348] ggml: fix loongarch quantize_row_q8_1 error (llama/14827) --- ggml/src/ggml-cpu/arch/loongarch/quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index 9e33fb32286..7908da4d16b 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) ); max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); __m128 tmp = max4; - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 )); const float max_scalar = ((v4f32)max4)[0]; // Quantize these floats From a65976fc3cb3359d02a374aaaa7fb6855f2a7dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 23 Jul 2025 12:35:53 +0200 Subject: [PATCH 0144/1348] CUDA: fix quantized KV cache + multiple sequences (llama/14822) * CUDA: fix quantized KV cache + multiple sequences * Update ggml/src/ggml-cuda/fattn-common.cuh Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-cuda/convert.cu | 81 +++++++++++++++++++++++------ ggml/src/ggml-cuda/fattn-common.cuh | 61 +++++++++++++++------- 2 files changed, 107 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index eeaa14bf579..1b4a71bab07 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -6,24 +6,33 @@ #define CUDA_Q8_0_NE_ALIGN 2048 template -static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { - const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x); +static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, + const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t s01, const int64_t s02, const int64_t s03) { + const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x); - if (i >= k) { + if (i00 >= ne00) { return; } - const int64_t ib = i/qk; // block index - const int64_t iqs = (i%qk)/qr; // quant index - const int64_t iybs = i - i%qk; // y block start index + const int64_t i01 = blockIdx.y; + const int64_t i02 = blockIdx.z % ne02; + const int64_t i03 = blockIdx.z / ne02; + + const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01; + + const int64_t ib = ibx0 + i00/qk; // block index + const int64_t iqs = (i00%qk)/qr; // quant index + const int64_t iybs = i00 - i00%qk; // y block start index const int64_t y_offset = qr == 1 ? 1 : qk/2; // dequantize dfloat2 v; dequantize_kernel(vx, ib, iqs, v); - y[iybs + iqs + 0] = v.x; - y[iybs + iqs + y_offset] = v.y; + const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs; + y[iy0 + 0] = v.x; + y[iy0 + y_offset] = v.y; } template @@ -457,9 +466,17 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst } template -static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { - const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); - dequantize_block<<>>(vx, y, k); +static void dequantize_block_cuda(const void * vx, dst_t * y, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) { + const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03); + dequantize_block<<>> + (vx, y, ne00, ne01, ne02, s01, s02, s03); +} + +template +static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { + dequantize_block_cuda(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream); } static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) { @@ -624,14 +641,14 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda; case GGML_TYPE_Q5_0: - return dequantize_block_cuda; + return dequantize_block_cont_cuda; case GGML_TYPE_Q5_1: - return dequantize_block_cuda; + return dequantize_block_cont_cuda; case GGML_TYPE_Q8_0: if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) { return dequantize_block_q8_0_f16_cuda; } - return dequantize_block_cuda; + return dequantize_block_cont_cuda; case GGML_TYPE_Q2_K: return dequantize_row_q2_K_cuda; case GGML_TYPE_Q3_K: @@ -676,11 +693,11 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda; case GGML_TYPE_Q5_0: - return dequantize_block_cuda; + return dequantize_block_cont_cuda; case GGML_TYPE_Q5_1: - return dequantize_block_cuda; + return dequantize_block_cont_cuda; case GGML_TYPE_Q8_0: - return dequantize_block_cuda; + return dequantize_block_cont_cuda; case GGML_TYPE_Q2_K: return dequantize_row_q2_K_cuda; case GGML_TYPE_Q3_K: @@ -722,6 +739,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { switch (type) { case GGML_TYPE_F32: return convert_unary_cuda; + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: @@ -733,6 +760,16 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) { switch (type) { case GGML_TYPE_F32: return convert_unary_cuda; + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -744,6 +781,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) { switch (type) { case GGML_TYPE_F16: return convert_unary_cuda; + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 9122fca6cf9..3644ddf2fdf 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -745,33 +745,58 @@ void launch_fattn( size_t nb23 = V ? V->nb[3] : nb13; if (need_f16_K && K->type != GGML_TYPE_F16) { - GGML_ASSERT(ggml_is_contiguously_allocated(K)); - K_f16.alloc(ggml_nelements(K)); - to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type); - to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream); - K_data = (char *) K_f16.ptr; - const size_t bs = ggml_blck_size(K->type); const size_t ts = ggml_type_size(K->type); - nb11 = nb11*bs*sizeof(half)/ts; - nb12 = nb12*bs*sizeof(half)/ts; - nb13 = nb13*bs*sizeof(half)/ts; + K_f16.alloc(ggml_nelements(K)); + if (ggml_is_contiguously_allocated(K)) { + to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type); + to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream); + + nb11 = nb11*bs*sizeof(half)/ts; + nb12 = nb12*bs*sizeof(half)/ts; + nb13 = nb13*bs*sizeof(half)/ts; + } else { + GGML_ASSERT(K->nb[0] == ts); + to_fp16_nc_cuda_t to_fp16 = ggml_get_to_fp16_nc_cuda(K->type); + const int64_t s01 = nb11 / ts; + const int64_t s02 = nb12 / ts; + const int64_t s03 = nb13 / ts; + to_fp16(K_data, K_f16.ptr, K->ne[0], K->ne[1], K->ne[2], K->ne[3], s01, s02, s03, main_stream); + + nb11 = K->ne[0] * sizeof(half); + nb12 = K->ne[1] * nb11; + nb13 = K->ne[2] * nb12; + } + K_data = (char *) K_f16.ptr; } if (V && need_f16_V && V->type != GGML_TYPE_F16) { - GGML_ASSERT(ggml_is_contiguously_allocated(V)); - V_f16.alloc(ggml_nelements(V)); - to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type); - to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream); - V_data = (char *) V_f16.ptr; - const size_t bs = ggml_blck_size(V->type); const size_t ts = ggml_type_size(V->type); - nb21 = nb21*bs*sizeof(half)/ts; - nb22 = nb22*bs*sizeof(half)/ts; - nb23 = nb23*bs*sizeof(half)/ts; + V_f16.alloc(ggml_nelements(V)); + if (ggml_is_contiguously_allocated(V)) { + to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type); + to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream); + V_data = (char *) V_f16.ptr; + + nb21 = nb21*bs*sizeof(half)/ts; + nb22 = nb22*bs*sizeof(half)/ts; + nb23 = nb23*bs*sizeof(half)/ts; + } else { + GGML_ASSERT(V->nb[0] == ts); + to_fp16_nc_cuda_t to_fp16 = ggml_get_to_fp16_nc_cuda(V->type); + const int64_t s01 = nb21 / ts; + const int64_t s02 = nb22 / ts; + const int64_t s03 = nb23 / ts; + to_fp16(V_data, V_f16.ptr, V->ne[0], V->ne[1], V->ne[2], V->ne[3], s01, s02, s03, main_stream); + + nb21 = V->ne[0] * sizeof(half); + nb22 = V->ne[1] * nb21; + nb23 = V->ne[2] * nb22; + } + V_data = (char *) V_f16.ptr; } int parallel_blocks = 1; From 8272aa9f14c4bcd9762d9c202f6e3eb21d128bcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 23 Jul 2025 18:22:30 +0200 Subject: [PATCH 0145/1348] CUDA: fix compilation with GGML_CUDA_F16 (llama/14837) --- ggml/src/ggml-cuda/convert.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 1b4a71bab07..15c927861f0 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ dequantize_kernel(vx, ib, iqs, v); const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs; - y[iy0 + 0] = v.x; - y[iy0 + y_offset] = v.y; + y[iy0 + 0] = float(v.x); + y[iy0 + y_offset] = float(v.y); } template From 95efcf011d298032c12f70c99b1f232d0b3696fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 23 Jul 2025 21:43:25 +0200 Subject: [PATCH 0146/1348] CUDA: fix overflow in FA, tune performance (llama/14840) --- ggml/src/ggml-cuda/fattn-common.cuh | 45 ++++++----------------- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 55 ++++++++-------------------- ggml/src/ggml-cuda/fattn-tile-f16.cu | 41 +++++---------------- ggml/src/ggml-cuda/fattn-tile-f32.cu | 45 ++++++----------------- ggml/src/ggml-cuda/fattn-vec-f16.cuh | 52 ++++++++++---------------- ggml/src/ggml-cuda/fattn-vec-f32.cuh | 51 +++++++++----------------- ggml/src/ggml-cuda/fattn-wmma-f16.cu | 39 +++++--------------- ggml/src/ggml-cuda/fattn.cu | 16 ++------ 8 files changed, 98 insertions(+), 246 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 3644ddf2fdf..95e704e393c 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -23,33 +23,13 @@ typedef void (* fattn_kernel_t)( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3); + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33); typedef half (*vec_dot_KQ_f16_t)( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds); @@ -892,14 +872,11 @@ void launch_fattn( mask ? ((const char *) mask->data) : nullptr, !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr, scale, max_bias, m0, m1, n_head_log2, logit_softcap, - Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], - K->ne[0], K->ne[1], K->ne[2], K->ne[3], - mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0, - mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0, - Q->nb[1], Q->nb[2], Q->nb[3], - nb11, nb12, nb13, + Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3], + K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13, nb21, nb22, nb23, - KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] + mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0, + mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0 ); CUDA_CHECK(cudaGetLastError()); diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 6fa2e77299e..565853bfecd 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -408,7 +408,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const int stride_K, const int stride_V, const int stride_mask, - const int jt, half2 * const __restrict__ tile_Q, half2 * const __restrict__ tile_K, half2 * const __restrict__ tile_V, @@ -455,7 +454,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( cp_async_wait_all(); __syncthreads(); flash_attn_ext_f16_load_tile - (V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V); + (V_h2 + int64_t(k_VKQ_0)*stride_V, tile_V, nbatch_V2, stride_V); } else { constexpr bool use_cp_async = nstages == 1; if (ncols2 > 1 || mask_h2) { @@ -471,7 +470,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( if (nstages <= 1) { constexpr bool use_cp_async = nstages == 1; flash_attn_ext_f16_load_tile - (K_h2 + k_VKQ_0*stride_K + k0_start, tile_K, k0_diff, stride_K); + (K_h2 + int64_t(k_VKQ_0)*stride_K + k0_start, tile_K, k0_diff, stride_K); if (use_cp_async) { cp_async_wait_all(); } @@ -715,7 +714,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( (mask_h2 + (k_VKQ_0 + c::nbatch_fa)/2, tile_mask, stride_mask); } flash_attn_ext_f16_load_tile - (K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K); + (K_h2 + int64_t(k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K); } } @@ -732,7 +731,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( if (nstages <= 1 && i0_start < reusable_cutoff) { constexpr bool use_cp_async = nstages == 1; flash_attn_ext_f16_load_tile - (V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V); + (V_h2 + int64_t(k_VKQ_0)*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V); if (use_cp_async) { cp_async_wait_all(); } @@ -771,8 +770,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup); GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); - GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K); - GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K); + GGML_UNUSED(stride_mask); GGML_UNUSED(tile_K); GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B); GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum); GGML_UNUSED(kb0); GGML_UNUSED(tile_Q); @@ -920,7 +918,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( (mask_h2 + kb0_start*c::nbatch_fa/2, tile_mask, stride_mask); } flash_attn_ext_f16_load_tile - (K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K); + (K_h2 + int64_t(kb0_start)*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K); } // Iterate over ne11 == previous tokens: @@ -928,13 +926,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( constexpr bool last_iter = false; flash_attn_ext_f16_iter (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap, - ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); + ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); } { // kb0_start is always < kb0_stop so the last iter can be executed unconditionally. constexpr bool last_iter = true; flash_attn_ext_f16_iter (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap, - ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1); + ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1); } // With multi-stage loading there is no __syncthreads at the end of the iter, @@ -1214,33 +1212,13 @@ static __global__ void flash_attn_ext_f16( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33) { #if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) // Skip unused kernel variants for faster compilation: @@ -1359,8 +1337,7 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); - GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(nb22); GGML_UNUSED(nb23); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 1f141328845..7661c21efbb 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -21,33 +21,13 @@ static __global__ void flash_attn_tile_ext_f16( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33) { #if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) // Skip unused kernel variants for faster compilation: @@ -127,7 +107,7 @@ static __global__ void flash_attn_tile_ext_f16( for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { const int k_KQ = k_KQ_0 + threadIdx.x; - KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; + KV_tmp[i_KQ][k_KQ] = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; } } @@ -221,7 +201,7 @@ static __global__ void flash_attn_tile_ext_f16( for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i]; + KV_tmp[k][i] = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i]; } } @@ -300,8 +280,7 @@ static __global__ void flash_attn_tile_ext_f16( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(nb23); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index a4965583cef..2e2ed5cd566 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -21,33 +21,13 @@ static __global__ void flash_attn_tile_ext_f32( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33) { #ifdef FLASH_ATTN_AVAILABLE // Skip unused kernel variants for faster compilation: @@ -66,8 +46,7 @@ static __global__ void flash_attn_tile_ext_f32( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(nb23); NO_DEVICE_CODE; return; } @@ -135,7 +114,7 @@ static __global__ void flash_attn_tile_ext_f32( #pragma unroll for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) { - const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x]; + const half2 tmp = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x]; KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp); KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp); } @@ -231,8 +210,9 @@ static __global__ void flash_attn_tile_ext_f32( for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - KV_tmp2[k*(D/2) + i].x = __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]); - KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]); + const half2 tmp = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i]; + KV_tmp2[k*(D/2) + i].x = __low2float(tmp); + KV_tmp2[k*(D/2) + i].y = __high2float(tmp); } } @@ -312,7 +292,6 @@ static __global__ void flash_attn_tile_ext_f32( GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index b2d469938ab..f6ef236be98 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -18,33 +18,13 @@ static __global__ void flash_attn_vec_ext_f16( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33) { #if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) // Skip unused kernel variants for faster compilation: @@ -191,13 +171,16 @@ static __global__ void flash_attn_vec_ext_f16( half2 VKQ[ncols] = {{0.0f, 0.0f}}; + K += blockIdx.y*D * nb11; + V += blockIdx.y*D * nb21; + maskh += blockIdx.y*D; for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) { // Calculate KQ tile and keep track of new maximum KQ values: if (mask) { #pragma unroll for (int j = 0; j < ncols; ++j) { - maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid]; + maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid]; } __syncthreads(); @@ -244,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j = 0; j < ncols; ++j) { - half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); + half sum = vec_dot_KQ(K + i_KQ*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); sum = warp_reduce_sum((float)sum); if (use_logit_softcap) { @@ -300,14 +283,18 @@ static __global__ void flash_attn_vec_ext_f16( } half2 V_k; - reinterpret_cast(V_k.x) = dequantize_1_v(V + (k_VKQ_0 + k0 + 0)*nb21, tid); - reinterpret_cast(V_k.y) = dequantize_1_v(V + (k_VKQ_0 + k0 + 1)*nb21, tid); + reinterpret_cast(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid); + reinterpret_cast(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid); #pragma unroll for (int j = 0; j < ncols; ++j) { VKQ[j] += V_k*KQ2[j*(D/2) + k0/2]; } } + K += gridDim.y*D * nb11; + V += gridDim.y*D * nb21; + maskh += gridDim.y*D; + __syncthreads(); } @@ -351,8 +338,7 @@ static __global__ void flash_attn_vec_ext_f16( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(nb23); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index 405b6f5106e..6a4bdc0ff9a 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -18,33 +18,13 @@ static __global__ void flash_attn_vec_ext_f32( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33) { #ifdef FLASH_ATTN_AVAILABLE // Skip unused kernel variants for faster compilation: @@ -59,8 +39,7 @@ static __global__ void flash_attn_vec_ext_f32( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1); - GGML_UNUSED(ne2); GGML_UNUSED(ne3); + GGML_UNUSED(nb23); NO_DEVICE_CODE; return; } @@ -198,13 +177,16 @@ static __global__ void flash_attn_vec_ext_f32( float VKQ[ncols] = {0.0f}; + K += blockIdx.y*D * nb11; + V += blockIdx.y*D * nb21; + maskh += blockIdx.y*D; for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) { // Calculate KQ tile and keep track of new maximum KQ values: if (mask) { #pragma unroll for (int j = 0; j < ncols; ++j) { - maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]); + maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]); } __syncthreads(); @@ -246,7 +228,7 @@ static __global__ void flash_attn_vec_ext_f32( #pragma unroll for (int j = 0; j < ncols; ++j) { - float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]); + float sum = vec_dot_KQ(K + i_KQ*nb11, Q_f2[j], Q_i32[j], Q_ds[j]); sum = warp_reduce_sum(sum); if (use_logit_softcap) { @@ -297,13 +279,17 @@ static __global__ void flash_attn_vec_ext_f32( break; } - const float V_ki = dequantize_1_v(V + (k_VKQ_0 + k)*nb21, tid); + const float V_ki = dequantize_1_v(V + k*nb21, tid); #pragma unroll for (int j = 0; j < ncols; ++j) { VKQ[j] += V_ki*KQ[j*D + k]; } } + K += gridDim.y*D * nb11; + V += gridDim.y*D * nb21; + maskh += gridDim.y*D; + __syncthreads(); } @@ -348,7 +334,6 @@ static __global__ void flash_attn_vec_ext_f32( GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index 741b8781d29..c9b083bed01 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -37,33 +37,13 @@ static __global__ void flash_attn_ext_f16( const float m1, const uint32_t n_head_log2, const float logit_softcap, - const int ne00, - const int ne01, - const int ne02, - const int ne03, - const int ne10, - const int ne11, - const int ne12, - const int ne13, - const int ne31, - const int ne32, - const int ne33, - const int nb31, - const int nb32, - const int nb33, - const int nb01, - const int nb02, - const int nb03, - const int nb11, - const int nb12, - const int nb13, - const int nb21, - const int nb22, - const int nb23, - const int ne0, - const int ne1, - const int ne2, - const int ne3) { + const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03, + const int32_t nb01, const int32_t nb02, const int32_t nb03, + const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13, + const int32_t nb11, const int32_t nb12, const int64_t nb13, + const int32_t nb21, const int32_t nb22, const int64_t nb23, + const int32_t ne31, const int32_t ne32, const int32_t ne33, + const int32_t nb31, const int32_t nb32, const int64_t nb33) { #if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE))) // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { @@ -197,7 +177,7 @@ static __global__ void flash_attn_ext_f16( #pragma unroll for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) { frag_a_K K_a; - wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV); + wmma::load_matrix_sync(K_a, K_h + int64_t(k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV); #pragma unroll for (int j = 0; j < ncols/frag_n; ++j) { wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]); @@ -344,7 +324,7 @@ static __global__ void flash_attn_ext_f16( const int k = k0 + (threadIdx.y % VKQ_ratio)*16; frag_a_V v_a; - wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV); + wmma::load_matrix_sync(v_a, V_h + int64_t(k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV); #pragma unroll for (int j = 0; j < ncols/frag_n; ++j) { wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]); @@ -451,7 +431,6 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE))) } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 6bc0096cc65..d9f1613051d 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -280,22 +280,12 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size; const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); - if (GGML_CUDA_CC_IS_AMD(cc)) { #if defined(GGML_HIP_ROCWMMA_FATTN) - if (fp16_mma_available(cc)) { - ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst); - return; - } -#endif // defined(GGML_HIP_ROCWMMA_FATTN) - - // On AMD the tile kernels perform poorly, use the vec kernel instead: - if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { - ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); - } else { - ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); - } + if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) { + ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst); return; } +#endif // defined(GGML_HIP_ROCWMMA_FATTN) if (!fast_fp16_available(cc)) { if (Q->ne[1] <= 8 || Q->ne[0] == 256) { From e648f9f0796e2bc07409d645fac32c433411a9b1 Mon Sep 17 00:00:00 2001 From: Donghyeon Jeong <54725479+djeong20@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:50:41 +0900 Subject: [PATCH 0147/1348] sycl: fix undefined variable in work group size check (llama/14843) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 872eb4b052d..a023d6fb452 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3531,7 +3531,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, stream->memset(dev_cur_src1_row.get(), 0, sizeof(int)))); const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device]; - assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); + assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0); { sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size)); From faedce5dcbb93eb5c15ee742074ef639995ef368 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Jul 2025 10:24:05 +0300 Subject: [PATCH 0148/1348] metal : fix fusion across different encoders (llama/14849) * metal : fix fusion across different encoders ggml-ci * cont : add assertion ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index dc391a0d4d5..1a9999325fe 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1955,6 +1955,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex static int ggml_metal_encode_node( ggml_backend_t backend, int idx, + int idx_end, id encoder, struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; @@ -2181,7 +2182,9 @@ static int ggml_metal_encode_node( size_t offs_fuse; id id_fuse; - for (n_fuse = 0; n_fuse <= 6; ++n_fuse) { + // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing nodes + // across splits. idx_end indicates the last node in the current split + for (n_fuse = 0; n_fuse <= 6 && idx + n_fuse + 1 < idx_end; ++n_fuse) { if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) { break; } @@ -4288,7 +4291,7 @@ static int ggml_metal_encode_node( ops[1] = GGML_OP_MUL; ops[2] = GGML_OP_ADD; - for (n_fuse = 0; n_fuse <= 1; ++n_fuse) { + for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) { if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) { break; } @@ -6271,7 +6274,11 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - const int res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); + const int res = ggml_metal_encode_node(backend, idx, node_end, encoder, mem_pool); + if (idx + res > node_end) { + GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s", + "https://github.com/ggml-org/llama.cpp/pull/14849"); + } if (should_capture) { [encoder popDebugGroup]; From 7dc5ae2d6a2080e1a719ce9944b34d276d0a85d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 24 Jul 2025 11:09:57 +0100 Subject: [PATCH 0149/1348] sycl: fixed semantics of block offset calculation (llama/14814) --- ggml/src/ggml-sycl/quants.hpp | 17 ++++++++--------- ggml/src/ggml-sycl/vecdotq.hpp | 8 ++------ 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp index 8b952db43bf..d0d5ac9a4e8 100644 --- a/ggml/src/ggml-sycl/quants.hpp +++ b/ggml/src/ggml-sycl/quants.hpp @@ -48,11 +48,11 @@ template <> struct block_q_t { }; static constexpr std::pair get_block_offset(const int block_index, const int /* nblocks */) { - return { block_index * (traits::qk / traits::qr), 0 }; + return { block_index * (QK4_0 / QR4_0), 0 }; } static constexpr std::pair get_d_offset(int nrows, int ncols, const int block_index) { - return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 }; + return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 }; } static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } @@ -71,14 +71,12 @@ template <> struct block_q_t { } static constexpr std::pair get_d_offset(int nrows, int ncols, const int block_index) { - auto nblocks = (nrows * (ncols / traits::qk)); - return { nblocks * (QK_K / 2), + auto nblocks = (nrows * (ncols / QK_K)); + return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE), (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) }; } static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } - - constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; } }; template <> struct block_q_t { @@ -90,22 +88,23 @@ template <> struct block_q_t { }; static constexpr std::pair get_block_offset(const int block_index, const int n_blocks) { - auto low_bits_index = block_index * (traits::qk / traits::qr); + auto low_bits_index = block_index * (QK_K / QR6_K); // the index of high bits it's after all low bits auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4)); return { low_bits_index, high_bits_index }; } static constexpr std::pair get_d_offset(int nrows, int ncols, const int block_index) { - auto nblocks = (nrows * (ncols / traits::qk)); + auto nblocks = (nrows * (ncols / QK_K)); auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4); auto block_scales = total_qs_bytes + block_index * (QK_K / 16); - auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16); + auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half); return { block_scales, sb_scale }; } static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } }; + } // namespace ggml_sycl_reordered #endif // GGML_SYCL_QUANTS_HPP diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index 0a5d4999419..4088ddb54f0 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -350,11 +350,9 @@ template <> struct reorder_vec_dot_q_sycl { __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair ibx_offset, const std::pair d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds, const int & iqs) { - const int ib = ibx_offset.first / (QK_K / 2); - const uint8_t * base = static_cast(vbq); const uint8_t * qs = base + ibx_offset.first; - const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE; + const uint8_t * scs = base + d_offset.first; const ggml_half2 * dms = reinterpret_cast(base + d_offset.second); const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2)); @@ -427,13 +425,11 @@ template <> struct reorder_vec_dot_q_sycl { __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair ibx_offset, const std::pair d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds, const int iqs) { - const int ib = ibx_offset.first / (QK_K / 2); - const uint8_t * base = static_cast(vbq); const uint8_t * ql = base + ibx_offset.first; const uint8_t * qh = base + ibx_offset.second; const int8_t * scales = reinterpret_cast(base + d_offset.first); - const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib; + const ggml_half * d = (const ggml_half *) (base + d_offset.second); const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4); const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8); From 5823eabc783c7694e4fda4737b07d22dacd5d0b2 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Thu, 24 Jul 2025 19:58:02 +0200 Subject: [PATCH 0150/1348] cmake : Indent ggml-config.cmake (ggml/1310) --- ggml/cmake/ggml-config.cmake.in | 127 ++++++++++++++++---------------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index 48704352cf4..fe34cda4e01 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -102,89 +102,88 @@ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@") #set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@") if(NOT TARGET ggml::ggml) + find_package(Threads REQUIRED) -find_package(Threads REQUIRED) - -find_library(GGML_LIBRARY ggml - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - -add_library(ggml::ggml UNKNOWN IMPORTED) -set_target_properties(ggml::ggml - PROPERTIES - IMPORTED_LOCATION "${GGML_LIBRARY}") - -find_library(GGML_BASE_LIBRARY ggml-base - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - -add_library(ggml::ggml-base UNKNOWN IMPORTED) -set_target_properties(ggml::ggml-base - PROPERTIES - IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") + find_library(GGML_LIBRARY ggml + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) -set(_ggml_all_targets "") -foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) - string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") - string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx) + add_library(ggml::ggml UNKNOWN IMPORTED) + set_target_properties(ggml::ggml + PROPERTIES + IMPORTED_LOCATION "${GGML_LIBRARY}") - find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend} + find_library(GGML_BASE_LIBRARY ggml-base REQUIRED HINTS ${GGML_LIB_DIR} NO_CMAKE_FIND_ROOT_PATH) - message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}") - - add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED) - set_target_properties(ggml::${_ggml_backend} + add_library(ggml::ggml-base UNKNOWN IMPORTED) + set_target_properties(ggml::ggml-base PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}" - IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}" - INTERFACE_COMPILE_FEATURES c_std_90 - POSITION_INDEPENDENT_CODE ON) - - string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}") - if(is_cpu_variant) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}") + IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") - if(GGML_CPU_INTERFACE_LINK_OPTIONS) - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}") - endif() + set(_ggml_all_targets "") + foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) + string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") + string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx) - else() - list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") + find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend} + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) + + message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}") + + add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED) set_target_properties(ggml::${_ggml_backend} PROPERTIES - INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}") + INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}" + INTERFACE_COMPILE_FEATURES c_std_90 + POSITION_INDEPENDENT_CODE ON) + + string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}") + if(is_cpu_variant) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}") + + if(GGML_CPU_INTERFACE_LINK_OPTIONS) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}") + endif() - if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS) + else() + list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") set_target_properties(ggml::${_ggml_backend} PROPERTIES - INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}") + INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}") + + if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}") + endif() endif() - endif() - list(APPEND _ggml_all_targets ggml::${_ggml_backend}) -endforeach() + list(APPEND _ggml_all_targets ggml::${_ggml_backend}) + endforeach() -list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}") -set_target_properties(ggml::ggml - PROPERTIES - INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}") + list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}") + set_target_properties(ggml::ggml + PROPERTIES + INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}") -add_library(ggml::all INTERFACE IMPORTED) -set_target_properties(ggml::all - PROPERTIES - INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}") + add_library(ggml::all INTERFACE IMPORTED) + set_target_properties(ggml::all + PROPERTIES + INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}") -endif() # TARGET ggml::ggml +endif() check_required_components(ggml) From 89ae7894506bfe2c432111ee71f1be07ca85684c Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Fri, 25 Jul 2025 03:05:37 +0800 Subject: [PATCH 0151/1348] musa: upgrade musa sdk to rc4.2.0 (llama/14498) * musa: apply mublas API changes Signed-off-by: Xiaodong Ye * musa: update musa version to 4.2.0 Signed-off-by: Xiaodong Ye * musa: restore MUSA graph settings in CMakeLists.txt Signed-off-by: Xiaodong Ye * musa: disable mudnnMemcpyAsync by default Signed-off-by: Xiaodong Ye * musa: switch back to non-mudnn images Signed-off-by: Xiaodong Ye * minor changes Signed-off-by: Xiaodong Ye * musa: restore rc in docker image tag Signed-off-by: Xiaodong Ye --------- Signed-off-by: Xiaodong Ye --- ggml/CMakeLists.txt | 2 ++ ggml/src/ggml-cuda/common.cuh | 2 +- ggml/src/ggml-cuda/cpy.cu | 14 +++++++------- ggml/src/ggml-cuda/vendors/musa.h | 4 ++-- ggml/src/ggml-musa/CMakeLists.txt | 22 ++++++++++++++++++---- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index de6d789c98a..8ca1053cab3 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF) +option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF) +option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1a2708ec9df..9435daf0b3f 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu { }; -#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) +#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS) #define USE_CUDA_GRAPH #endif diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 0e5964907e1..f9bb025643c 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -1,9 +1,9 @@ #include "cpy.cuh" #include "dequantize.cuh" #include "cpy-utils.cuh" -#ifdef GGML_USE_MUSA +#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) #include "ggml-musa/mudnn.cuh" -#endif // GGML_USE_MUSA +#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY typedef void (*cpy_kernel_t)(const char * cx, char * cdst); @@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int // Copy destination pointers to GPU to be available when pointer indirection is in use void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) { -#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers CUDA_CHECK(cudaStreamSynchronize(stream)); if (cuda_graph->dest_ptrs_d != nullptr) { @@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg char ** dest_ptrs_d = nullptr; int graph_cpynode_index = -1; -#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d; graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index; @@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg #endif if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); -#ifdef GGML_USE_MUSA +#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) { CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0)); } else -#endif // GGML_USE_MUSA +#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY { CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } @@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); } -#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; } diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 937779a90af..19896320244 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -13,7 +13,7 @@ #define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_T MUBLAS_OP_T #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT +#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH #define CUDA_R_16F MUSA_R_16F #define CUDA_R_16BF MUSA_R_16BF #define CUDA_R_32F MUSA_R_32F @@ -29,7 +29,7 @@ #define cublasSgemm mublasSgemm #define cublasStatus_t mublasStatus_t #define cublasOperation_t mublasOperation_t -#define cublasGetStatusString mublasStatus_to_string +#define cublasGetStatusString mublasGetStatusString #define cudaDataType_t musaDataType_t #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index 971314debc7..02904526ade 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND) list(APPEND GGML_SOURCES_MUSA ${SRCS}) file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu") list(APPEND GGML_SOURCES_MUSA ${SRCS}) - file(GLOB SRCS "../ggml-musa/*.cu") - list(APPEND GGML_SOURCES_MUSA ${SRCS}) + + if (GGML_MUSA_MUDNN_COPY) + file(GLOB SRCS "../ggml-musa/*.cu") + list(APPEND GGML_SOURCES_MUSA ${SRCS}) + add_compile_definitions(GGML_MUSA_MUDNN_COPY) + endif() if (GGML_CUDA_FA_ALL_QUANTS) file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu") @@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND) add_compile_definitions(GGML_USE_MUSA) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + if (GGML_MUSA_GRAPHS) + add_compile_definitions(GGML_MUSA_GRAPHS) + endif() + if (GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() @@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND) endif() if (GGML_STATIC) - # TODO: mudnn has not provided static libraries yet target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static) + # TODO: mudnn has not provided static libraries yet + # if (GGML_MUSA_MUDNN_COPY) + # target_link_libraries(ggml-musa PRIVATE mudnn_static) + # endif() else() - target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn) + target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas) + if (GGML_MUSA_MUDNN_COPY) + target_link_libraries(ggml-musa PRIVATE mudnn) + endif() endif() if (GGML_CUDA_NO_VMM) From 270fa9b25c80341a3379160e0977e1577deb9b89 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 25 Jul 2025 01:07:26 -0700 Subject: [PATCH 0152/1348] sched : fix multiple evaluations of the same graph with pipeline parallelism (llama/14855) ggml-ci --- ggml/src/ggml-backend.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b7498b8d402..eaf41e5a6c8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -647,6 +647,7 @@ struct ggml_backend_sched { // pipeline parallelism support int n_copies; int cur_copy; + int next_copy; ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; int n_graph_inputs; @@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } - sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies; - return GGML_STATUS_SUCCESS; } @@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); - ggml_backend_sched_split_graph(sched, measure_graph); - ggml_backend_sched_synchronize(sched); + ggml_backend_sched_split_graph(sched, measure_graph); + if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { return false; } @@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); + GGML_ASSERT(!sched->is_alloc); + + sched->cur_copy = sched->next_copy; + sched->next_copy = (sched->next_copy + 1) % sched->n_copies; ggml_backend_sched_split_graph(sched, graph); @@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { // if the graph is not already allocated, always use copy 0 after a synchronization // this ensures that during generation the same copy is used every time, // which avoids changes in the graph that could cause CUDA or other graphs to be disabled - sched->cur_copy = 0; + sched->next_copy = 0; } } From 56350ecc12cc3d7120535297000ec22bb549bb72 Mon Sep 17 00:00:00 2001 From: Chris Rohlf Date: Fri, 25 Jul 2025 06:17:02 -0400 Subject: [PATCH 0153/1348] rpc : check for null buffers in get/set/copy tensor endpoints (llama/14868) --- ggml/src/ggml-rpc/ggml-rpc.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index f468f796d57..29bc421d58f 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1055,7 +1055,7 @@ bool rpc_server::set_tensor(const std::vector & input) { GGML_ASSERT(ctx_ptr != nullptr); ggml_context * ctx = ctx_ptr.get(); ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); - if (tensor == nullptr) { + if (tensor == nullptr || tensor->buffer == nullptr) { GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); return false; } @@ -1124,7 +1124,7 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp GGML_ASSERT(ctx_ptr != nullptr); ggml_context * ctx = ctx_ptr.get(); ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); - if (tensor == nullptr) { + if (tensor == nullptr || tensor->buffer == nullptr) { GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); return false; } @@ -1192,7 +1192,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector< GGML_ASSERT(ctx_ptr != nullptr); ggml_context * ctx = ctx_ptr.get(); ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); - if (tensor == nullptr) { + if (tensor == nullptr || tensor->buffer == nullptr) { GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); return false; } @@ -1229,7 +1229,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co ggml_tensor * src = deserialize_tensor(ctx, &request.src); ggml_tensor * dst = deserialize_tensor(ctx, &request.dst); - if (src == nullptr || dst == nullptr) { + if (src == nullptr || dst == nullptr || src->buffer == nullptr || dst->buffer == nullptr) { GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__); return false; } From bbf2389919a950988cc2b7a084a9e0009852483a Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Fri, 25 Jul 2025 13:29:57 +0200 Subject: [PATCH 0154/1348] ggml : remove invalid portPos specifiers from dot files (llama/14838) Neither "g" nor "x" are valid portPos specifiers per the official [graphviz documents](https://graphviz.org/docs/attr-types/portPos/): > If a compass point is used, it must have the form "n","ne","e","se","s","sw","w","nw","c","_". I tested locally for it to fall back to default portPos specifier if an invalid portPos is specified. As a consequence, we can remove associated code. --- ggml/src/ggml.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5ae1c527df6..124cf3e8b60 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6640,20 +6640,18 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + fprintf(fp, " \"%p\" -> \"%p\" [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", gparent0 ? (void *) gparent0 : (void *) parent, - gparent0 ? "g" : "x", gparent ? (void *) gparent : (void *) node, - gparent ? "g" : "x", gparent ? "empty" : "vee", gparent ? "dashed" : "solid", label); } static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", - (void *) parent, "x", - (void *) node, "x", + fprintf(fp, " \"%p\" -> \"%p\" [ label = \"%s\"; ]\n", + (void *) parent, + (void *) node, label); } From d414c3f6acec1a72074228088501b3953a76a1a0 Mon Sep 17 00:00:00 2001 From: lhez Date: Fri, 25 Jul 2025 08:12:13 -0700 Subject: [PATCH 0155/1348] opencl: add fused `rms_norm_mul` (llama/14841) * opencl: add fused `rms_norm` + `mul` * opencl: improve workgroup size for `rms_norm_mul` --- ggml/src/ggml-opencl/ggml-opencl.cpp | 163 ++++++++++++++++++++++- ggml/src/ggml-opencl/kernels/rms_norm.cl | 79 +++++++++++ 2 files changed, 240 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 63ac4a989b0..c87a32383c8 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -333,6 +333,7 @@ struct ggml_backend_opencl_context { size_t max_alloc_size; bool fp16_support; bool has_vector_subgroup_broadcast; + bool disable_fusion; ggml_cl_compiler_version adreno_cl_compiler_version; int adreno_wave_size; @@ -411,7 +412,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick, kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16; cl_kernel kernel_norm; - cl_kernel kernel_rms_norm; + cl_kernel kernel_rms_norm, kernel_rms_norm_mul; cl_kernel kernel_group_norm; cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8; cl_kernel kernel_soft_max, kernel_soft_max_4; @@ -1100,7 +1101,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve backend_ctx->program_rms_norm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); - CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err)); + CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err)); + CL_CHECK((backend_ctx->kernel_rms_norm_mul = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm_mul", &err), err)); GGML_LOG_CONT("."); } @@ -2110,6 +2112,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); #endif // GGML_OPENCL_USE_ADRENO_KERNELS + backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr; + dev_ctx->backend_ctx = backend_ctx.release(); return dev_ctx->backend_ctx; } @@ -2279,7 +2283,45 @@ static void sync_with_other_backends(ggml_backend_t backend) { sync_with_other_backends(backend_ctx); } +static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } + + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) { + const ggml_tensor *rms_norm = cgraph->nodes[node_idx]; + const ggml_tensor *mul = cgraph->nodes[node_idx+1]; + + GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); + + // rms_norm only supports f32 + if (mul->src[0]->type != GGML_TYPE_F32 || + mul->src[1]->type != GGML_TYPE_F32 || + mul->type != GGML_TYPE_F32) { + return false; + } + + // if rms_norm is the B operand, then we don't handle broadcast + if (rms_norm == mul->src[1] && + !ggml_are_same_shape(mul->src[0], rms_norm->src[1])) { + return false; + } + + // rms_norm assumes contiguous rows + if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { + return false; + } + } + + return true; +} + +static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor); + static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2292,6 +2334,12 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm continue; } + if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]); + i++; + continue; + } + bool ok = ggml_cl_compute_forward(backend, node); if (!ok) { GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); @@ -4455,6 +4503,117 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) { + GGML_ASSERT(mul_tensor); + GGML_ASSERT(rms_norm_tensor); + + // src0 is the src of rms_norm, src1 is the other src of mul (one being rms_norm) + const ggml_tensor * src0 = rms_norm_tensor->src[0]; + const ggml_tensor * src1; + if (mul_tensor->src[0] == rms_norm_tensor) { + src1 = mul_tensor->src[1]; + } else if (mul_tensor->src[1] == rms_norm_tensor) { + src1 = mul_tensor->src[0]; + } else { + GGML_ASSERT(false && "Invalid args for rms_norm and mul"); + } + const ggml_tensor * dst = mul_tensor; + + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + float eps; + memcpy(&eps, rms_norm_tensor->op_params, sizeof(float)); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + const cl_ulong nb13 = src1->nb[3]; + + const cl_ulong nb1 = dst->nb[1]; + const cl_ulong nb2 = dst->nb[2]; + const cl_ulong nb3 = dst->nb[3]; + + GGML_ASSERT(ne00 % 4 == 0); + + size_t sgs; + if (backend_ctx->gpu_family == ADRENO) { + sgs = 64; + } else if (backend_ctx->gpu_family == INTEL) { + sgs = 32; + } else { + GGML_ASSERT(false && "Unsupported GPU"); + } + + cl_kernel kernel = backend_ctx->kernel_rms_norm_mul; + + int nth = sgs; + int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel); + while (nth < ne00 && nth < max_workgroup_size) { + nth *= 2; + } + nth = MIN(nth, max_workgroup_size); + nth = MIN(nth, ne00); + + size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*nth/sgs, NULL)); + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); diff --git a/ggml/src/ggml-opencl/kernels/rms_norm.cl b/ggml/src/ggml-opencl/kernels/rms_norm.cl index 9d21f3398ec..ecd053cb4c1 100644 --- a/ggml/src/ggml-opencl/kernels/rms_norm.cl +++ b/ggml/src/ggml-opencl/kernels/rms_norm.cl @@ -94,3 +94,82 @@ kernel void kernel_rms_norm( } } } + +//------------------------------------------------------------------------------ +// rms_norm_mul +//------------------------------------------------------------------------------ +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_32 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_rms_norm_mul( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb11, + ulong nb12, + ulong nb13, + ulong nb1, + ulong nb2, + ulong nb3, + float eps, + local float * sum +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0); + + global float4 * x = (global float4 *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); + global float4 * f = (global float4 *) (src1 + (i03%ne13)*nb13 + (i02%ne12)*nb12 + (i01%ne11)*nb11); + + float sumf = 0; + + // parallel sum + for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) { + sumf += dot(x[i00], x[i00]); + } + sumf = sub_group_reduce_add(sumf); + if (get_sub_group_local_id() == 0) { + sum[get_sub_group_id()] = sumf; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) { + if (get_local_id(0) < i) { + sum[get_local_id(0)] += sum[get_local_id(0) + i]; + } + } + if (get_local_id(0) == 0) { + sum[0] /= ne00; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float mean = sum[0]; + float scale = 1.0f/sqrt(mean + eps); + + global float4 * y = (global float4 *) (dst + i03*nb3 + i02*nb2 + i01*nb1); + for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) { + y[i00] = (x[i00] * scale) * f[i00%(ne10/4)]; + } +} From 0b0de0bbf2ebd43940509c75b59615b281549b40 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 25 Jul 2025 10:47:39 -0600 Subject: [PATCH 0156/1348] metal: SSM_SCAN performance (llama/14743) * feat: Add s_off as a parameter in the args struct This may not be necessary, but it more closely mirrors the CUDA kernel Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * perf: Parallelize mamba2 SSM_SCAN metal kernel over d_state This is a first attempt at optimizing the metal kernel. The changes here are: - Launch the kernel with a thread group of size d_state - Use simd groups and shared memory to do the summation for the y computation When tested with G4 tiny preview, this shows roughly a 3x speedup on prefill and 15% speedup on decode. Signed-off-by: Gabe Goodhart * fix: Update logic to correctly do the multi-layer parallel sum Signed-off-by: Gabe Goodhart * fix: Correctly size the shared memory bufer and assert expected size relationships Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * refactor: Compute block offsets once rather than once per token Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * feat: Use local variable for state recursion Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * feat: Use a secondary simd_sum instead of a for loop Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * feat: Add assertion and comment about relationship between simd size and num simd groups Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * feat: Parallelize of d_state for mamba-1 Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * feat: Parallel sum in SSM_CONV Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart * Revert "feat: Parallel sum in SSM_CONV" After discussion with @compilade, the size of the parallelism here is not worth the cost in complexity or overhead of the parallel for. https://github.com/ggml-org/llama.cpp/pull/14743#discussion_r2223395357 This reverts commit 16bc059660c1c59e566628201c0ca2c20c9f4bc3. Signed-off-by: Gabe Goodhart * refactor: Simplify shared memory sizing Branch: GraniteFourPerf Signed-off-by: Gabe Goodhart Co-Authored-By: Georgi Gerganov --------- Signed-off-by: Gabe Goodhart Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal-impl.h | 1 + ggml/src/ggml-metal/ggml-metal.m | 15 ++- ggml/src/ggml-metal/ggml-metal.metal | 182 ++++++++++++++++++++------ 3 files changed, 156 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index b7b3fc49af3..8424464d8ca 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -528,6 +528,7 @@ typedef struct { int64_t n_group; int64_t n_seq_tokens; int64_t n_seqs; + int64_t s_off; uint64_t nb01; uint64_t nb02; uint64_t nb03; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 1a9999325fe..337f7985bad 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -3141,6 +3141,7 @@ static int ggml_metal_encode_node( /*.n_group =*/ n_group, /*.n_seq_tokens =*/ n_seq_tokens, /*.n_seqs =*/ n_seqs, + /*.s_off =*/ ggml_nelements(src1) * sizeof(float), /*.nb01 =*/ nb01, /*.nb02 =*/ nb02, /*.nb03 =*/ nb03, @@ -3169,12 +3170,22 @@ static int ggml_metal_encode_node( [encoder setBuffer:id_dst offset:offs_dst atIndex:7]; [encoder setBytes:&args length:sizeof(args) atIndex:8]; + // One shared memory bucket for each simd group in the threadgroup + // NOTE: Metal kernels require the buffer size to be multiple of 16 bytes + // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength + if (d_state >= 32) { + GGML_ASSERT((int64_t)(d_state / 32) <= 32); + const int64_t shmem_size = 32; + GGML_ASSERT(d_state <= (int64_t)pipeline.maxTotalThreadsPerThreadgroup); + [encoder setThreadgroupMemoryLength:(shmem_size)*sizeof(float) atIndex:0]; + } + if (ne30 == 1) { // Mamba-2 - [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)]; } else { GGML_ASSERT(d_inner == 1); - [encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)]; } } break; case GGML_OP_RWKV_WKV6: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f62b9ad548e..99a453090f6 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1823,10 +1823,16 @@ kernel void kernel_ssm_scan_f32( device const void * src5, device const void * src6, device float * dst, + threadgroup float * shared [[threadgroup(0)]], constant ggml_metal_kargs_ssm_scan & args, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgptg[[simdgroups_per_threadgroup]], + uint3 tgpg[[threadgroups_per_grid]]) { + + const int64_t i0 = tpitg.x; const int64_t i1 = 0; const int64_t ir = tgpig.x; // current head const int64_t i3 = tgpig.y; // current seq @@ -1841,41 +1847,88 @@ kernel void kernel_ssm_scan_f32( const int64_t ng = args.n_group; const int64_t n_t = args.n_seq_tokens; - const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float); + const int64_t s_off = args.s_off; device const int32_t * ids = (device const int32_t *) src6; - device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); - device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); + device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); + device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); + const int64_t i = i0 + i1*nc; + float s0 = s0_buff[i]; + float s = s_buff[i]; + + device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); + device const float * x_block = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13); + device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22); + device const float * B_block = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43); + device const float * C_block = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53); + device float * y_block = (device float *) ((device char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00); for (int64_t i2 = 0; i2 < n_t; ++i2) { - device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns} - device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns} - device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {d_state, nh} - device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns} - device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns} - device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns} + device const float * x = (device const float *) ((device const char *) x_block + i2*args.nb12); // {dim, nh, nt, ns} + device const float * dt = (device const float *) ((device const char *) dt_block + i2*args.nb21); // {nh, nt, ns} + device const float * B = (device const float *) ((device const char *) B_block + i2*args.nb42); // {d_state, ng, nt, ns} + device const float * C = (device const float *) ((device const char *) C_block + i2*args.nb52); // {d_state, ng, nt, ns} + device float * y = (device float *) ((device char *) y_block + i2*(nh*nr*nb00)); // {dim, nh, nt, ns} const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0]; const float x_dt = x[0] * dt_soft_plus; - float sumf = 0.0f; - for (int64_t i0 = 0; i0 < nc; ++i0) { - const int64_t i = i0 + i1*nc; - const float state = (s0[i] * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt); - sumf += state * C[i0]; - s[i] = state; - } + const float state = (s0 * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt); + s = state; + + // Parallel sum: This relies on the fact that this kernel will be + // dispatched with each threadgroup having (d_state, 1, 1) threads which + // are subdivided into SIMD groups of size `sgptg`. The goal is to + // compute y = sum({state * C[i] for i in range(d_state)}). + // To parallelize this effectively, we first use simd_sum over each SIMD + // group to compute the sum of each SIMD group, then place the result in + // the SIMD group's indexed bucket in the shared memory. We then sum + // over the individual group sums to compute the final sum. + + // Computed for each thread + float sumf = state * C[i0]; - y[0] = sumf; + // Sum the threads in the simd group => simd sum + sumf = simd_sum(sumf); + + if (sgptg > 1) { + + // Once per simd group, place the group sum into the shared buffer + if (tiisg == 0) { + shared[sgitg] = sumf; + } + + // Wait for all threads in the threadgroup to reach this point. This + // ensures that all elements of the shared buffer are populated with the + // sum of the individual simd groups. + threadgroup_barrier(mem_flags::mem_threadgroup); + + // For simd group 0 at indices < num simd groups, extract the shared + // simd sum + sumf = 0.0f; + if (sgitg == 0) { + if (tiisg < sgptg) { + sumf = shared[tiisg]; + } + sumf = simd_sum(sumf); + if (tiisg == 0) { + y[0] = sumf; + } + } + } else if (tiisg == 0) { + y[0] = sumf; + } // recurse s0 = s; } + + // Assign the final state to the output buffer + s_buff[i] = s; } // ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part -// TODO: optimize (e.g. by parallelizing over d_state) kernel void kernel_ssm_scan_f32_group( device const void * src0, device const void * src1, @@ -1885,10 +1938,16 @@ kernel void kernel_ssm_scan_f32_group( device const void * src5, device const void * src6, device float * dst, + threadgroup float * shared [[threadgroup(0)]], constant ggml_metal_kargs_ssm_scan & args, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgptg[[simdgroups_per_threadgroup]], + uint3 tgpg[[threadgroups_per_grid]]) { + + const int64_t i0 = tpitg.x; const int64_t i1 = tgpig.x; const int64_t ir = tgpig.y; // current head const int64_t i3 = tgpig.z; // current seq @@ -1903,38 +1962,81 @@ kernel void kernel_ssm_scan_f32_group( const int64_t ng = args.n_group; const int64_t n_t = args.n_seq_tokens; - const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float); + const int64_t s_off = args.s_off; device const int32_t * ids = (device const int32_t *) src6; - device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); - device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); + device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03); + device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off); + const int64_t i = i0 + i1*nc; + float s0 = s0_buff[i]; + float s = s_buff[i]; + + device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh} + device const float * x_block = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13); + device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22); + device const float * B_block = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43); + device const float * C_block = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53); + device float * y_block = (device float *) ((device char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00); for (int64_t i2 = 0; i2 < n_t; ++i2) { - device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns} - device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns} - device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh} - device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns} - device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns} - device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns} + device const float * x = (device const float *) ((device const char *) x_block + i2*args.nb12); // {dim, nh, nt, ns} + device const float * dt = (device const float *) ((device const char *) dt_block + i2*args.nb21); // {nh, nt, ns} + device const float * B = (device const float *) ((device const char *) B_block + i2*args.nb42); // {d_state, ng, nt, ns} + device const float * C = (device const float *) ((device const char *) C_block + i2*args.nb52); // {d_state, ng, nt, ns} + device float * y = (device float *) ((device char *) y_block + i2*(nh*nr*nb00)); // {dim, nh, nt, ns} const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0]; const float x_dt = x[0] * dt_soft_plus; const float dA = exp(dt_soft_plus * A[0]); - float sumf = 0.0f; - for (int64_t i0 = 0; i0 < nc; ++i0) { - const int64_t i = i0 + i1*nc; - const float state = (s0[i] * dA) + (B[i0] * x_dt); - sumf += state * C[i0]; - s[i] = state; + const float state = (s0 * dA) + (B[i0] * x_dt); + s = state; + + // Parallel sum: This relies on the fact that this kernel will be + // dispatched with each threadgroup having (d_state, 1, 1) threads which + // are subdivided into SIMD groups of size `sgptg`. The goal is to + // compute y = sum({state * C[i] for i in range(d_state)}). + // To parallelize this effectively, we first use simd_sum over each SIMD + // group to compute the sum of each SIMD group, then place the result in + // the SIMD group's indexed bucket in the shared memory. We then sum + // over the individual group sums to compute the final sum. + + // Computed for each thread + float sumf = state * C[i0]; + + // Sum the threads in the simd group => simd sum + sumf = simd_sum(sumf); + + // Once per simd group, place the group sum into the shared buffer + if (tiisg == 0) { + shared[sgitg] = sumf; } - y[0] = sumf; + // Wait for all threads in the threadgroup to reach this point. This + // ensures that all elements of the shared buffer are populated with the + // sum of the individual simd groups. + threadgroup_barrier(mem_flags::mem_threadgroup); + + // For simd group 0 at indices < num simd groups, extract the shared + // simd sum + sumf = 0.0f; + if (sgitg == 0) { + if (tiisg < sgptg) { + sumf = shared[tiisg]; + } + sumf = simd_sum(sumf); + if (tiisg == 0) { + y[0] = sumf; + } + } // recurse s0 = s; } + + // Assign the final state to the output buffer + s_buff[i] = s; } kernel void kernel_rwkv_wkv6_f32( From 662920147170a29d340a63cb3c73ecc2c1594767 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 26 Jul 2025 01:09:03 +0800 Subject: [PATCH 0157/1348] ggml-cpu : disable GGML_NNPA by default due to instability (llama/14880) * docs: update s390x document for sentencepiece Signed-off-by: Aaron Teo (cherry picked from commit e086c5e3a7ab3463d8e0906efcfa39352db0a48d) * docs: update huggingface links + reword Signed-off-by: Aaron Teo (cherry picked from commit 8410b085ea8c46e22be38266147a1e94757ef108) * ggml-cpu: disable ggml-nnpa compile flag by default fixes #14877 Signed-off-by: Aaron Teo (cherry picked from commit 412f4c7c88894b8f55846b4719c76892a23cfe09) * docs: update s390x build docs to reflect nnpa disable Signed-off-by: Aaron Teo (cherry picked from commit c1eeae1d0c2edc74ab9fbeff2707b0d357cf0b4d) --------- Signed-off-by: Aaron Teo --- ggml/CMakeLists.txt | 2 +- ggml/src/ggml-cpu/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 8ca1053cab3..20467c54da1 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) -option(GGML_NNPA "ggml: enable nnpa" ON) +option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 2cc42d4b02a..f188d1638dc 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list(APPEND ARCH_FLAGS -march=z16) elseif (${S390X_M} MATCHES "9175|9176") # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version. + # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15. message(STATUS "z17 target") list(APPEND ARCH_FLAGS -march=z17) else() From 8643960accd5ff5813fc29ac328b7059899a42a4 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Sat, 26 Jul 2025 10:36:02 +0800 Subject: [PATCH 0158/1348] musa: fix build warnings (unused variable) (llama/14869) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 18 ++++++++++-------- ggml/src/ggml-cuda/fattn-tile-f32.cu | 28 ++++++++++++++-------------- ggml/src/ggml-cuda/fattn-vec-f16.cuh | 18 +++++++++--------- ggml/src/ggml-cuda/set-rows.cu | 3 +++ 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 565853bfecd..83cf872f68a 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1330,14 +1330,16 @@ static __global__ void flash_attn_ext_f16( ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); - GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10); - GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); - GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(dst); GGML_UNUSED(dst_meta); + GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); + GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); + GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); + GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); + GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); + GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 2e2ed5cd566..11778bb9611 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -37,16 +37,16 @@ static __global__ void flash_attn_tile_ext_f32( #endif // FP16_MMA_AVAILABLE if (use_logit_softcap && !(D == 128 || D == 256)) { GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); + GGML_UNUSED(dst); GGML_UNUSED(dst_meta); + GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); - GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); - GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); - GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); + GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); + GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); + GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); + GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; return; } @@ -282,16 +282,16 @@ static __global__ void flash_attn_tile_ext_f32( } #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); + GGML_UNUSED(dst); GGML_UNUSED(dst_meta); + GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index f6ef236be98..e9b5c306365 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -329,16 +329,16 @@ static __global__ void flash_attn_vec_ext_f16( } #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); + GGML_UNUSED(dst); GGML_UNUSED(dst_meta); + GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); - GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne32); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); - GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); - GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); + GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); + GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); + GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); + GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); + GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); + GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index b2acdf855e9..07983436459 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -44,6 +44,9 @@ static __global__ void k_set_rows_quant( block_type * dst_block = dst_row_ptr + i00 / qk; quantize_func(src_block, dst_block); + + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); } // Template dispatch function for quantized set_rows From 4692558a1fa1fac1d7d8b90e2c52d6e6051da812 Mon Sep 17 00:00:00 2001 From: hipudding Date: Sat, 26 Jul 2025 17:56:18 +0800 Subject: [PATCH 0159/1348] CANN: Implement GLU ops (llama/14884) Implement REGLU, GEGLU, SWIGLU ops according to #14158 --- ggml/src/ggml-cann/acl_tensor.cpp | 4 +- ggml/src/ggml-cann/aclnn_ops.cpp | 38 +++++++++- ggml/src/ggml-cann/aclnn_ops.h | 120 ++++++++++++++++++++++++------ ggml/src/ggml-cann/ggml-cann.cpp | 72 ++++++++++++++---- 4 files changed, 194 insertions(+), 40 deletions(-) diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp index f311864d486..8ffac31dd66 100755 --- a/ggml/src/ggml-cann/acl_tensor.cpp +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne, for (int i = 0; i < final_dims; i++) { acl_storage_len += (acl_ne[i] - 1) * acl_stride[i]; } + size_t elem_offset = offset / ggml_element_size(tensor); + acl_storage_len += elem_offset; // Reverse ne and stride. std::reverse(acl_ne, acl_ne + final_dims); @@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne, aclTensor* acl_tensor = aclCreateTensor( acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride, - offset / ggml_element_size(tensor), format, &acl_storage_len, 1, + elem_offset, format, &acl_storage_len, 1, tensor->data); return acl_tensor; diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 76bed4e8cd0..d616c491ae9 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -99,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT } } -void ggml_cann_unary_op( +void ggml_cann_op_unary( std::function unary_op, ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -111,6 +111,42 @@ void ggml_cann_unary_op( ggml_cann_release_resources(ctx, acl_src, acl_dst); } +void ggml_cann_op_unary_gated( + std::function unary_op, + ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr; + if(src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + + acl_src0 = ggml_cann_create_tensor(src0); + acl_src1 = ggml_cann_create_tensor(src1); + } else { + int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]}; + size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]}; + acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0); + acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0)); + if (swapped) { + std::swap(acl_src0, acl_src1); + } + } + + unary_op(ctx, acl_src0, acl_dst); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1); + + ggml_cann_release_resources(ctx, acl_src0, acl_dst); + if(src1) + ggml_cann_release_resources(ctx, acl_src1); +} + /** * @brief Repeats elements of a tensor along each dimension according to the * specified repeat array. diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 924da66ed68..8deaf7ea1db 100755 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -1098,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @param dst The destination tensor. Its src[0] is treated as the input tensor. */ template - void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; aclTensor* acl_src = ggml_cann_create_tensor(src); @@ -1109,49 +1109,125 @@ template } /** - * @brief Applies a unary operation to a ggml tensor using the CANN backend. + * @brief Applies a unary operation to a ggml tensor using the CANN backend. * - * @details This function performs a unary operation on the input tensor using - * a user-provided lambda or callable object `unary_op`, which accepts the CANN - * context and two ACL tensors (source and destination). Internally, this function - * creates ACL representations of the ggml tensors and invokes the unary operation. - * The result is stored in the destination tensor `dst`. This utility abstracts the - * common boilerplate of tensor conversion and cleanup when implementing unary ops. + * @details This function applies a unary operation to the input tensor using + * a user-provided lambda or callable `unary_op`. The lambda receives the + * CANN backend context and two ACL tensors: the source and the destination. * - * @param unary_op A callable that performs the unary operation using CANN APIs. - * @param ctx The CANN context used for operations. - * @param dst The destination tensor where the result will be stored. - * The source tensor is retrieved from `dst->src[0]`. + * Internally, this function handles the conversion from GGML tensors to ACL tensors, + * calls the provided unary op, and manages resource cleanup. The input is assumed + * to be `dst->src[0]`, and the result is written to `dst`. + * + * This utility simplifies writing unary op wrappers by abstracting tensor preparation. + * + * @param unary_op A callable that performs the unary operation using CANN ACL APIs. + * @param ctx The CANN context for operation execution. + * @param dst The destination ggml_tensor where the result will be stored. + * The input tensor is assumed to be `dst->src[0]`. + * + * @see GGML_CANN_CALL_OP_UNARY */ -void ggml_cann_unary_op( +void ggml_cann_op_unary( std::function unary_op, ggml_backend_cann_context& ctx, ggml_tensor* dst); /** - * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op. + * @brief Applies a gated (GLU-style) unary operation using the CANN backend. + * + * @details This function performs a gated activation such as GEGLU or ReGLU. + * It supports two input modes: + * + * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors. + * These are used directly as the value and gate tensors. + * + * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to + * contain a concatenation of value and gate along the first dimension. This tensor + * will be split into two equal halves to form the value and gate inputs. + * + * The function applies a user-provided unary operation (e.g., GELU) to the value tensor, + * then multiplies the result in-place with the gate tensor: + * + * @code + * dst = unary_op(value) * gate; + * @endcode + * + * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the + * order of value/gate in the packed input case. + * + * @param unary_op A callable that performs the unary operation using CANN ACL APIs. + * It receives (ctx, acl_value_tensor, acl_output_tensor). + * @param ctx The CANN context used for execution. + * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`. + * + * @see GGML_CANN_CALL_OP_UNARY_GATED + */ +void ggml_cann_op_unary_gated( + std::function unary_op, + ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary. + * + * This macro wraps the specified ACLNN unary operator name into a lambda expression, + * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing + * unary ops in the CANN backend. + * + * Internally, this macro expands to a lambda like: + * @code + * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) { + * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); + * }; + * @endcode + * + * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation. + * + * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP. + * + * @see ggml_cann_op_unary + * @see GGML_CANN_CALL_ACLNN_OP + */ +#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \ + do { \ + auto lambda = [](ggml_backend_cann_context& ctx, \ + aclTensor* acl_src, \ + aclTensor* acl_dst) { \ + GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \ + }; \ + ggml_cann_op_unary(lambda, ctx, dst); \ + } \ + while (0) + +/** + * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated. * - * This macro defines an inline lambda wrapping a specific ACL operation name, - * and passes it to the templated ggml_cann_unary_op function. It simplifies - * calling unary ops by hiding the lambda boilerplate. + * This macro wraps the specified ACLNN unary operator name into a lambda expression, + * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for + * executing gated unary ops in the CANN backend. * - * Internally, the lambda will call: + * Internally, this macro expands to a lambda like: * @code - * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); + * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) { + * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); + * }; * @endcode * + * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation. + * * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP. * - * @see ggml_cann_unary_op + * @see ggml_cann_op_unary_gated * @see GGML_CANN_CALL_ACLNN_OP */ -#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \ +#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \ do { \ auto lambda = [](ggml_backend_cann_context& ctx, \ aclTensor* acl_src, \ aclTensor* acl_dst) { \ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \ }; \ - ggml_cann_unary_op(lambda, ctx, dst); \ + ggml_cann_op_unary_gated(lambda, ctx, dst); \ } \ while (0) + #endif // CANN_ACLNN_OPS diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index f30241aca40..c6edb6b61bb 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1681,16 +1681,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_ABS: - GGML_CANN_CALL_UNARY_OP(Abs); + GGML_CANN_CALL_OP_UNARY(Abs); break; case GGML_UNARY_OP_NEG: - GGML_CANN_CALL_UNARY_OP(Neg); + GGML_CANN_CALL_OP_UNARY(Neg); break; case GGML_UNARY_OP_GELU: - GGML_CANN_CALL_UNARY_OP(Gelu); + case GGML_UNARY_OP_GELU_ERF: + // aclnnGelu internally uses the erf-based approximation. + GGML_CANN_CALL_OP_UNARY(Gelu); break; case GGML_UNARY_OP_SILU: - GGML_CANN_CALL_UNARY_OP(Silu); + GGML_CANN_CALL_OP_UNARY(Silu); break; case GGML_UNARY_OP_GELU_QUICK: { auto lambda = [](ggml_backend_cann_context& ctx, @@ -1698,31 +1700,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, aclTensor* acl_dst) { GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst); }; - ggml_cann_unary_op(lambda, ctx, dst); + ggml_cann_op_unary(lambda, ctx, dst); } break; case GGML_UNARY_OP_TANH: - GGML_CANN_CALL_UNARY_OP(Tanh); + GGML_CANN_CALL_OP_UNARY(Tanh); break; case GGML_UNARY_OP_RELU: - GGML_CANN_CALL_UNARY_OP(Relu); + GGML_CANN_CALL_OP_UNARY(Relu); break; case GGML_UNARY_OP_SIGMOID: - GGML_CANN_CALL_UNARY_OP(Sigmoid); + GGML_CANN_CALL_OP_UNARY(Sigmoid); break; case GGML_UNARY_OP_HARDSIGMOID: - GGML_CANN_CALL_UNARY_OP(Hardsigmoid); + GGML_CANN_CALL_OP_UNARY(Hardsigmoid); break; case GGML_UNARY_OP_HARDSWISH: - GGML_CANN_CALL_UNARY_OP(Hardswish); + GGML_CANN_CALL_OP_UNARY(Hardswish); break; case GGML_UNARY_OP_EXP: - GGML_CANN_CALL_UNARY_OP(Exp); + GGML_CANN_CALL_OP_UNARY(Exp); break; case GGML_UNARY_OP_ELU: ggml_cann_elu(ctx, dst); break; case GGML_UNARY_OP_SGN: - GGML_CANN_CALL_UNARY_OP(Sign); + GGML_CANN_CALL_OP_UNARY(Sign); break; case GGML_UNARY_OP_STEP: ggml_cann_step(ctx, dst); @@ -1731,6 +1733,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_REGLU: + GGML_CANN_CALL_OP_UNARY_GATED(Relu); + break; + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_GEGLU_ERF: + // aclnnGelu internally uses the erf-based approximation. + GGML_CANN_CALL_OP_UNARY_GATED(Gelu); + break; + case GGML_GLU_OP_SWIGLU: + GGML_CANN_CALL_OP_UNARY_GATED(Silu); + break; + case GGML_GLU_OP_GEGLU_QUICK: { + auto lambda = [](ggml_backend_cann_context& ctx, + aclTensor* acl_src, + aclTensor* acl_dst) { + GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst); + }; + ggml_cann_op_unary_gated(lambda, ctx, dst); + } break; + default: + return false; + } + break; case GGML_OP_NORM: ggml_cann_norm(ctx, dst); break; @@ -1773,7 +1800,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, ggml_cann_binary_op(ctx, dst); break; case GGML_OP_SQRT: - GGML_CANN_CALL_UNARY_OP(Sqrt); + GGML_CANN_CALL_OP_UNARY(Sqrt); break; case GGML_OP_CLAMP: ggml_cann_clamp(ctx, dst); @@ -1818,16 +1845,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, ggml_cann_argmax(ctx, dst); break; case GGML_OP_COS: - ggml_cann_unary_op(ctx, dst); + ggml_cann_op_unary(ctx, dst); break; case GGML_OP_SIN: - ggml_cann_unary_op(ctx, dst); + ggml_cann_op_unary(ctx, dst); break; case GGML_OP_CONV_TRANSPOSE_1D: ggml_cann_conv_transpose_1d(ctx, dst); break; case GGML_OP_LOG: - GGML_CANN_CALL_UNARY_OP(Log); + GGML_CANN_CALL_OP_UNARY(Log); break; case GGML_OP_MEAN: ggml_cann_mean(ctx, dst); @@ -2101,10 +2128,23 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_SGN: case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_GELU_ERF: return true; default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: + return true; + default: + return false; + } + break; case GGML_OP_MUL_MAT: { switch (op->src[0]->type) { case GGML_TYPE_F16: From b275e52b46e62c05b336f755ad5101ee52793a7f Mon Sep 17 00:00:00 2001 From: deepsek <166548550+deepsek@users.noreply.github.com> Date: Sat, 26 Jul 2025 18:28:14 -0400 Subject: [PATCH 0160/1348] HIP: Enable Matrix cores for MMQ Kernels, Enable stream-K for CDNA 3 (llama/14624) This commit adds support for MFMA instructions to MMQ. CDNA1/GFX908 CDNA2/GFX90a and CDNA3/GFX942 are supported by the MFMA-enabled code path added by this commit. The code path and stream-k is only enabled on CDNA3 for now as it fails to outperform blas in all cases on the other devices. Blas is currently only consistently outperformed on CDNA3 due to issues in the amd-provided blas libraries. This commit also improves the awareness of MMQ towards different warp sizes and as a side effect improves the performance of all quant formats besides q4_0 and q4_1, which regress slightly, on GCN gpus. --- ggml/src/ggml-cuda/common.cuh | 16 +- ggml/src/ggml-cuda/mma.cuh | 114 +- ggml/src/ggml-cuda/mmq.cu | 10 +- ggml/src/ggml-cuda/mmq.cuh | 1841 +++++++++++++++++++----------- ggml/src/ggml-cuda/vendors/hip.h | 14 +- 5 files changed, 1295 insertions(+), 700 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9435daf0b3f..cdc3bb5ae76 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -56,7 +56,7 @@ #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16 #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a -#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers +#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300 @@ -72,8 +72,9 @@ #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) -#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA) -#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) +#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 @@ -226,6 +227,10 @@ typedef float2 dfloat2; #define FP16_MMA_AVAILABLE #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) +#define AMD_MFMA_AVAILABLE +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) + #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING #define NEW_MMA_AVAILABLE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING @@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) { return GGML_CUDA_CC_IS_CDNA(cc); } +// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later. +static bool amd_mfma_available(const int cc) { + return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc); +} + // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. static bool new_mma_available(const int cc) { return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 2af63355a19..d6817d804d2 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -12,7 +12,8 @@ // The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile. // All matrix tiles have ne physical 32 bit elements per warp. // -// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes. +// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes. +// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes, unaligned pointers are considered undefined behavior. #include "common.cuh" @@ -66,7 +67,44 @@ namespace ggml_cuda_mma { struct tile { static constexpr int I = I_; static constexpr int J = J_; - static constexpr int ne = I * J / WARP_SIZE; + +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + static constexpr int ne = I * J / 64; + T x[ne] = {0}; + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8> + return threadIdx.x % 16; + } else if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else if constexpr (I == 32 && J == 4) { + return threadIdx.x % 32; + } else if constexpr (I == 16 && J == 16) { + return 4 * (threadIdx.x / 16) + l; + } else if constexpr (I == 32 && J == 32) { + return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); + } else { + static_assert(I == -1 && J == -1, "template specialization not implemented"); + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8> + return (2 * ((threadIdx.x / 16) % 2) + l); + } else if constexpr (I == 16 && J == 8) { + return 2 * (threadIdx.x / 16) + l; + } else if constexpr (I == 32 && J == 4) { + return 2 * (threadIdx.x / 32) + l; + } else if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else if constexpr (I == 32 && J == 32) { + return threadIdx.x % 32; + } else { + static_assert(I == -1 && J == -1, "template specialization not implemented"); + } + } +#else + static constexpr int ne = I * J / 32; T x[ne] = {0}; static __device__ __forceinline__ int get_i(const int l) { @@ -94,6 +132,7 @@ namespace ggml_cuda_mma { static_assert(I == -1 && J == -1, "template specialization not implemented"); } } +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) }; template @@ -148,10 +187,23 @@ namespace ggml_cuda_mma { template static __device__ __forceinline__ void load_generic(tile & t, const T * __restrict__ xs0, const int stride) { +#if defined(AMD_MFMA_AVAILABLE) + if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8> +#pragma unroll + for (int l = 0; l < t.ne; ++l) { + t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; + } + } else { + int64_t * xi = (int64_t *) t.x; + const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); + xi[0] = xs[0]; + } +#else #pragma unroll for (int l = 0; l < t.ne; ++l) { t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; } +#endif // defined(AMD_MFMA_AVAILABLE) } template @@ -186,7 +238,7 @@ namespace ggml_cuda_mma { template static __device__ __forceinline__ void load_ldmatrix( tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) { -#ifdef NEW_MMA_AVAILABLE +#if defined(NEW_MMA_AVAILABLE) int * xi = (int * ) t.x; const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2); asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];" @@ -393,4 +445,60 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // NEW_MMA_AVAILABLE } + + static __device__ __forceinline__ void mma( + tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) { +#if defined(AMD_MFMA_AVAILABLE) + using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int; + int32x4_t * acc = (int32x4_t *) D.x; +#if defined(CDNA3) + acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0], + ((int64_t *) B.x)[0], + acc[0], + 0, 0, 0); +#elif defined(CDNA2) || defined(CDNA) + acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], + B.x[0], + acc[0], + 0, 0, 0); + acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1], + B.x[1], + acc[0], + 0, 0, 0); +#endif // defined(CDNA3) +#else + GGML_UNUSED(D); + GGML_UNUSED(A); + GGML_UNUSED(B); + NO_DEVICE_CODE; +#endif // AMD_MFMA_AVAILABLE + } + + static __device__ __forceinline__ void mma( + tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) { +#if defined(AMD_MFMA_AVAILABLE) + using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int; + int32x16_t * acc = (int32x16_t *) D.x; +#if defined(CDNA3) + acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0], + ((int64_t *) B.x)[0], + acc[0], + 0, 0, 0); +#elif defined(CDNA2) || defined(CDNA) + acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0], + B.x[0], + acc[0], + 0, 0, 0); + acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1], + B.x[1], + acc[0], + 0, 0, 0); +#endif // defined(CDNA3) +#else + GGML_UNUSED(D); + GGML_UNUSED(A); + GGML_UNUSED(B); + NO_DEVICE_CODE; +#endif // AMD_MFMA_AVAILABLE + } } diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 2db5b4ab0f0..e2fd0c1c254 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q( const int64_t s03 = src0->nb[3] / ts_src0; const int64_t s3 = dst->nb[3] / ts_dst; - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; + const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) + || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc))); if (!ids) { const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 + @@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q( // The stream-k decomposition is only faster for recent NVIDIA GPUs. // Also its fixup needs to allocate a temporary buffer in the memory pool. // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && - ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; + const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) + || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc))) + && src1_ncols == ne11; const mmq_args args = { src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst, @@ -304,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return false; } - if (new_mma_available(cc)) { + if (new_mma_available(cc) || amd_mfma_available(cc)) { return true; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 9696a320462..36e84be154e 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -90,7 +90,7 @@ struct tile_x_sizes { }; static int get_mmq_x_max_host(const int cc) { - return new_mma_available(cc) ? 128 : + return (amd_mfma_available(cc) || new_mma_available(cc)) ? 128 : GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? #ifdef GGML_CUDA_FORCE_MMQ 128 : 64; @@ -100,12 +100,12 @@ static int get_mmq_x_max_host(const int cc) { } static constexpr __device__ int get_mmq_x_max_device() { -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) return 128; -#else // NEW_MMA_AVAILABLE +#else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) - return 128; + return 64; #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA @@ -115,12 +115,11 @@ static constexpr __device__ int get_mmq_x_max_device() { return MMQ_DP4A_MAX_BATCH_SIZE; #endif // GGML_CUDA_FORCE_MMQ #else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA - return 64; #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } static int get_mmq_y_host(const int cc) { @@ -144,16 +143,25 @@ static constexpr __device__ int get_mmq_y_device() { #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } -#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0} -#define MMQ_DP4A_TXS_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0} -#define MMQ_DP4A_TXS_Q8_0 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_0 + mmq_y/(QI8_0/2), 0} -#define MMQ_DP4A_TXS_Q8_0_16 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*4/QI8_0 + mmq_y/(QI8_0/4), 0} -#define MMQ_DP4A_TXS_Q8_1 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_1 + mmq_y/(QI8_1/2), 0} -#define MMQ_DP4A_TXS_Q2_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE + mmq_y, 0} -#define MMQ_DP4A_TXS_Q3_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y, mmq_y*WARP_SIZE/8 + mmq_y/8} -#define MMQ_DP4A_TXS_Q4_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_K, mmq_y*WARP_SIZE/8 + mmq_y/8} -#define MMQ_DP4A_TXS_Q5_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K + mmq_y/QI5_K, mmq_y*WARP_SIZE/8 + mmq_y/8} -#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8} +// Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes. +// The K dimension of the tiles has either, +// 1*MMQ_TILE_NE_K==32 (always for TILE_Y_K) or 2*MMQ_TILE_NE_K==64 (typically for TILE_X_K), +// 32 bit elements for the quantized data (does not include scales). +// In other words, the size of the quantized data in the K dimension is a multiple of MMQ_TILE_NE_K. +// The final tile size in K direction is padded to avoid shared memory bank conflicts, +// in terms of 32 bit elements that means K % 2 == 1 for dp4a or K % 8 == 4 for mma. +#define MMQ_TILE_NE_K 32 + +#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*MMQ_TILE_NE_K + mmq_y, mmq_y*MMQ_TILE_NE_K/QI4_0 + mmq_y/QI4_0, 0} +#define MMQ_DP4A_TXS_Q4_1 tile_x_sizes{mmq_y*MMQ_TILE_NE_K + mmq_y, mmq_y*MMQ_TILE_NE_K/QI4_1 + mmq_y/QI4_1, 0} +#define MMQ_DP4A_TXS_Q8_0 tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K*2/QI8_0 + mmq_y/(QI8_0/2), 0} +#define MMQ_DP4A_TXS_Q8_0_16 tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K*4/QI8_0 + mmq_y/(QI8_0/4), 0} +#define MMQ_DP4A_TXS_Q8_1 tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K*2/QI8_1 + mmq_y/(QI8_1/2), 0} +#define MMQ_DP4A_TXS_Q2_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K + mmq_y, 0} +#define MMQ_DP4A_TXS_Q3_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8} +#define MMQ_DP4A_TXS_Q4_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K + mmq_y, mmq_y*MMQ_TILE_NE_K/QI4_K, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8} +#define MMQ_DP4A_TXS_Q5_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K/QI5_K + mmq_y/QI5_K, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8} +#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*MMQ_TILE_NE_K*2 + mmq_y, mmq_y*MMQ_TILE_NE_K/QI6_K + mmq_y/QI6_K, mmq_y*MMQ_TILE_NE_K/8 + mmq_y/8} static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) { switch (type) { @@ -179,11 +187,11 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml } } -#define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) -#define MMQ_MMA_TILE_X_K_Q8_1 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) -#define MMQ_MMA_TILE_X_K_Q2_K (2*WARP_SIZE + WARP_SIZE + 4) -#define MMQ_MMA_TILE_X_K_Q3_K (2*WARP_SIZE + WARP_SIZE/2 + 4) -#define MMQ_MMA_TILE_X_K_Q6_K (2*WARP_SIZE + WARP_SIZE/QI6_K + WARP_SIZE/8 + 7) +#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4) +#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4) +#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4) +#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) +#define MMQ_MMA_TILE_X_K_Q6_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI6_K + MMQ_TILE_NE_K/8 + 7) static_assert(MMQ_MMA_TILE_X_K_Q8_0 % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding."); @@ -215,42 +223,80 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { } } -#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) +// block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales) +#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI8_1) static int mmq_get_granularity_host(const int mmq_x, const int cc) { - return new_mma_available(cc) && mmq_x >= 48 ? 16 : 8; + if (amd_mfma_available(cc)) { + return mmq_x >= 128 ? 32 : 16; + } else if (new_mma_available(cc) && mmq_x >= 48) { + return 16; + } else { + return 8; + } } -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) +static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) { + return mmq_x >= 128 ? 32 : 16; +} +#elif defined(NEW_MMA_AVAILABLE) static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) { return mmq_x >= 48 ? 16 : 8; } #else -static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */) { +static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/) { return 8; } -#endif // NEW_MMA_AVAILABLE +#endif // AMD_MFMA_AVAILABLE + +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +static int mmq_get_nwarps_host(const int cc) { + return amd_mfma_available(cc) ? 8 : 4; +} +#else +static int mmq_get_nwarps_host(const int /*cc*/) { + return 8; +} +#endif // (GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + +static constexpr __device__ int mmq_get_nwarps_device() { +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#if defined(AMD_MFMA_AVAILABLE) + return 8; +#else + return 4; +#endif // AMD_MFMA_AVAILABLE +#else + return 8; +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +} // ------------------------------------------------------------ -template static __device__ __forceinline__ void load_tiles_q4_0( +template static __device__ __forceinline__ void load_tiles_q4_0( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + 2*WARP_SIZE); + float * x_df = (float *) (x_qs + 2*MMQ_TILE_NE_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = threadIdx.x / QI4_0; - const int kqsx = threadIdx.x % QI4_0; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_0); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI4_0; + const int kqsx = txi % QI4_0; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -259,20 +305,21 @@ template static __device__ __forceinlin const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx; const int qs0 = get_int_b2(bxi->qs, kqsx); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + 0] = __vsubss4((qs0 >> 0) & 0x0F0F0F0F, 0x08080808); x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + QI4_0] = __vsubss4((qs0 >> 4) & 0x0F0F0F0F, 0x08080808); #else - x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_0; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; const int kbxd = threadIdx.x % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { - int i = i0 + threadIdx.y * QI4_0 + threadIdx.x / blocks_per_tile_x_row; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; if (need_check) { i = min(i, i_max); @@ -280,17 +327,19 @@ template static __device__ __forceinlin const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d; #else - x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + kbxd] = bxi->d; -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/QI4_0) + i/QI4_0 + kbxd] = bxi->d; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template +template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); const int * x_qs = (const int *) x; @@ -299,7 +348,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( const half2 * y_ds = (const half2 *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -307,7 +356,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2); @@ -320,32 +369,37 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + kyqs + (l + QI4_0)]; } - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl - (&x_qs[i*(WARP_SIZE + 1) + k0/QR4_0], u, - x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/(QR4_0*QI4_0)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q4_0_q8_1_impl + (&x_qs[i*(MMQ_TILE_NE_K + 1) + k0/QR4_0], u, + x_df[i*(MMQ_TILE_NE_K/QI4_0) + i/QI4_0 + k0/(QR4_0*QI4_0)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template static __device__ __forceinline__ void load_tiles_q4_1( +template static __device__ __forceinline__ void load_tiles_q4_1( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE); + half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); int * x_qs = (int *) x_tile; half2 * x_dm = (half2 *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = threadIdx.x / QI4_1; - const int kqsx = threadIdx.x % QI4_1; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_1); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI4_1; + const int kqsx = txi % QI4_1; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -354,20 +408,21 @@ template static __device__ __forceinlin const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx; const int qs0 = get_int_b4(bxi->qs, kqsx); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + 0] = (qs0 >> 0) & 0x0F0F0F0F; x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + QI4_1] = (qs0 >> 4) & 0x0F0F0F0F; #else - x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_1; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; const int kbxd = threadIdx.x % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { - int i = i0 + threadIdx.y * QI4_1 + threadIdx.x / blocks_per_tile_x_row; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; if (need_check) { i = min(i, i_max); @@ -375,17 +430,19 @@ template static __device__ __forceinlin const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd; -#ifdef NEW_MMA_AVAILABLE - x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm; #else - x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + kbxd] = bxi->dm; -#endif // NEW_MMA_AVAILABLE + x_dm[i*(MMQ_TILE_NE_K/QI4_1) + i/QI4_1 + kbxd] = bxi->dm; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template +template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); const int * x_qs = (const int *) x; @@ -394,7 +451,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( const half2 * y_ds = (const half2 *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -402,7 +459,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2); @@ -415,32 +472,37 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + kyqs + (l + QI4_1)]; } - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl - (&x_qs[i*(WARP_SIZE + 1) + k0/QR4_1], u, - x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/(QR4_1*QI4_1)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q4_1_q8_1_impl + (&x_qs[i*(MMQ_TILE_NE_K + 1) + k0/QR4_1], u, + x_dm[i*(MMQ_TILE_NE_K/QI4_1) + i/QI4_1 + k0/(QR4_1*QI4_1)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template static __device__ __forceinline__ void load_tiles_q5_0( +template static __device__ __forceinline__ void load_tiles_q5_0( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_0, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = threadIdx.x / QI5_0; - const int kqsx = threadIdx.x % QI5_0; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_0); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI5_0; + const int kqsx = txi % QI5_0; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -449,7 +511,7 @@ template static __device__ __forceinlin const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbx; const int ql = get_int_b2(bxi->qs, kqsx); - const int qh = get_int_b2(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_0)); + const int qh = get_int_b2(bxi->qh, 0) >> (4 * kqsx); int qs0 = (ql >> 0) & 0x0F0F0F0F; qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 @@ -465,21 +527,22 @@ template static __device__ __forceinlin qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx + 0] = qs0; x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1; #else - x_qs[i*(2*WARP_SIZE + 1) + kbx*(2*QI5_0) + kqsx + 0] = qs0; - x_qs[i*(2*WARP_SIZE + 1) + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_0) + kqsx + 0] = qs0; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI5_0; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; const int kbxd = threadIdx.x % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { - int i = i0 + threadIdx.y * QI5_0 + threadIdx.x / blocks_per_tile_x_row; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; if (need_check) { i = min(i, i_max); @@ -487,32 +550,37 @@ template static __device__ __forceinlin const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbxd; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d; #else - x_df[i*(WARP_SIZE/QI5_0) + i/QI5_0 + kbxd] = bxi->d; -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/QI5_0) + i/QI5_0 + kbxd] = bxi->d; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_q5_1( +template static __device__ __forceinline__ void load_tiles_q5_1( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE); + half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); int * x_qs = (int *) x_tile; half2 * x_dm = (half2 *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = threadIdx.x / QI5_1; - const int kqsx = threadIdx.x % QI5_1; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_1); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI5_1; + const int kqsx = txi % QI5_1; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -521,7 +589,7 @@ template static __device__ __forceinlin const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbx; const int ql = get_int_b4(bxi->qs, kqsx); - const int qh = get_int_b4(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_1)); + const int qh = get_int_b4(bxi->qh, 0) >> (4 * kqsx); int qs0 = (ql >> 0) & 0x0F0F0F0F; qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 @@ -535,21 +603,22 @@ template static __device__ __forceinlin qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx + 0] = qs0; x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1; #else - x_qs[i*(2*WARP_SIZE + 1) + kbx*(2*QI5_1) + kqsx + 0] = qs0; - x_qs[i*(2*WARP_SIZE + 1) + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_1) + kqsx + 0] = qs0; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI5_1; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; const int kbxd = threadIdx.x % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { - int i = i0 + threadIdx.y * QI5_1 + threadIdx.x / blocks_per_tile_x_row; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; if (need_check) { i = min(i, i_max); @@ -557,32 +626,38 @@ template static __device__ __forceinlin const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbxd; -#ifdef NEW_MMA_AVAILABLE - x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + kbxd] = bxi->dm; #else - x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + kbxd] = bxi->dm; -#endif // NEW_MMA_AVAILABLE + x_dm[i*(MMQ_TILE_NE_K/QI5_1) + i/QI5_1 + kbxd] = bxi->dm; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_q8_0( +template static __device__ __forceinline__ void load_tiles_q8_0( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_tile + 2*WARP_SIZE); + float * x_df = (float *) (x_tile + 2*MMQ_TILE_NE_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = threadIdx.x / QI8_0; - const int kqsx = threadIdx.x % QI8_0; + // MMQ_ITER_K / (4 * QR8_0) == 64 required. but NV has only 32 threads per warp + constexpr int threads_per_row = 32; + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI8_0; + const int kqsx = txi % QI8_0; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -590,21 +665,22 @@ template static __device__ __forceinlin const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbx; -#ifdef NEW_MMA_AVAILABLE - x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 0 + threadIdx.x] = get_int_b2(bxi[0].qs, kqsx); - x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + WARP_SIZE + threadIdx.x] = get_int_b2(bxi[WARP_SIZE/QI8_0].qs, kqsx); +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 0 + txi] = get_int_b2(bxi[0].qs, kqsx); + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + MMQ_TILE_NE_K + txi] = get_int_b2(bxi[MMQ_TILE_NE_K/QI8_0].qs, kqsx); #else - x_qs[i*(2*WARP_SIZE + 1) + 0 + threadIdx.x] = get_int_b2(bxi[0].qs, kqsx); - x_qs[i*(2*WARP_SIZE + 1) + WARP_SIZE + threadIdx.x] = get_int_b2(bxi[WARP_SIZE/QI8_0].qs, kqsx); -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 0 + txi] = get_int_b2(bxi[0].qs, kqsx); + x_qs[i*(2*MMQ_TILE_NE_K + 1) + MMQ_TILE_NE_K + txi] = get_int_b2(bxi[MMQ_TILE_NE_K/QI8_0].qs, kqsx); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = 2*WARP_SIZE / QI8_0; + constexpr int blocks_per_tile_x_row = 2*MMQ_TILE_NE_K / QI8_0; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; const int kbxd = threadIdx.x % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0/2) { - int i = i0 + threadIdx.y * (QI8_0/2) + threadIdx.x / blocks_per_tile_x_row; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; if (need_check) { i = min(i, i_max); @@ -612,17 +688,19 @@ template static __device__ __forceinlin const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbxd; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = bxi->d; #else - x_df[i*(2*WARP_SIZE/QI8_0) + i/(QI8_0/2) + kbxd] = bxi->d; -#endif // NEW_MMA_AVAILABLE + x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + kbxd] = bxi->d; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template +template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y); const int * x_qs = (const int *) x; @@ -631,7 +709,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( const float * y_df = (const float *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += VDR_Q8_0_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += VDR_Q8_0_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -639,21 +717,76 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl - (&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0 % WARP_SIZE], - x_df[i*(2*WARP_SIZE/QI8_0) + i/(QI8_0/2) + k0/QI8_0], y_df[j*MMQ_TILE_Y_K + (k0/QI8_1) % (WARP_SIZE/QI8_1)]); + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q8_0_q8_1_impl + (&x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0 % MMQ_TILE_NE_K], + x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + k0/QI8_0], y_df[j*MMQ_TILE_Y_K + (k0/QI8_1) % (MMQ_TILE_NE_K/QI8_1)]); } } } } -template +template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { +#if defined(AMD_MFMA_AVAILABLE) + typedef tile<16, 8, int> tile_A; + typedef tile<16, 8, int> tile_B; + typedef tile<16, 16, int> tile_C; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + const half2 * y_ds = (const half2 *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { + const int k0 = k00 + k01; + + tile_A A[ntx]; +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0); + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B; + load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + float dB; + const int j = j0 + tile_C::get_j(0); + if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) { + dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1]; + } else { + dB = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + } + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + mma(C, A[n], B); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_A::I + tile_C::get_i(l); + const float dA = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0]; + sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA*dB; + } + } + } + } +#else typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; typedef tile<16, 8, int> tile_C; @@ -662,23 +795,23 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( constexpr int rows_per_warp = 2 * granularity; constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. - y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K); + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); const int * x_qs = (const int *) x; - const float * x_df = (const float *) x_qs + 2*WARP_SIZE; + const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; const int * y_qs = (const int *) y + 4; const float * y_df = (const float *) y; const half2 * y_ds = (const half2 *) y; - tile_A A[ntx][WARP_SIZE/QI8_0]; - float dA[ntx][tile_C::ne/2][WARP_SIZE/QI8_0]; + tile_A A[ntx][MMQ_TILE_NE_K/QI8_0]; + float dA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_0]; const int i0 = (threadIdx.y/ntx)*rows_per_warp; #pragma unroll for (int n = 0; n < ntx; ++n) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { const int k0 = k00 + k01; load_ldmatrix(A[n][k01/QI8_0], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0); @@ -689,7 +822,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int i = i0 + n*tile_A::I + tile_C::get_i(2*l); #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { const int k0 = k00 + k01; dA[n][l][k01/QI8_0] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0]; @@ -700,7 +833,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { tile_B B; float dB[tile_C::ne/2]; @@ -729,11 +862,14 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( } } } +#endif // defined(AMD_MFMA_AVAILABLE) } -template +template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); const int * x_qs = (const int *) x; @@ -742,7 +878,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( const half2 * y_ds = (const half2 *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += VDR_Q8_0_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += VDR_Q8_0_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -750,45 +886,95 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl - (&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], - x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI8_1], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q8_1_q8_1_impl + (&x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], + x_dm[i*(MMQ_TILE_NE_K/QI5_1) + i/QI5_1 + k0/QI8_1], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template +template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { +#if defined(AMD_MFMA_AVAILABLE) + typedef tile<16, 8, int> tile_A; + typedef tile<16, 8, int> tile_B; + typedef tile<16, 16, int> tile_C; - typedef tile<16, 8, int> tile_A; - typedef tile< 8, 8, int> tile_B; - typedef tile<16, 8, int> tile_C; + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const half2 * y_dm = (const half2 *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { + const int k0 = k00 + k01; + + tile_A A[ntx]; +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1); + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B; + load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]); + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + mma(C, A[n], B); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_A::I + tile_C::get_i(l); + float2 dmA = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]); + sum[(j0/tile_C::J + n)*tile_C::ne + l] += dmA.x*dsB.x*C.x[l]; + sum[(j0/tile_C::J + n)*tile_C::ne + l] += dmA.y*dsB.y; + } + } + } + } +#else + typedef tile<16, 8, int> tile_A; + typedef tile< 8, 8, int> tile_B; + typedef tile<16, 8, int> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = 2 * granularity; constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. - y += (threadIdx.y % ntx) * (tile_B::J*MMQ_TILE_Y_K); + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + 2*WARP_SIZE; + const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K; const int * y_qs = (const int *) y + 4; const half2 * y_dm = (const half2 *) y; - tile_A A[ntx][WARP_SIZE/QI8_1]; - float2 dmA[ntx][tile_C::ne/2][WARP_SIZE/QI8_1]; + tile_A A[ntx][MMQ_TILE_NE_K/QI8_1]; + float2 dmA[ntx][tile_C::ne/2][MMQ_TILE_NE_K/QI8_1]; const int i0 = (threadIdx.y/ntx)*rows_per_warp; #pragma unroll for (int n = 0; n < ntx; ++n) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { const int k0 = k00 + k01; load_ldmatrix(A[n][k01/QI8_1], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1); @@ -799,7 +985,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int i = i0 + n*tile_A::I + tile_C::get_i(2*l); #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { const int k0 = k00 + k01; dmA[n][l][k01/QI8_1] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]); @@ -810,7 +996,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { tile_B B; float2 dsB[tile_C::ne/2]; @@ -836,11 +1022,15 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( } } } +#endif // defined(AMD_MFMA_AVAILABLE) } -template +// Used for Q3_K, IQ2_S, and IQ2_XS +template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; const int * x_qs = (const int *) x; @@ -849,7 +1039,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( const float * y_df = (const float *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { const int k0 = k00 + k01; #pragma unroll @@ -857,23 +1047,73 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_16_q8_1_impl( - &x_qs[i*(2*WARP_SIZE + 1) + k0], + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q8_0_16_q8_1_impl( + &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], - &x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + k0/(QI8_0/2)], + &x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + k0/(QI8_0/2)], y_df[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template +// Used for Q3_K, IQ2_S, and IQ2_XS: +template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) + typedef tile<16, 8, int> tile_A; + typedef tile<16, 8, int> tile_B; + typedef tile<16, 16, int> tile_C; + typedef tile<64, 2, int> tile_load; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) { + const int k0 = k00 + k01; + + tile_A A[ntx]; +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K); + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B[1]; + load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + mma(C, A[n], B[0]); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_C::I + tile_C::get_i(l); + sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB; + } + } + } + } +#elif defined(NEW_MMA_AVAILABLE) typedef tile<16, 4, int> tile_A; typedef tile<16, 8, int> tile_A_8; @@ -884,10 +1124,10 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( constexpr int rows_per_warp = 2 * granularity; constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. - y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K); + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); const int * x_qs = (const int *) x; - const float * x_df = (const float *) x_qs + WARP_SIZE*2; + const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2; const int * y_qs = (const int *) y + 4; const float * y_df = (const float *) y; @@ -899,7 +1139,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( #pragma unroll for (int n = 0; n < ntx; ++n) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 8) { const int k0 = k00 + k01; load_ldmatrix(((tile_A_8 *) A[n])[k01/8], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K); @@ -910,7 +1150,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( const int i = i0 + n*tile_C::I + tile_C::get_i(2*l); #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += 4) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) { const int k0 = k00 + k01; dA[n][l][k01/4] = x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4]; @@ -921,7 +1161,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) { tile_B B[2]; float dB[tile_C::ne/2]; @@ -952,26 +1192,29 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( #else GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00); NO_DEVICE_CODE; -#endif // NEW_MMA_AVAILABLE +#endif // AMD_MFMA_AVAILABLE } -template static __device__ __forceinline__ void load_tiles_q2_K( +template static __device__ __forceinline__ void load_tiles_q2_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE); + half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y); int * x_qs = (int *) x_tile; half2 * x_dm = (half2 *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % QI2_K; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR2_K); + constexpr int nrows = ggml_cuda_get_physical_warp_size() / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI2_K) { - int i = i0 + threadIdx.y*(WARP_SIZE/QI2_K) + threadIdx.x/QI2_K; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -987,11 +1230,11 @@ template static __device__ __forceinlin const int x_qs_k = (x_ql_0 >> (2*l)) & 0x03030303; -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q2_K + k] = x_qs_k; #else - x_qs[i*(2*WARP_SIZE + 1) + k] = x_qs_k; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k] = x_qs_k; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const int sc_m = bxi->scales[kqsx]; @@ -1002,17 +1245,19 @@ template static __device__ __forceinlin const half2 x_dm_ik = make_half2(bxi_dmf.x*(sc_m & 0x0F), bxi_dmf.y*(sc_m >> 4)); #endif // FAST_FP16_AVAILABLE -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + kqsx] = x_dm_ik; #else - x_dm[i*(WARP_SIZE + 1) + kqsx] = x_dm_ik; -#endif // NEW_MMA_AVAILABLE + x_dm[i*(MMQ_TILE_NE_K + 1) + kqsx] = x_dm_ik; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template +template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y); const int * x_qs = (const int *) x; @@ -1029,7 +1274,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( } #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -1037,13 +1282,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; constexpr int ns = 2; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq( - &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], - &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y, + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q2_K_q8_1_impl_mmq( + &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], + &x_dm[i*(MMQ_TILE_NE_K + 1) + k0/4], k01 < MMQ_TILE_NE_K/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y, &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]); } } @@ -1052,7 +1297,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop. // As a workaround 2 separate loops are used instead. #pragma unroll - for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) { + for (int k01 = MMQ_TILE_NE_K/2; k01 < MMQ_TILE_NE_K; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -1060,23 +1305,89 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; constexpr int ns = 1; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq( - &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], - &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y, + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q2_K_q8_1_impl_mmq( + &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], + &x_dm[i*(MMQ_TILE_NE_K + 1) + k0/4], k01 < MMQ_TILE_NE_K/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y, &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]); } } } } -template +template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) + typedef tile<16, 8, int> tile_A; + typedef tile<16, 8, int> tile_B; + typedef tile<16, 16, int> tile_C; + typedef tile<64, 2, int> tile_load; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2; + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) { + const int k0 = k00 + k01; + + tile_A A[ntx]; +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K); + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B[1]; + load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x/2 : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y/2; + const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0 + : (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y + : __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x); + + tile_C Cm; + if (k01 >= MMQ_TILE_NE_K * 3/4) { + tile_A A1; + A1.x[0] = 0x01010101; + A1.x[1] = 0x01010101; + mma(Cm, A1, B[0]); + } + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C Cd; + mma(Cd, A[n], B[0]); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_C::I + tile_C::get_i(l); + const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]); + float tmp = Cd.x[l]*dm.x; + if (k01 >= MMQ_TILE_NE_K * 3/4) { + tmp -= Cm.x[l]*dm.y; + } + sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB; + sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB; + } + } + } + } +#elif defined(NEW_MMA_AVAILABLE) typedef tile<16, 4, int> tile_A; typedef tile<16, 8, int> tile_A_8; @@ -1087,10 +1398,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( constexpr int rows_per_warp = 2 * granularity; constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. - y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K); + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); const int * x_qs = (const int *) x; - const half2 * x_dm = (const half2 *) x_qs + WARP_SIZE*2; + const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2; const int * y_qs = (const int *) y + 4; const half2 * y_ds = (const half2 *) y; @@ -1103,7 +1414,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( #pragma unroll for (int n = 0; n < ntx; ++n) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { const int k0 = k00 + k01; load_ldmatrix(((tile_A_8 *) A[n])[k01/QI8_1], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K); @@ -1117,7 +1428,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( const int i = i0 + n*tile_C::I + tile_C::get_i(2*l); #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1/2) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1/2) { const int k0 = k00 + k01; const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/(QI8_1/2)]); @@ -1140,7 +1451,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { tile_B B[2]; // Here load_generic is faster than load_ldmatrix. @@ -1148,7 +1459,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( load_generic(B[1], y_qs + j0*MMQ_TILE_Y_K + (k01 + tile_B::J), MMQ_TILE_Y_K); tile_C Cm[2]; - if (k01 >= WARP_SIZE * 3/4) { + if (k01 >= MMQ_TILE_NE_K * 3/4) { tile_A A1; A1.x[0] = 0x01010101; A1.x[1] = 0x01010101; @@ -1166,16 +1477,16 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( #pragma unroll for (int l = 0; l < tile_C::ne; ++l) { float tmp = Cd[0].x[l]*dA[n][l/2][k01/4 + 0] + Cd[1].x[l]*dA[n][l/2][k01/4 + 1]; - if (k01 >= WARP_SIZE * 3/4) { + if (k01 >= MMQ_TILE_NE_K * 3/4) { tmp -= Cm[0].x[l]*mA[n][l/2][k01/4 + 0] + Cm[1].x[l]*mA[n][l/2][k01/4 + 1]; } - sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*(k01 < WARP_SIZE/2 ? dB[l%2].x : dB[l%2].y); + sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*(k01 < MMQ_TILE_NE_K/2 ? dB[l%2].x : dB[l%2].y); } } } #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE * 3/4; k01 += QI8_1) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K * 3/4; k01 += QI8_1) { float2 sB[tile_C::ne/2]; #pragma unroll @@ -1198,27 +1509,31 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( #else GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00); NO_DEVICE_CODE; -#endif // NEW_MMA_AVAILABLE +#endif // AMD_MFMA_AVAILABLE } -template static __device__ __forceinline__ void load_tiles_q3_K( +template static __device__ __forceinline__ void load_tiles_q3_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); int * x_sc = (int *) (x_df + txs.dm); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % QI3_K; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR3_K); + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI3_K) { - int i = i0 + threadIdx.y * (WARP_SIZE/QI3_K) + threadIdx.x / QI3_K; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -1238,17 +1553,18 @@ template static __device__ __forceinlin const int x_qs_k = __vsubss4(x_ql_k | x_qh_k, 0x04040404); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + k] = x_qs_k; #else - x_qs[i*(2*WARP_SIZE + 1) + k] = x_qs_k; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k] = x_qs_k; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } + constexpr int rows_per_warp = warp_size / 4; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*8) { - int i = i0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { + int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/4; if (need_check) { i = min(i, i_max); @@ -1256,7 +1572,7 @@ template static __device__ __forceinlin const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride; - const int ksc = threadIdx.x % (WARP_SIZE/8); + const int ksc = threadIdx.x % 4; const int ksc_low = ksc % (QI3_K/8); const int shift_low = 4 * (ksc / (QI3_K/8)); @@ -1268,23 +1584,23 @@ template static __device__ __forceinlin const int sc = __vsubss4(sc_low | sc_high, 0x20202020); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) const int8_t * sc8 = (const int8_t *) ≻ const float d = bxi->d; #pragma unroll for (int l = 0; l < int(sizeof(int)); ++l) { - x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l]; + x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*ksc + l] = d*sc8[l]; } #else - x_sc[i*(WARP_SIZE/8) + i/8 + threadIdx.x % (WARP_SIZE/8)] = sc; -#endif // NEW_MMA_AVAILABLE + x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = sc; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } -#ifndef NEW_MMA_AVAILABLE +#if !(defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)) #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*WARP_SIZE) { - int i = (i0 + threadIdx.y*WARP_SIZE + threadIdx.x) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) { + int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y; if (need_check) { i = min(i, i_max); @@ -1294,12 +1610,14 @@ template static __device__ __forceinlin x_df[i] = bxi->d; } -#endif // NEW_MMA_AVAILABLE +#endif // !(defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)) } -template +template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y); const int * x_qs = (const int *) x; @@ -1309,7 +1627,7 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( const float * y_df = (const float *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -1317,13 +1635,13 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - const int8_t * scales = ((const int8_t *) (x_sc + i*(WARP_SIZE/8) + i/8)) + k0/4; + const int8_t * scales = ((const int8_t *) (x_sc + i*(MMQ_TILE_NE_K/8) + i/8)) + k0/4; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq( - &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], scales, + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q3_K_q8_1_impl_mmq( + &x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], scales, x_df[i], y_df[j*MMQ_TILE_Y_K + k01/QI8_1]); } } @@ -1340,72 +1658,85 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co ((scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030); // upper 2 bits } -template static __device__ __forceinline__ void load_tiles_q4_K( +template static __device__ __forceinline__ void load_tiles_q4_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE); + half2 * x_dm = (half2 *) (x_qs + 2*MMQ_TILE_NE_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); int * x_qs = (int *) x_tile; half2 * x_dm = (half2 *) (x_qs + txs.qs); int * x_sc = (int *) (x_dm + txs.dm); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_K); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); } const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride; - const int qs0 = get_int_b4(bxi->qs, threadIdx.x); + const int qs0 = get_int_b4(bxi->qs, txi); -#ifdef NEW_MMA_AVAILABLE - x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F; - x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F; + x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F; #else - x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } -#ifdef NEW_MMA_AVAILABLE - +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + constexpr int rows_per_warp = warp_size / 2; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*16) { - int i = (i0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { +#if defined(AMD_MFMA_AVAILABLE) + // Need if on AMD instead of % because warp_size == 64 + // This causes double work and throughput loss (MI300X) + // H100 loses about 100 t/s with 'if' condition over '%' + int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/2; + if (i < mmq_y) { +#else + int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y; + { +#endif // defined(AMD_MFMA_AVAILABLE) + if (need_check) { + i = min(i, i_max); + } - const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride; + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride; - const int * scales = (const int *) bxi->scales; - const int ksc = threadIdx.x % (WARP_SIZE/16); + const int * scales = (const int *) bxi->scales; + const int ksc = threadIdx.x % 2; - const int sc32 = unpack_scales_q45_K(scales, ksc + 0); - const int m32 = unpack_scales_q45_K(scales, ksc + 2); + const int sc32 = unpack_scales_q45_K(scales, ksc + 0); + const int m32 = unpack_scales_q45_K(scales, ksc + 2); - const uint8_t * sc8 = (const uint8_t *) &sc32; - const uint8_t * m8 = (const uint8_t *) &m32; + const uint8_t * sc8 = (const uint8_t *) &sc32; + const uint8_t * m8 = (const uint8_t *) &m32; - const half2 dm = bxi->dm * make_half2(1.0f, -1.0f); + const half2 dm = bxi->dm * make_half2(1.0f, -1.0f); -#pragma unroll - for (int l = 0; l < int(sizeof(int)); ++l) { - x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]); + #pragma unroll + for (int l = 0; l < sizeof(int); ++l) { + x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]); + } } } - #else - #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*QI4_K) { - int i = (i0 + threadIdx.y*QI4_K + threadIdx.x) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) { + int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y; if (need_check) { i = min(i, i_max); @@ -1415,30 +1746,32 @@ template static __device__ __forceinlin x_dm[i] = bxi->dm; } - + constexpr int rows_per_warp = warp_size / 4; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { + int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/(MMQ_TILE_NE_K/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI4_K/8); + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % (MMQ_TILE_NE_K/8)) / (QI4_K/8); const int * scales = (const int *) bxi->scales; - const int ksc = threadIdx.x % (WARP_SIZE/8); + const int ksc = threadIdx.x % (MMQ_TILE_NE_K/8); const int scales8 = unpack_scales_q45_K(scales, ksc); - x_sc[i*(WARP_SIZE/8) + i/8 + ksc] = scales8; + x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = scales8; } -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } -template +template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); const int * x_qs = (const int *) x; @@ -1448,7 +1781,7 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( const half2 * y_ds = (const half2 *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_K*VDR_Q4_K_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR4_K*VDR_Q4_K_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -1456,97 +1789,110 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - const uint8_t * sc = (const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/32] + 2*(k01/16); + const uint8_t * sc = (const uint8_t *) &x_sc[i * (MMQ_TILE_NE_K/8) + i/8 + k0/32] + 2*(k01/16); - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq( - &x_qs[i*(WARP_SIZE + 1) + k0/2], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8, + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q4_K_q8_1_impl_mmq( + &x_qs[i*(MMQ_TILE_NE_K + 1) + k0/2], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8, x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template static __device__ __forceinline__ void load_tiles_q5_K( +template static __device__ __forceinline__ void load_tiles_q5_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - half2 * x_dm = (half2 *) (x_qs + WARP_SIZE*2); + half2 * x_dm = (half2 *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); int * x_qs = (int *) x_tile; half2 * x_dm = (half2 *) (x_qs + txs.qs); int * x_sc = (int *) (x_dm + txs.dm); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR5_K); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); } const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride; - const int ky = QR5_K*threadIdx.x; + const int ky = QR5_K*txi; - const int ql = get_int_b4(bxi->qs, threadIdx.x); + const int ql = get_int_b4(bxi->qs, txi); const int ql0 = (ql >> 0) & 0x0F0F0F0F; const int ql1 = (ql >> 4) & 0x0F0F0F0F; - const int qh = get_int_b4(bxi->qh, threadIdx.x % (QI5_K/4)); - const int qh0 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 0)) << 4) & 0x10101010; - const int qh1 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 1)) << 4) & 0x10101010; + const int qh = get_int_b4(bxi->qh, txi % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (txi / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (txi / (QI5_K/4)) + 1)) << 4) & 0x10101010; - const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0; - const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + QI5_K/4; + const int kq0 = ky - ky % (QI5_K/2) + txi % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + txi % (QI5_K/4) + QI5_K/4; -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq0] = ql0 | qh0; x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq1] = ql1 | qh1; #else - x_qs[i*(2*WARP_SIZE + 1) + kq0] = ql0 | qh0; - x_qs[i*(2*WARP_SIZE + 1) + kq1] = ql1 | qh1; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq0] = ql0 | qh0; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq1] = ql1 | qh1; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } -#ifdef NEW_MMA_AVAILABLE - +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + constexpr int rows_per_warp = warp_size / 2; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*16) { - int i = (i0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { +#if defined(AMD_MFMA_AVAILABLE) + // Need if on AMD instead of % because warp_size == 64 + // This causes double work and throughput loss (MI300X) + // H100 loses about 100 t/s with 'if' condition over '%' + int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/2; + if (i < mmq_y) { +#else + int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y; + { +#endif // defined(AMD_MFMA_AVAILABLE) + if (need_check) { + i = min(i, i_max); + } - const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride; + const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride; - const int * scales = (const int *) bxi->scales; - const int ksc = threadIdx.x % (WARP_SIZE/16); + const int * scales = (const int *) bxi->scales; + const int ksc = threadIdx.x % 2; - const int sc32 = unpack_scales_q45_K(scales, ksc + 0); - const int m32 = unpack_scales_q45_K(scales, ksc + 2); + const int sc32 = unpack_scales_q45_K(scales, ksc + 0); + const int m32 = unpack_scales_q45_K(scales, ksc + 2); - const uint8_t * sc8 = (const uint8_t *) &sc32; - const uint8_t * m8 = (const uint8_t *) &m32; + const uint8_t * sc8 = (const uint8_t *) &sc32; + const uint8_t * m8 = (const uint8_t *) &m32; - const half2 dm = bxi->dm * make_half2(1.0f, -1.0f); + const half2 dm = bxi->dm * make_half2(1.0f, -1.0f); #pragma unroll - for (int l = 0; l < int(sizeof(int)); ++l) { - x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]); + for (int l = 0; l < int(sizeof(int)); ++l) { + x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]); + } } } - #else - #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*QI5_K) { - int i = (i0 + threadIdx.y*QI5_K + threadIdx.x) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) { + int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y; if (need_check) { i = min(i, i_max); @@ -1557,9 +1903,10 @@ template static __device__ __forceinlin x_dm[i] = bxi->dm; } + constexpr int rows_per_warp = warp_size / 4; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps*8) { - int i = (i0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8)) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { + int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/(MMQ_TILE_NE_K/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -1569,17 +1916,19 @@ template static __device__ __forceinlin const int * scales = (const int *) bxi->scales; - const int ksc = threadIdx.x % (WARP_SIZE/8); + const int ksc = threadIdx.x % (MMQ_TILE_NE_K/8); const int scales8 = unpack_scales_q45_K(scales, ksc); - x_sc[i*(WARP_SIZE/8) + i/8 + ksc] = scales8; + x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + ksc] = scales8; } -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } -template +template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); const int * x_qs = (const int *) x; @@ -1589,7 +1938,7 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( const half2 * y_ds = (const half2 *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR5_K*VDR_Q5_K_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR5_K*VDR_Q5_K_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -1597,36 +1946,42 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k00/32]) + 2*(k01/16); + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (MMQ_TILE_NE_K/8) + i/8 + k00/32]) + 2*(k01/16); - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq( - &x_qs[i*(QR5_K*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8, + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q5_K_q8_1_impl_mmq( + &x_qs[i*(QR5_K*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8, x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template static __device__ __forceinline__ void load_tiles_q6_K( +template static __device__ __forceinline__ void load_tiles_q6_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); - int * x_sc = (int *) (x_df + WARP_SIZE/QI6_K); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); + int * x_sc = (int *) (x_df + MMQ_TILE_NE_K/QI6_K); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); int * x_sc = (int *) (x_df + txs.dm); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR6_K); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -1634,67 +1989,67 @@ template static __device__ __forceinlin const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride; - const int ql = get_int_b2(bxi->ql, threadIdx.x); + const int ql = get_int_b2(bxi->ql, txi); const int ql0 = (ql >> 0) & 0x0F0F0F0F; const int ql1 = (ql >> 4) & 0x0F0F0F0F; - const int qh = get_int_b2(bxi->qh, (QI6_K/4) * (threadIdx.x / (QI6_K/2)) + threadIdx.x % (QI6_K/4)); - const int qh0 = ((qh >> ((threadIdx.x & 0x08) >> 2)) << 4) & 0x30303030; - const int qh1 = (qh >> ((threadIdx.x & 0x08) >> 2)) & 0x30303030; + const int qh = get_int_b2(bxi->qh, (QI6_K/4) * (txi / (QI6_K/2)) + txi % (QI6_K/4)); + const int qh0 = ((qh >> ((txi & 0x08) >> 2)) << 4) & 0x30303030; + const int qh1 = (qh >> ((txi & 0x08) >> 2)) & 0x30303030; - const int kq0 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + 0; - const int kq1 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + QI6_K/2; + const int kq0 = 2*txi - txi % (QI6_K/2) + 0; + const int kq1 = 2*txi - txi % (QI6_K/2) + QI6_K/2; -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq0] = __vsubss4(ql0 | qh0, 0x20202020); x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq1] = __vsubss4(ql1 | qh1, 0x20202020); #else - x_qs[i*(2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); - x_qs[i*(2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_qs[i*(2*MMQ_TILE_NE_K + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 - const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256 - #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { - int i = (i0 + threadIdx.y * QI6_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) { + int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbxd; + const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q6_K + kbxd] = bxi->d; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q6_K] = bxi->d; #else - x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K + kbxd] = bxi->d; -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/QI6_K) + i/QI6_K] = bxi->d; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } + constexpr int rows_per_warp = warp_size / 4; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { + int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/(MMQ_TILE_NE_K/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / 4; + const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (MMQ_TILE_NE_K/8)) / 4; -#ifdef NEW_MMA_AVAILABLE - x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + threadIdx.x % (WARP_SIZE/8)] = get_int_b2(bxi->scales, threadIdx.x % (QI6_K/8)); +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + threadIdx.x%4] = get_int_b2(bxi->scales, threadIdx.x % (MMQ_TILE_NE_K/8)); #else - x_sc[i*(WARP_SIZE/8) + i/8 + threadIdx.x % (WARP_SIZE/8)] = get_int_b2(bxi->scales, threadIdx.x % (QI6_K/8)); -#endif // NEW_MMA_AVAILABLE + x_sc[i*(MMQ_TILE_NE_K/8) + i/8 + threadIdx.x%(MMQ_TILE_NE_K/8)] = get_int_b2(bxi->scales, threadIdx.x%(QI6_K/8)); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template +template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y); const int * x_qs = (const int *) x; @@ -1704,7 +2059,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( const float * y_df = (const float *) y; // #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += QR6_K*VDR_Q6_K_Q8_1_MMQ) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QR6_K*VDR_Q6_K_Q8_1_MMQ) { const int k0 = k00 + k01; #pragma unroll @@ -1712,23 +2067,74 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]); + const int8_t * sc = ((const int8_t *) &x_sc[i * (MMQ_TILE_NE_K/8) + i/8 + k0/16]); - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq( - &x_qs[i*(QR6_K*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, - x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + k01/QI8_1]); + sum[j0/nwarps*mmq_y/warp_size + i0/warp_size] += vec_dot_q6_K_q8_1_impl_mmq( + &x_qs[i*(QR6_K*MMQ_TILE_NE_K + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, + x_df[i*(MMQ_TILE_NE_K/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + k01/QI8_1]); } } } } -template +template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) + typedef tile<16, 8, int> tile_A; + typedef tile<16, 8, int> tile_B; + typedef tile<16, 16, int> tile_C; + typedef tile<64, 2, int> tile_load; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2; + const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) { + const int k0 = k00 + k01; + + tile_A A[ntx]; +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K); + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B[1]; + load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + mma(C, A[n], B[0]); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_C::I + tile_C::get_i(l); + const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16); + sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB; + } + } + } + } +#elif defined(NEW_MMA_AVAILABLE) typedef tile<16, 4, int> tile_A; typedef tile< 8, 4, int> tile_B; @@ -1738,11 +2144,11 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( constexpr int rows_per_warp = 2 * granularity; constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. - y += (threadIdx.y % ntx) * (tile_B::I*MMQ_TILE_Y_K); + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); const int * x_qs = (const int *) x; - const float * x_df = (const float *) x_qs + WARP_SIZE*2; - const int * x_sc = (const int *) x_df + WARP_SIZE/QI6_K; + const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2; + const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K; const int * y_qs = (const int *) y + 4; const float * y_df = (const float *) y; @@ -1755,7 +2161,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( #pragma unroll for (int n = 0; n < ntx; ++n) { #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 8) { const int k0 = k00 + k01; load_ldmatrix(A[n][k01/4 + 0], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + 0), MMQ_MMA_TILE_X_K_Q6_K); @@ -1763,7 +2169,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += 16) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 16) { const int k0 = k00 + k01; #pragma unroll @@ -1793,7 +2199,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( float tmp[ntx][tile_C::ne] = {{0.0f}}; #pragma unroll - for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) { + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 8) { tile_B B[2]; float dB[tile_C::ne/2]; @@ -1832,27 +2238,32 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( #else GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00); NO_DEVICE_CODE; -#endif // NEW_MMA_AVAILABLE +#endif // AMD_MFMA_AVAILABLE } -template static __device__ __forceinline__ void load_tiles_iq4_nl( +template static __device__ __forceinline__ void load_tiles_iq4_nl( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_NL, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = threadIdx.x / QI4_NL; - const int kqsx = threadIdx.x % QI4_NL; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI4_NL; + const int kqsx = txi % QI4_NL; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); @@ -1862,22 +2273,24 @@ template static __device__ __forceinlin const int aux_q4 = get_int_b2(bxi->qs, kqsx); const int2 v = get_int_from_table_16(aux_q4); - const int k0 = 8 * (threadIdx.x / 4) + threadIdx.x % 4; -#ifdef NEW_MMA_AVAILABLE - x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x; - x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 4] = v.y; + const int k0 = kbx * (2 * QI4_NL) + kqsx; + +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x; + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y; #else - x_qs[i*(2*WARP_SIZE + 1) + k0 + 0] = v.x; - x_qs[i*(2*WARP_SIZE + 1) + k0 + 4] = v.y; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_NL; + constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; const int kbxd = threadIdx.x % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_NL) { - int i = i0 + threadIdx.y * QI4_NL + threadIdx.x / blocks_per_tile_x_row; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; if (need_check) { i = min(i, i_max); @@ -1885,31 +2298,35 @@ template static __device__ __forceinlin const block_iq4_nl * bxi = (const block_iq4_nl *) x + kbx0 + i*stride + kbxd; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d); +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d); #else - x_df[i*(WARP_SIZE/4) + i/4 + kbxd] = __half2float(bxi->d); -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq2_xxs( +template static __device__ __forceinline__ void load_tiles_iq2_xxs( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_XXS, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % (QI2_XXS/2); + constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_XXS)) / 2; + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XXS/2)) { - int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XXS) + threadIdx.x/(QI2_XXS/2); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -1932,42 +2349,46 @@ template static __device__ __forceinlin const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000); const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0; x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid1; #else - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 0)] = grid0; - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 1)] = grid1; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid0; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid1; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const int ls = aux32 >> 28; const float d = bxi->d; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4; #else - x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = (ls*d + d/2)/4; -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/4; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq2_xs( +template static __device__ __forceinline__ void load_tiles_iq2_xs( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % (QI2_XS/2); + constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_XS)) / 2; + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XS/2)) { - int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XS) + threadIdx.x/(QI2_XS/2); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -1986,44 +2407,48 @@ template static __device__ __forceinlin const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]); const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l; x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h; #else - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 0)] = grid_l; - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 1)] = grid_h; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid_l; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid_h; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const int ls = bxi->scales[kqsx]; const float d = bxi->d; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; - x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; + x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; #else - x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; - x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; -#endif // NEW_MMA_AVAILABLE + x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; + x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq2_s( +template static __device__ __forceinline__ void load_tiles_iq2_s( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_S, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % (QI2_S/2); + constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR2_S)) / 2; + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_S/2)) { - int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_S) + threadIdx.x/(QI2_S/2); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -2049,44 +2474,48 @@ template static __device__ __forceinlin const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0); const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l; x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h; #else - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 0)] = grid_l; - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 1)] = grid_h; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid_l; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid_h; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const int ls = bxi->scales[kqsx]; const float d = bxi->d; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; - x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; + x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; #else - x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; - x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; -#endif // NEW_MMA_AVAILABLE + x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls & 0x0F)*d + d/2)/4; + x_df[i*(2*MMQ_TILE_NE_K*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >> 4)*d + d/2)/4; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq3_xxs( +template static __device__ __forceinline__ void load_tiles_iq3_xxs( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_XXS, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % (QI3_XXS/2); + constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR3_XXS)) / 2; + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_XXS/2)) { - int i = i0 + threadIdx.y*(2*WARP_SIZE/QI3_XXS) + threadIdx.x/(QI3_XXS/2); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -2107,42 +2536,46 @@ template static __device__ __forceinlin const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]); const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l; x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid_h; #else - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 0)] = grid_l; - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l + 1)] = grid_h; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 0)] = grid_l; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l + 1)] = grid_h; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const int ls = aux32 >> 28; const float d = bxi->d; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/2; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/2; #else - x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = (ls*d + d/2)/2; -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/2; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq3_s( +template static __device__ __forceinline__ void load_tiles_iq3_s( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % (QI3_S/2); + constexpr int threads_per_row = (MMQ_ITER_K / (4 * QR3_S)) / 2; + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_S/2)) { - int i = i0 + threadIdx.y*(2*WARP_SIZE/QI3_S) + threadIdx.x/(QI3_S/2); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -2170,42 +2603,46 @@ template static __device__ __forceinlin const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0); const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+0)] = grid_l; x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+1)] = grid_h; #else - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l+0)] = grid_l; - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l+1)] = grid_h; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+0)] = grid_l; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+1)] = grid_h; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const int ls = 1 + 2*((bxi->scales[kqsx/2] >> (((2*kqsx) << 1) & 0x04)) & 0x0F); const float d = bxi->d; -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = ls*d; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = ls*d; #else - x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = ls*d; -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = ls*d; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq1_s( +template static __device__ __forceinline__ void load_tiles_iq1_s( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - half2 * x_ds = (half2 *) (x_qs + WARP_SIZE*2); + half2 * x_ds = (half2 *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y); int * x_qs = (int *) x_tile; half2 * x_ds = (half2 *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kqsx = threadIdx.x % QI1_S; + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR1_S); + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI1_S) { - int i = i0 + threadIdx.y*(WARP_SIZE/QI1_S) + threadIdx.x/QI1_S; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * nrows) { + int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row; if (need_check) { i = min(i, i_max); @@ -2225,66 +2662,71 @@ template static __device__ __forceinlin const int grid0 = (grid >> 0) & 0x0F0F0F0F; const int grid1 = (grid >> 4) & 0x0F0F0F0F; -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+0)] = grid0; x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+1)] = grid1; #else - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l+0)] = grid0; - x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + (2*l+1)] = grid1; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+0)] = grid0; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + 8*kqsx + (2*l+1)] = grid1; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } const float d1q = __half2float(bxi->d) * (((qh >> 11) & 0x0E) + 1); const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000); -#ifdef NEW_MMA_AVAILABLE - x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta); +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta); #else - x_ds[i*(WARP_SIZE/4) + i/4 + kqsx] = make_half2(d1q, d1q*delta); -#endif // NEW_MMA_AVAILABLE + x_ds[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = make_half2(d1q, d1q*delta); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template static __device__ __forceinline__ void load_tiles_iq4_xs( +template static __device__ __forceinline__ void load_tiles_iq4_xs( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); -#ifdef NEW_MMA_AVAILABLE +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) int * x_qs = (int *) x_tile; - float * x_df = (float *) (x_qs + WARP_SIZE*2); + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); #else constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y); int * x_qs = (int *) x_tile; float * x_df = (float *) (x_qs + txs.qs); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) - const int kbx = 0; // threadIdx.x / QI4_XS - const int kqsx = threadIdx.x; // threadIdx.x % QI4_XS + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_XS); + constexpr int nrows = warp_size / threads_per_row; + const int kqsx = threadIdx.x % threads_per_row; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + threadIdx.y; + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); if (need_check) { i = min(i, i_max); } - const block_iq4_xs * bxi = (const block_iq4_xs *) x + kbx0 + i*stride + kbx; + const block_iq4_xs * bxi = (const block_iq4_xs *) x + kbx0 + i*stride; const int aux_q4 = get_int_b4(bxi->qs, kqsx); const int2 v = get_int_from_table_16(aux_q4); - const int k0 = 8 * (threadIdx.x / 4) + threadIdx.x % 4; -#ifdef NEW_MMA_AVAILABLE + const int k0 = 8 * (kqsx / 4) + kqsx % 4; + +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x; x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 4] = v.y; #else - x_qs[i*(2*WARP_SIZE + 1) + k0 + 0] = v.x; - x_qs[i*(2*WARP_SIZE + 1) + k0 + 4] = v.y; -#endif // NEW_MMA_AVAILABLE + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 4] = v.y; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } + constexpr int rows_per_warp = warp_size / 8; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4); + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / (MMQ_TILE_NE_K/4); if (need_check) { i = min(i, i_max); @@ -2297,18 +2739,21 @@ template static __device__ __forceinlin const int ls = ((bxi->scales_l[(threadIdx.x % 8)/2] >> (4*(threadIdx.x % 2))) & 0x0F) | (((bxi->scales_h >> (2*(threadIdx.x % 8))) & 0x03) << 4); -#ifdef NEW_MMA_AVAILABLE - x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * (ls - 32); +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * (ls - 32); #else - x_df[i*(WARP_SIZE/4) + i/4 + threadIdx.x % 8] = d * (ls - 32); -#endif // NEW_MMA_AVAILABLE + x_df[i*(MMQ_TILE_NE_K/4) + i/4 + threadIdx.x % 8] = d * (ls - 32); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } } -template +template static __device__ __forceinline__ void mmq_write_back_dp4a( const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst, const int stride, const int i_max, const int j_max) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -2318,32 +2763,40 @@ static __device__ __forceinline__ void mmq_write_back_dp4a( } #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; if (need_check && i > i_max) { continue; } - dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size]; } } } -template +template static __device__ __forceinline__ void mmq_write_back_mma( const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst, const int stride, const int i_max, const int j_max) { - typedef tile<16, 8, int> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int nwarps = mmq_get_nwarps_device(); + +#if defined(AMD_MFMA_AVAILABLE) + constexpr int tileC_IJ = mmq_get_granularity_device(0); + typedef tile tile_C; + constexpr int rows_per_warp = granularity; +#else + typedef tile<16, 8, int> tile_C; constexpr int rows_per_warp = 2 * granularity; +#endif constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. const int i0 = (threadIdx.y / ntx) * (ntx*tile_C::I); -#ifdef NEW_MMA_AVAILABLE +#if defined(NEW_MMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) static_assert(nwarps*tile_C::I == mmq_y, "nwarps*tile_C::I != mmq_y"); -#endif // NEW_MMA_AVAILABLE +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { @@ -2371,179 +2824,181 @@ static __device__ __forceinline__ void mmq_write_back_mma( // ------------------------------------------------------------------------------------------------------------------------------------- -template +template struct mmq_type_traits; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_1_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_1_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q2_K_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q2_K_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q2_K_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q2_K_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q3_K_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q3_K_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q5_K_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q5_K_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q6_K_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q6_K_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q6_K_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q6_K_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ2_XXS_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_xxs; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_xxs; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ2_XS_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_xs; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_xs; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ2_S_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_s; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_s; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ3_XXS_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_xxs; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_xxs; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ3_S_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_s; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_s; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ1_S_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq1_s; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq1_s; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ4_NL_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_nl; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_nl; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template -struct mmq_type_traits { +template +struct mmq_type_traits { static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ; - static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_xs; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; - static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_xs; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; -template +template static __device__ __forceinline__ void mul_mat_q_process_tile( const char * __restrict__ x, const int offset_x, const int * __restrict__ y, const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, const int stride_row_x, const int ncols_y, const int stride_col_dst, const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + constexpr int nwarps = mmq_get_nwarps_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); - constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; + constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; extern __shared__ int data_mul_mat_q[]; int * tile_y = data_mul_mat_q + mmq_x; - int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE); + int * tile_x = tile_y + GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size); -#ifdef NEW_MMA_AVAILABLE - constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot_mma; - constexpr mmq_write_back_t write_back = mmq_write_back_mma; +#if defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) + constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot_mma; + constexpr mmq_write_back_t write_back = mmq_write_back_mma; #else - constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot_dp4a; - constexpr mmq_write_back_t write_back = mmq_write_back_dp4a; -#endif // NEW_MMA_AVAILABLE + constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot_dp4a; + constexpr mmq_write_back_t write_back = mmq_write_back_dp4a; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) constexpr int blocks_per_iter = MMQ_ITER_K / qk; - float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; + float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f}; for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); @@ -2551,8 +3006,8 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( { const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll - for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { - int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; + for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) { + int l = l0 + threadIdx.y*warp_size + threadIdx.x; tile_y[l] = by0[l]; } @@ -2567,8 +3022,8 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( { const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll - for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { - int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; + for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) { + int l = l0 + threadIdx.y*warp_size + threadIdx.x; tile_y[l] = by0[l]; } @@ -2576,7 +3031,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( __syncthreads(); - vec_dot(tile_x, tile_y, sum, WARP_SIZE); + vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K); __syncthreads(); } @@ -2591,16 +3046,16 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598 -template +template #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) - __launch_bounds__(WARP_SIZE*nwarps, 2) + __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2) #endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #else #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA - __launch_bounds__(WARP_SIZE*nwarps, 1) + __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 1) #else - __launch_bounds__(WARP_SIZE*nwarps, 2) + __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2) #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( @@ -2616,6 +3071,9 @@ static __global__ void mul_mat_q( return; } + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); @@ -2627,10 +3085,10 @@ static __global__ void mul_mat_q( // For MoE the correct indices are loaded from ids_dst. extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory. #pragma unroll - for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { - const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) { + const int j = j0 + threadIdx.y*warp_size + threadIdx.x; - if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) { break; } @@ -2639,7 +3097,7 @@ static __global__ void mul_mat_q( __syncthreads(); // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: -#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { const int wt = blockIdx.z / nchannels_y; const int zt = blockIdx.z - wt*nchannels_y; @@ -2667,10 +3125,10 @@ static __global__ void mul_mat_q( // __syncthreads(); // There is no previous tile that could cause a race condition. #pragma unroll - for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { - const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) { + const int j = j0 + threadIdx.y*warp_size + threadIdx.x; - if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) { break; } @@ -2688,12 +3146,12 @@ static __global__ void mul_mat_q( const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = false; - mul_mat_q_process_tile + mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); return; } -#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA const int64_t blocks_per_ne00 = ncols_x / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; @@ -2745,10 +3203,10 @@ static __global__ void mul_mat_q( __syncthreads(); #pragma unroll - for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { - const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) { + const int j = j0 + threadIdx.y*warp_size + threadIdx.x; - if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) { break; } @@ -2766,7 +3224,7 @@ static __global__ void mul_mat_q( const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. - mul_mat_q_process_tile + mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); @@ -2812,10 +3270,10 @@ static __global__ void mul_mat_q( // The memory layout for the fixup buffer is always contiguous, therefore reset ids: __syncthreads(); #pragma unroll - for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { - const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*warp_size) { + const int j = j0 + threadIdx.y*warp_size + threadIdx.x; - if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + if (j0 + nwarps*warp_size > mmq_x && j >= mmq_x) { break; } @@ -2833,13 +3291,13 @@ static __global__ void mul_mat_q( const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. - mul_mat_q_process_tile + mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } -template +template static __global__ void mul_mat_q_stream_k_fixup( const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst, @@ -2849,7 +3307,10 @@ static __global__ void mul_mat_q_stream_k_fixup( constexpr int blocks_per_iter = MMQ_ITER_K / qk; const int64_t blocks_per_ne00 = ncols_x / qk; - float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + + float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f}; const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; const int nty = (nrows_x + mmq_y - 1) / mmq_y; @@ -2893,10 +3354,10 @@ static __global__ void mul_mat_q_stream_k_fixup( const int j = j0 + threadIdx.y; #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; - sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i]; + sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i]; } } @@ -2937,14 +3398,14 @@ static __global__ void mul_mat_q_stream_k_fixup( } #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; if (need_check && i > i_max) { continue; } - dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size]; } } return; @@ -2955,7 +3416,7 @@ static __global__ void mul_mat_q_stream_k_fixup( const int col_high = expert_bounds[zt + 1]; const int col_diff = col_high - col_low; - for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { + for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) { ids_dst_shared[j] = ids_dst[col_low + j]; } __syncthreads(); @@ -2975,14 +3436,14 @@ static __global__ void mul_mat_q_stream_k_fixup( } #pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + for (int i0 = 0; i0 < mmq_y; i0 += warp_size) { const int i = i0 + threadIdx.x; if (need_check && i > i_max) { continue; } - dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size]; } } } @@ -2996,13 +3457,13 @@ struct mmq_args { }; template -static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) { +static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc, const int warp_size, const int nwarps) { const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y); const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type); const size_t nbs_ids = mmq_x*sizeof(int); - const size_t nbs_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); + const size_t nbs_x = (new_mma_available(cc) || amd_mfma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq); - return nbs_ids + nbs_x + GGML_PAD(nbs_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); + return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int)); } template @@ -3010,14 +3471,16 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int id = ggml_cuda_get_device(); const int cc = ggml_cuda_info().devices[id].cc; const int nsm = ggml_cuda_info().devices[id].nsm; + const int warp_size = ggml_cuda_info().devices[id].warp_size; + const int nwarps = mmq_get_nwarps_host(cc); const int mmq_y = get_mmq_y_host(cc); - const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1); + const dim3 block_dims(warp_size, nwarps, 1); - const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc); + const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc, warp_size, nwarps); - CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); - CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); + CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); + CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x; @@ -3032,14 +3495,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a if (!args.use_stream_k) { if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> + mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> + mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, @@ -3059,8 +3522,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - - mul_mat_q<<>> + mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, @@ -3070,13 +3532,12 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a return; } - mul_mat_q_stream_k_fixup<<>> + mul_mat_q_stream_k_fixup<<>> (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - - mul_mat_q<<>> + mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, @@ -3086,7 +3547,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a return; } - mul_mat_q_stream_k_fixup<<>> + mul_mat_q_stream_k_fixup<<>> (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } @@ -3094,9 +3555,11 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a template void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const size_t smpbo = ggml_cuda_info().devices[id].smpbo; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; + const int warp_size = ggml_cuda_info().devices[id].warp_size; + const int nwarps = mmq_get_nwarps_host(cc); const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); @@ -3107,7 +3570,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) { const int granularity = mmq_get_granularity_host(mmq_x, cc); - if (mmq_x % granularity != 0 || mmq_get_nbytes_shared(mmq_x, mmq_y, cc) > smpbo) { + if (mmq_x % granularity != 0 || mmq_get_nbytes_shared(mmq_x, mmq_y, cc, warp_size, nwarps) > smpbo) { continue; } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 184d445f5c0..56e59a058f9 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -160,7 +160,19 @@ #endif #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) -#define CDNA +#define CDNA // For the entire family +#endif + +#if defined(__gfx942__) +#define CDNA3 +#endif + +#if defined(__gfx90a__) +#define CDNA2 +#endif + +#if defined(__gfx908__) +#define CDNA1 #endif #if defined(__GFX12__) From 5693b857d2979ca5c0d56c8b3a135b49ad8eab26 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 27 Jul 2025 04:05:34 -0500 Subject: [PATCH 0161/1348] vulkan: skip empty set_rows to avoid invalid API usage (llama/14860) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 1a7a381ce59..de108ccb40b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -7882,6 +7882,13 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); + // Skip empty skip_rows operations. For most ops the empty check at the start + // of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst + // with empty srcs. + if (ggml_is_empty(src0) || ggml_is_empty(src1)) { + return; + } + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, From d96f4d8ea145ac9f9bc6b843234437e05189a993 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Sun, 27 Jul 2025 12:04:33 +0200 Subject: [PATCH 0162/1348] vulkan : add fp16 support for the conv_2d kernel (llama/14872) * add f16 to conv_2d testing * weaken conv2d test error threshold --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 24 +++++++++++++++---- .../vulkan-shaders/vulkan-shaders-gen.cpp | 1 + 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index de108ccb40b..a99b1c73130 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -484,6 +484,7 @@ struct vk_device_struct { vk_pipeline pipeline_rwkv_wkv7_f32; vk_pipeline pipeline_opt_step_adamw_f32; vk_pipeline pipeline_conv2d_f32; + vk_pipeline pipeline_conv2d_f16_f32; vk_pipeline pipeline_conv2d_dw_whcn_f32; vk_pipeline pipeline_conv2d_dw_cwhn_f32; @@ -3074,12 +3075,21 @@ static void ggml_vk_load_shaders(vk_device& device) { device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true); + ggml_vk_create_pipeline( + device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3, + sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, + { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true); } else { ggml_vk_create_pipeline( device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, false); + ggml_vk_create_pipeline( + device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3, + sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 }, + { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, + false); } ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); @@ -6958,9 +6968,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_CONV_2D: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { - return ctx->device->pipeline_conv2d_f32; + if (src0->type == GGML_TYPE_F32) { + return ctx->device->pipeline_conv2d_f32; + } else if (src0->type == GGML_TYPE_F16) { + return ctx->device->pipeline_conv2d_f16_f32; + } } return nullptr; case GGML_OP_CONV_2D_DW: @@ -8185,13 +8199,13 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float)); @@ -10874,7 +10888,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm const vk_device& device = ggml_vk_get_device(ctx->device); bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE; // Channel-contiguous format is not supported yet. - return (op->src[0]->type == GGML_TYPE_F32 && + return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 598f0370fb8..f9f0c95b8b2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -656,6 +656,7 @@ void process_shaders() { string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}}); + string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}}); string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); From 5b4646df1aa8c3c5333c5240d73cad8a9904fa4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Jul 2025 08:43:53 +0300 Subject: [PATCH 0163/1348] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 9b223827afb..5be1dce60aa 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -a0361ace408ba2c30820deb39e793ad9ed787a85 +b96890f3ab5ffbdbe56bc126df5366c34bd08d39 From d0a9d8c7f8f7b91c51d77bbaa394b915f79cde6b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Jul 2025 10:09:47 +0300 Subject: [PATCH 0164/1348] talk-llama : sync llama.cpp --- examples/talk-llama/llama-arch.cpp | 106 ++ examples/talk-llama/llama-arch.h | 5 + examples/talk-llama/llama-batch.cpp | 146 +- examples/talk-llama/llama-batch.h | 42 +- examples/talk-llama/llama-chat.cpp | 44 +- examples/talk-llama/llama-chat.h | 2 + examples/talk-llama/llama-context.cpp | 290 +-- examples/talk-llama/llama-context.h | 42 +- examples/talk-llama/llama-cparams.h | 5 +- examples/talk-llama/llama-graph.cpp | 242 ++- examples/talk-llama/llama-graph.h | 219 ++- examples/talk-llama/llama-hparams.cpp | 40 + examples/talk-llama/llama-hparams.h | 12 +- .../llama-kv-cache-unified-iswa.cpp | 16 +- .../talk-llama/llama-kv-cache-unified-iswa.h | 3 + .../talk-llama/llama-kv-cache-unified.cpp | 1000 ++++++---- examples/talk-llama/llama-kv-cache-unified.h | 120 +- examples/talk-llama/llama-memory-hybrid.cpp | 1 + .../talk-llama/llama-memory-recurrent.cpp | 17 +- examples/talk-llama/llama-model.cpp | 1605 +++++++++++++---- examples/talk-llama/llama-model.h | 7 +- examples/talk-llama/llama-quant.cpp | 3 +- examples/talk-llama/llama-vocab.cpp | 371 +++- examples/talk-llama/llama-vocab.h | 2 + examples/talk-llama/llama.h | 22 +- examples/talk-llama/unicode.cpp | 207 +++ examples/talk-llama/unicode.h | 2 + 27 files changed, 3571 insertions(+), 1000 deletions(-) diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index e63ab284bc3..062a9977678 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -34,6 +34,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHIMOE, "phimoe" }, { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_PLAMO2, "plamo2" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, @@ -67,6 +68,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, + { LLM_ARCH_EXAONE4, "exaone4" }, { LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" }, { LLM_ARCH_RWKV7, "rwkv7" }, @@ -81,9 +83,11 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DOTS1, "dots1" }, { LLM_ARCH_ARCEE, "arcee" }, { LLM_ARCH_ERNIE4_5, "ernie4_5" }, + { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" }, { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_LFM2, "lfm2" }, + { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -784,6 +788,36 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_PLAMO2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" }, + { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" }, + { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_CODESHELL, { @@ -1477,6 +1511,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_EXAONE4, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + } + }, { LLM_ARCH_RWKV6, { @@ -1793,6 +1847,31 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_ERNIE4_5_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + }, + }, { LLM_ARCH_HUNYUAN_MOE, { @@ -1854,6 +1933,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, } }, + { + LLM_ARCH_DREAM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2094,6 +2190,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { switch (arch) { case LLM_ARCH_JAMBA: case LLM_ARCH_FALCON_H1: + case LLM_ARCH_PLAMO2: case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_LFM2: return true; @@ -2101,3 +2198,12 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { return false; } } + +bool llm_arch_is_diffusion(const llm_arch & arch) { + switch (arch) { + case LLM_ARCH_DREAM: + return true; + default: + return false; + } +} diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index 1f973259524..d09b7d7810b 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -38,6 +38,7 @@ enum llm_arch { LLM_ARCH_PHI3, LLM_ARCH_PHIMOE, LLM_ARCH_PLAMO, + LLM_ARCH_PLAMO2, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, @@ -71,6 +72,7 @@ enum llm_arch { LLM_ARCH_JAIS, LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, + LLM_ARCH_EXAONE4, LLM_ARCH_RWKV6, LLM_ARCH_RWKV6QWEN2, LLM_ARCH_RWKV7, @@ -85,9 +87,11 @@ enum llm_arch { LLM_ARCH_DOTS1, LLM_ARCH_ARCEE, LLM_ARCH_ERNIE4_5, + LLM_ARCH_ERNIE4_5_MOE, LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_SMOLLM3, LLM_ARCH_LFM2, + LLM_ARCH_DREAM, LLM_ARCH_UNKNOWN, }; @@ -478,3 +482,4 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor); bool llm_arch_is_recurrent(const llm_arch & arch); bool llm_arch_is_hybrid (const llm_arch & arch); +bool llm_arch_is_diffusion(const llm_arch & arch); diff --git a/examples/talk-llama/llama-batch.cpp b/examples/talk-llama/llama-batch.cpp index 3bc8554e51c..a546063c0a7 100644 --- a/examples/talk-llama/llama-batch.cpp +++ b/examples/talk-llama/llama-batch.cpp @@ -27,6 +27,7 @@ bool llama_batch_allocr::init( const llama_vocab & vocab, const llama_memory_i * memory, uint32_t n_embd, + uint32_t n_seq_max, bool output_all) { clear(); @@ -40,6 +41,11 @@ bool llama_batch_allocr::init( // validate input batch // + if (n_seq_max > LLAMA_MAX_SEQ) { + LLAMA_LOG_ERROR("%s: n_seq_max = %d > %d\n", __func__, n_seq_max, LLAMA_MAX_SEQ); + return false; + } + if (batch.token) { for (int32_t i = 0; i < batch.n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) { @@ -52,8 +58,8 @@ bool llama_batch_allocr::init( if (batch.seq_id) { for (int32_t i = 0; i < batch.n_tokens; ++i) { for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) { - if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) { - LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ); + if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) { + LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max); return false; } } @@ -86,7 +92,7 @@ bool llama_batch_allocr::init( // initialize the starting position for each sequence based on the positions in the memory llama_pos p0[LLAMA_MAX_SEQ]; - for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < n_seq_max; ++s) { if (!memory) { // if no memory -> start from 0 p0[s] = 0; @@ -143,13 +149,16 @@ bool llama_batch_allocr::init( // compute stats // - this->n_embd = n_embd; + this->n_embd = n_embd; + this->n_seq_max = n_seq_max; // count the outputs in this batch for (int32_t i = 0; i < batch.n_tokens; ++i) { n_outputs += batch.logits[i] != 0; } + has_cpl = false; + // determine coupled sequences // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them for (int32_t i = 0; i < batch.n_tokens; ++i) { @@ -189,7 +198,7 @@ bool llama_batch_allocr::init( seq_set_map[cur].push_back(i); } - for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < n_seq_max; ++s) { if (seq_set_unq.test(s)) { seq_idx[s] = seq_id_unq.size(); seq_id_unq.push_back(s); @@ -201,7 +210,7 @@ bool llama_batch_allocr::init( LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__); llama_ubatch ubatch { - /*.equal_seqs =*/ false, + /*.b_equal_seqs =*/ false, /*.n_tokens =*/ (uint32_t) batch.n_tokens, /*.n_seq_tokens =*/ (uint32_t) 1, /*.n_seqs =*/ (uint32_t) batch.n_tokens, @@ -214,6 +223,7 @@ bool llama_batch_allocr::init( /*.seq_id_unq =*/ this->seq_id_unq.data(), /*.seq_idx =*/ this->seq_idx.data(), /*.output =*/ batch.logits, + /*.data =*/ {}, }; ubatch_print(ubatch, debug); @@ -241,7 +251,7 @@ bool llama_batch_allocr::init( // consistency checks // - for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < n_seq_max; ++s) { if (seq_pos[s].empty()) { continue; } @@ -284,8 +294,8 @@ bool llama_batch_allocr::init( } if (memory) { - for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) { - for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) { + for (uint32_t s0 = 0; s0 < n_seq_max; ++s0) { + for (uint32_t s1 = 0; s1 < n_seq_max; ++s1) { if (seq_cpl[s0][s1]) { if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) || memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) { @@ -316,12 +326,12 @@ bool llama_batch_allocr::init( // { seq_set_t cur_seq_set[LLAMA_MAX_SEQ]; - for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < n_seq_max; ++s) { cur_seq_set[s].set(); } llama_pos cur_seq_pos[LLAMA_MAX_SEQ]; - for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < n_seq_max; ++s) { cur_seq_pos[s] = -1; } @@ -357,39 +367,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t clear(); split_reset(); - ubatches.emplace_back(); + auto udata = std::make_shared(); - auto & ubatch = ubatches.back(); - - ubatch.token .resize(n_tokens); - ubatch.embd .clear(); - ubatch.pos .resize(n_tokens); - ubatch.n_seq_id .resize(n_tokens); - ubatch.seq_id .resize(n_tokens); - ubatch.seq_id_unq.resize(0); - ubatch.seq_idx .resize(LLAMA_MAX_SEQ, -1); - ubatch.output .resize(n_tokens); + udata->token .resize(n_tokens); + udata->embd .clear(); + udata->pos .resize(n_tokens); + udata->n_seq_id .resize(n_tokens); + udata->seq_id .resize(n_tokens); + udata->seq_id_unq.resize(0); + udata->seq_idx .resize(LLAMA_MAX_SEQ, -1); + udata->output .resize(n_tokens); for (uint32_t s = 0; s < n_seqs; ++s) { - ubatch.seq_idx[s] = s; - ubatch.seq_id_unq.push_back(s); + udata->seq_idx[s] = s; + udata->seq_id_unq.push_back(s); } llama_ubatch res { - /*.equal_seqs =*/ true, + /*.b_equal_seqs =*/ true, /*.n_tokens =*/ n_tokens, /*.n_seq_tokens =*/ n_seq_tokens, /*.n_seqs =*/ n_seqs, /*.n_seqs_unq =*/ n_seqs, - /*.token =*/ ubatch.token.data(), + /*.token =*/ udata->token.data(), /*.embd =*/ nullptr, - /*.pos =*/ ubatch.pos.data(), - /*.n_seq_id =*/ ubatch.n_seq_id.data(), - /*.seq_id =*/ ubatch.seq_id.data(), - /*.seq_id_unq =*/ ubatch.seq_id_unq.data(), - /*.seq_idx =*/ ubatch.seq_idx.data(), - /*.output =*/ ubatch.output.data(), + /*.pos =*/ udata->pos.data(), + /*.n_seq_id =*/ udata->n_seq_id.data(), + /*.seq_id =*/ udata->seq_id.data(), + /*.seq_id_unq =*/ udata->seq_id_unq.data(), + /*.seq_idx =*/ udata->seq_idx.data(), + /*.output =*/ udata->output.data(), + /*.data =*/ std::move(udata), }; return res; @@ -430,8 +439,6 @@ void llama_batch_allocr::split_reset() { used.clear(); used.resize(get_n_tokens(), false); - - ubatches.clear(); } llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) { @@ -646,78 +653,77 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u assert(n_tokens%n_seqs == 0); - ubatches.emplace_back(); - - auto & ubatch = ubatches.back(); + auto udata = std::make_shared(); const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1; const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0; const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur; - ubatch.token .resize(n_tokens); - ubatch.embd .resize(n_embd_all); - ubatch.pos .resize(n_pos_all); - ubatch.n_seq_id .resize(n_tokens); - ubatch.seq_id .resize(n_tokens); - ubatch.seq_id_unq.resize(0); - ubatch.seq_idx .resize(LLAMA_MAX_SEQ, -1); - ubatch.output .resize(n_tokens); + udata->token .resize(n_tokens); + udata->embd .resize(n_embd_all); + udata->pos .resize(n_pos_all); + udata->n_seq_id .resize(n_tokens); + udata->seq_id .resize(n_tokens); + udata->seq_id_unq.resize(0); + udata->seq_idx .resize(LLAMA_MAX_SEQ, -1); + udata->output .resize(n_tokens); seq_set_t seq_set_unq; for (size_t i = 0; i < idxs.size(); ++i) { if (batch.token) { - ubatch.token[i] = batch.token[idxs[i]]; + udata->token[i] = batch.token[idxs[i]]; } if (batch.embd) { - memcpy(ubatch.embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float)); + memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float)); } for (int j = 0; j < n_pos_cur; ++j) { - ubatch.pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]]; + udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]]; } - ubatch.n_seq_id[i] = batch.n_seq_id[idxs[i]]; - ubatch.seq_id[i] = batch.seq_id[idxs[i]]; - ubatch.output[i] = batch.logits[idxs[i]]; + udata->n_seq_id[i] = batch.n_seq_id[idxs[i]]; + udata->seq_id[i] = batch.seq_id[idxs[i]]; + udata->output[i] = batch.logits[idxs[i]]; - for (int s = 0; s < ubatch.n_seq_id[i]; ++s) { - seq_set_unq.set(ubatch.seq_id[i][s]); + for (int s = 0; s < udata->n_seq_id[i]; ++s) { + seq_set_unq.set(udata->seq_id[i][s]); } - if (ubatch.output[i]) { + if (udata->output[i]) { out_ids.push_back(idxs[i]); } } - for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < n_seq_max; ++s) { if (seq_set_unq.test(s)) { - ubatch.seq_idx[s] = ubatch.seq_id_unq.size(); - ubatch.seq_id_unq.push_back(s); + udata->seq_idx[s] = udata->seq_id_unq.size(); + udata->seq_id_unq.push_back(s); } } llama_ubatch res { - /*.equal_seqs =*/ equal_seqs, + /*.b_equal_seqs =*/ equal_seqs, /*.n_tokens =*/ n_tokens, /*.n_seq_tokens =*/ n_tokens/n_seqs, /*.n_seqs =*/ n_seqs, - /*.n_seqs_unq =*/ (uint32_t) ubatch.seq_id_unq.size(), - - /*.token =*/ batch.token ? ubatch.token.data() : nullptr, - /*.embd =*/ batch.embd ? ubatch.embd.data() : nullptr, - /*.pos =*/ ubatch.pos.data(), - /*.n_seq_id =*/ ubatch.n_seq_id.data(), - /*.seq_id =*/ ubatch.seq_id.data(), - /*.seq_id_unq =*/ ubatch.seq_id_unq.data(), - /*.seq_idx =*/ ubatch.seq_idx.data(), - /*.output =*/ ubatch.output.data(), + /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(), + + /*.token =*/ batch.token ? udata->token.data() : nullptr, + /*.embd =*/ batch.embd ? udata->embd.data() : nullptr, + /*.pos =*/ udata->pos.data(), + /*.n_seq_id =*/ udata->n_seq_id.data(), + /*.seq_id =*/ udata->seq_id.data(), + /*.seq_id_unq =*/ udata->seq_id_unq.data(), + /*.seq_idx =*/ udata->seq_idx.data(), + /*.output =*/ udata->output.data(), + /*.data =*/ std::move(udata), }; if (debug > 0) { - LLAMA_LOG_DEBUG("%s: added ubatch %d to split:\n", __func__, (int) ubatches.size() - 1); + LLAMA_LOG_DEBUG("%s: added ubatch to split:\n", __func__); ubatch_print(res, debug); } @@ -727,7 +733,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) { if (debug > 0) { - LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs); + LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs()); LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, ubatch.n_tokens); LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d\n", __func__, ubatch.n_seq_tokens); LLAMA_LOG_DEBUG("%s: n_seqs = %d\n", __func__, ubatch.n_seqs); diff --git a/examples/talk-llama/llama-batch.h b/examples/talk-llama/llama-batch.h index 3420803ff94..d563adc66aa 100644 --- a/examples/talk-llama/llama-batch.h +++ b/examples/talk-llama/llama-batch.h @@ -8,12 +8,17 @@ #include #include #include +#include #include // keep this struct lightweight -// it points to data in `llama_batch_allocr` struct llama_ubatch { - bool equal_seqs; + bool equal_seqs() const { + return b_equal_seqs != 0; + } + + uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment + // otherwise address sanitizer complains // TODO: whole_seqs for embeddings? uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs) @@ -34,6 +39,20 @@ struct llama_ubatch { llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx int8_t * output; // [n_tokens] | i | - + + struct data_t { + std::vector token; + std::vector embd; + std::vector pos; + std::vector n_seq_id; + std::vector seq_id; + std::vector seq_id_unq; + std::vector seq_idx; + std::vector output; + }; + + // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data + std::shared_ptr data; }; // a helper for sanitizing, fulfilling and splitting a batch @@ -48,6 +67,7 @@ class llama_batch_allocr { const llama_vocab & vocab, const llama_memory_i * memory, uint32_t n_embd, + uint32_t n_seq_max, bool output_all); const llama_batch & get_batch() const; @@ -100,6 +120,7 @@ class llama_batch_allocr { const uint32_t n_pos_per_embd; uint32_t n_embd; + uint32_t n_seq_max; uint32_t n_outputs; std::array seq_id_0 = { 0 }; // default sequence id @@ -115,7 +136,7 @@ class llama_batch_allocr { using seq_cpl_t = std::vector; // helper flag to quickly determine if there are any coupled sequences in the batch - bool has_cpl; + bool has_cpl = false; std::vector seq_pos; // seq_pos[s]: the set of positions in sequence s std::vector seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1 @@ -135,20 +156,5 @@ class llama_batch_allocr { // used[i] indicates if token i has already been used in a previous ubatch std::vector used; - // llama_ubatch points to this data: - struct ubatch { - std::vector token; - std::vector embd; - std::vector pos; - std::vector n_seq_id; - std::vector seq_id; - std::vector seq_id_unq; - std::vector seq_idx; - std::vector output; - }; - - // current splitting state: - std::vector ubatches; - int debug; }; diff --git a/examples/talk-llama/llama-chat.cpp b/examples/talk-llama/llama-chat.cpp index cbc19d3c40c..d34bb26878c 100644 --- a/examples/talk-llama/llama-chat.cpp +++ b/examples/talk-llama/llama-chat.cpp @@ -56,6 +56,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, + { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 }, { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, { "granite", LLM_CHAT_TEMPLATE_GRANITE }, { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, @@ -65,6 +66,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, + { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -167,10 +169,13 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) { return LLM_CHAT_TEMPLATE_DEEPSEEK_3; } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) { + if (tmpl_contains("[|tool|]")) { + return LLM_CHAT_TEMPLATE_EXAONE_4; + } // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb // EXAONE-3.0-7.8B-Instruct return LLM_CHAT_TEMPLATE_EXAONE_3; - } else if (tmpl_contains("rwkv-world")) { + } else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) { return LLM_CHAT_TEMPLATE_RWKV_WORLD; } else if (tmpl_contains("<|start_of_role|>")) { return LLM_CHAT_TEMPLATE_GRANITE; @@ -188,6 +193,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_DOTS1; } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) { return LLM_CHAT_TEMPLATE_HUNYUAN_MOE; + } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { + return LLM_CHAT_TEMPLATE_KIMI_K2; } return LLM_CHAT_TEMPLATE_UNKNOWN; } @@ -529,6 +536,22 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "[|assistant|]"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) { + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n"; + } else if (role == "user") { + ss << "[|user|]" << trim(message->content) << "\n"; + } else if (role == "assistant") { + ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n"; + } else if (role == "tool") { + ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n"; + } + } + if (add_ass) { + ss << "[|assistant|]"; + } } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { // this template requires the model to have "\n\n" as EOT token for (size_t i = 0; i < chat.size(); i++) { @@ -680,6 +703,25 @@ int32_t llm_chat_apply_template( ss << "<|startoftext|>" << message->content << "<|extra_0|>"; } } + } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) { + // moonshotai/Kimi-K2-Instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "<|im_system|>system<|im_middle|>"; + } else if (role == "user") { + ss << "<|im_user|>user<|im_middle|>"; + } else if (role == "assistant") { + ss << "<|im_assistant|>assistant<|im_middle|>"; + } else if (role == "tool") { + ss << "<|im_system|>tool<|im_middle|>"; + } + + ss << message->content << "<|im_end|>"; + } + if (add_ass) { + ss << "<|im_assistant|>assistant<|im_middle|>"; + } } else { // template not supported return -1; diff --git a/examples/talk-llama/llama-chat.h b/examples/talk-llama/llama-chat.h index b621fda2816..6968a19fbe1 100644 --- a/examples/talk-llama/llama-chat.h +++ b/examples/talk-llama/llama-chat.h @@ -35,6 +35,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_GLMEDGE, LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_EXAONE_3, + LLM_CHAT_TEMPLATE_EXAONE_4, LLM_CHAT_TEMPLATE_RWKV_WORLD, LLM_CHAT_TEMPLATE_GRANITE, LLM_CHAT_TEMPLATE_GIGACHAT, @@ -45,6 +46,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_DOTS1, LLM_CHAT_TEMPLATE_HUNYUAN_MOE, + LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index 06e93b19cbf..9e77fe6d869 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -98,10 +98,20 @@ llama_context::llama_context( LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); cparams.n_batch = GGML_KQ_MASK_PAD; } - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); cparams.op_offload = params.op_offload; + cparams.kv_unified = params.kv_unified; + + { + const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); + supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false; + + if (!supports_set_rows && !cparams.kv_unified) { + LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__); + cparams.kv_unified = true; + } + } const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -112,6 +122,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false"); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); @@ -227,8 +238,8 @@ llama_context::llama_context( LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); - // buffer used to store the computation graph and the tensor meta data - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + gf_res_prev.reset(new llm_graph_result(max_nodes)); + gf_res_reserve.reset(new llm_graph_result(max_nodes)); // TODO: move these checks to ggml_backend_sched // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary @@ -267,7 +278,7 @@ llama_context::llama_context( // reserve worst-case graph if (!hparams.vocab_only && memory) { - const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs); @@ -287,7 +298,7 @@ llama_context::llama_context( cross.v_embd.clear(); - // reserve pp graph first so that buffers are only allocated once + // reserve pp (prompt processing) graph first so that buffers are only allocated once { auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); if (!gf) { @@ -298,9 +309,9 @@ llama_context::llama_context( n_nodes_pp = ggml_graph_n_nodes(gf); } - // reserve with tg graph to get the number of splits and nodes + // reserve with tg (token generation) graph to get the number of splits and nodes { - auto * gf = graph_reserve(1, 1, 1, mctx.get()); + auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get()); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -311,6 +322,10 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { + // TODO: not sure if the following graph would be worster case for multi-stream KV caches: + // + // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get()); + // auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); @@ -388,10 +403,6 @@ ggml_backend_sched_t llama_context::get_sched() const { return sched.get(); } -ggml_context * llama_context::get_ctx_compute() const { - return ctx_compute.get(); -} - uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } @@ -463,6 +474,11 @@ bool llama_context::kv_self_update(bool optimize) { } } + // reset the previous graph result to make sure that it won't be reused + // TODO: change the mctx->apply() to return information if a graph reserve is needed + // reset the graph result only if the memory module did reset the scheduler + gf_res_prev->reset(); + if (!mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__); } @@ -475,7 +491,7 @@ bool llama_context::kv_self_update(bool optimize) { throw std::runtime_error("failed to initialize memory context"); } - const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); @@ -492,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const { } float * llama_context::get_logits() { + output_reorder(); + return logits; } float * llama_context::get_logits_ith(int32_t i) { int64_t j = -1; + output_reorder(); + try { if (logits == nullptr) { throw std::runtime_error("no logits"); @@ -534,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) { } float * llama_context::get_embeddings() { + output_reorder(); + return embd; } float * llama_context::get_embeddings_ith(int32_t i) { int64_t j = -1; + output_reorder(); + try { if (embd == nullptr) { throw std::runtime_error("no embeddings"); @@ -678,38 +702,59 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { +llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); ret = GGML_STATUS_FAILED; return nullptr; } - auto * gf = graph_init(); - if (!gf) { - LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + auto * res = gf_res_prev.get(); + auto * gf = res->get_gf(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + // the new graph parameters + // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters + const auto gparams = graph_params(res, ubatch, mctx, gtype); - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + if (res->can_reuse(gparams)) { + //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); - if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); - ret = GGML_STATUS_ALLOC_FAILED; - return nullptr; + n_reused++; + } else { + res->reset(); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + //const auto t_start_us = ggml_time_us(); + + gf = model.build_graph(gparams); + + //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); + ret = GGML_STATUS_ALLOC_FAILED; + return nullptr; + } } - res->set_inputs(&ubatch); + // set the input data for the input tensors + { + //const auto t_start_us = ggml_time_us(); + + res->set_inputs(&ubatch); + + //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + } - const auto status = graph_compute(gf, ubatch.n_tokens > 1); + const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); ret = status; @@ -731,16 +776,19 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd; + const int32_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 - if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) { + if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return -1; } const uint32_t n_tokens = balloc->get_n_tokens(); + // [TAG_NO_CACHE_PAD] + // TODO: add new split mode where we pad the input sequences so that ubatch.equal_seqs == true const llama_ubatch ubatch = balloc->split_simple(n_tokens); // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot @@ -767,9 +815,6 @@ int llama_context::encode(const llama_batch & batch_inp) { n_outputs = n_tokens; - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - const auto causal_attn_org = cparams.causal_attn; // always use non-causal attention for encoder graphs @@ -778,7 +823,7 @@ int llama_context::encode(const llama_batch & batch_inp) { cparams.causal_attn = false; ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); cparams.causal_attn = causal_attn_org; @@ -791,10 +836,20 @@ int llama_context::encode(const llama_batch & batch_inp) { } } + auto * t_logits = res->get_logits(); auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + // extract logits + if (logits && t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float)); + } + // extract embeddings - if (t_embd) { + if (embd && t_embd) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); @@ -844,9 +899,11 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + if (!supports_set_rows) { + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + } // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { @@ -899,7 +956,7 @@ int llama_context::decode(const llama_batch & batch_inp) { // when computing embeddings, all tokens are output const bool output_all = cparams.embeddings; - if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) { + if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return -1; } @@ -927,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) { // TODO: this clear of the buffer can easily be forgotten - need something better embd_seq.clear(); + output_swaps.clear(); bool did_optimize = false; @@ -1005,11 +1063,8 @@ int llama_context::decode(const llama_batch & batch_inp) { n_outputs = n_outputs_new; } - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache @@ -1149,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) { // make the outputs have the same order they had in the user-provided batch // note: this is mostly relevant for recurrent models atm if (!sorted_output) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint64_t n_embd = model.hparams.n_embd; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); // TODO: is there something more efficient which also minimizes swaps? @@ -1167,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) { continue; } std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } + + // remember the swaps and apply them lazily upon logits/embeddings access + output_swaps.push_back({ i, j_min }); } std::fill(output_ids.begin(), output_ids.end(), -1); @@ -1190,9 +1235,11 @@ int llama_context::decode(const llama_batch & batch_inp) { // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + if (!supports_set_rows) { + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + } return 0; } @@ -1271,24 +1318,40 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } +void llama_context::output_reorder() { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint64_t n_embd = model.hparams.n_embd; + + for (uint32_t s = 0; s < output_swaps.size(); ++s) { + const uint32_t i0 = output_swaps[s].i0; + const uint32_t i1 = output_swaps[s].i1; + + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]); + } + } + + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]); + } + } + } + + output_swaps.clear(); +} + // // graph // -int32_t llama_context::graph_max_nodes() const { - return std::max(65536, 5*model.n_tensors()); +uint32_t llama_context::graph_max_nodes() const { + return std::max(1024u, 8u*model.n_tensors()); } -ggml_cgraph * llama_context::graph_init() { - ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx_compute.reset(ggml_init(params)); - - return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); +llm_graph_result * llama_context::get_gf_res_reserve() const { + return static_cast(gf_res_reserve.get()); } ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) { @@ -1301,6 +1364,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); } + ggml_backend_sched_reset(sched.get()); + + // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that + gf_res_prev->reset(); + // store the n_outputs as it is, and restore it afterwards // TODO: not sure if needed, might simplify in the future by removing this const auto save_n_outputs = this->n_outputs; @@ -1310,17 +1378,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u llama_batch_allocr balloc(model.hparams.n_pos_per_embd()); llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx); + auto * res = gf_res_reserve.get(); - this->n_outputs = save_n_outputs; + const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__); - return nullptr; - } + res->reset(); - ggml_backend_sched_reset(sched.get()); + auto * gf = model.build_graph(gparams); + + this->n_outputs = save_n_outputs; // initialize scheduler with the specified graph if (!ggml_backend_sched_reserve(sched.get(), gf)) { @@ -1331,28 +1397,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u return gf; } -llm_graph_result_ptr llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx) { - return model.build_graph( - { - /*.ctx =*/ ctx, - /*.arch =*/ model.arch, - /*.hparams =*/ model.hparams, - /*.cparams =*/ cparams, - /*.ubatch =*/ ubatch, - /*.sched =*/ sched.get(), - /*.backend_cpu =*/ backend_cpu, - /*.cvec =*/ &cvec, - /*.loras =*/ &loras, - /*.mctx =*/ mctx, - /*.cross =*/ &cross, - /*.n_outputs =*/ n_outputs, - /*.cb =*/ graph_get_cb(), - }, gf, gtype); +llm_graph_params llama_context::graph_params( + llm_graph_result * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const { + return { + /*.arch =*/ model.arch, + /*.hparams =*/ model.hparams, + /*.cparams =*/ cparams, + /*.ubatch =*/ ubatch, + /*.gtype =*/ gtype, + /*.sched =*/ sched.get(), + /*.backend_cpu =*/ backend_cpu, + /*.cvec =*/ &cvec, + /*.loras =*/ &loras, + /*.mctx =*/ mctx, + /*.cross =*/ &cross, + /*.n_outputs =*/ n_outputs, + /*.cb =*/ graph_get_cb(), + /*.res =*/ res, + }; } ggml_status llama_context::graph_compute( @@ -1930,6 +1995,7 @@ llama_perf_context_data llama_context::perf_get_data() const { data.t_eval_ms = 1e-3 * t_eval_us; data.n_p_eval = std::max(1, n_p_eval); data.n_eval = std::max(1, n_eval); + data.n_reused = std::max(0, n_reused); return data; } @@ -1938,6 +2004,7 @@ void llama_context::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; t_p_eval_us = n_p_eval = 0; + n_reused = 0; } // @@ -2028,7 +2095,7 @@ void llama_context::opt_epoch_iter( batch.logits [pos_batch] = true; } - if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) { + if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return; } @@ -2064,8 +2131,13 @@ void llama_context::opt_epoch_iter( break; } - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get()); + auto * res = gf_res_prev.get(); + + const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + + res->reset(); + + auto * gf = model.build_graph(gparams); struct ggml_context * ctx_compute_opt; { @@ -2187,6 +2259,7 @@ llama_context_params llama_context_default_params() { /*.no_perf =*/ true, /*.op_offload =*/ true, /*.swa_full =*/ true, + /*.kv_unified =*/ false, }; return result; @@ -2807,6 +2880,7 @@ void llama_perf_context_print(const llama_context * ctx) { LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/examples/talk-llama/llama-context.h b/examples/talk-llama/llama-context.h index 9ce05715a8c..5c3a1c09886 100644 --- a/examples/talk-llama/llama-context.h +++ b/examples/talk-llama/llama-context.h @@ -35,8 +35,6 @@ struct llama_context { ggml_backend_sched_t get_sched() const; - ggml_context * get_ctx_compute() const; - uint32_t n_ctx() const; uint32_t n_ctx_per_seq() const; uint32_t n_batch() const; @@ -96,7 +94,7 @@ struct llama_context { // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation // returns nullptr only if ret != GGML_STATUS_SUCCESS - llm_graph_result_ptr process_ubatch( + llm_graph_result * process_ubatch( const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, @@ -183,15 +181,17 @@ struct llama_context { // Returns max number of outputs for which space was reserved. uint32_t output_reserve(int32_t n_outputs); + void output_reorder(); + // // graph // public: - int32_t graph_max_nodes() const; + uint32_t graph_max_nodes() const; - // zero-out inputs and create the ctx_compute for the compute graph - ggml_cgraph * graph_init(); + // can reuse the llm_graph_result instance of the context (for example to update a memory module) + llm_graph_result * get_gf_res_reserve() const; // returns the result of ggml_backend_sched_graph_compute_async execution ggml_status graph_compute(ggml_cgraph * gf, bool batched); @@ -200,12 +200,11 @@ struct llama_context { ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); private: - llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx); + llm_graph_params graph_params( + llm_graph_result * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const; llm_graph_cb graph_get_cb() const; @@ -253,13 +252,18 @@ struct llama_context { std::vector output_ids; // map batch token positions to ids of the logits and embd buffers + struct swap_info { + uint32_t i0; + uint32_t i1; + }; + + std::vector output_swaps; + ggml_backend_sched_ptr sched; ggml_backend_t backend_cpu = nullptr; std::vector backends; - ggml_context_ptr ctx_compute; - // training ggml_opt_context_t opt_ctx = nullptr; @@ -275,14 +279,18 @@ struct llama_context { std::vector backend_ptrs; std::vector backend_buft; - // memory buffers used to evaluate the model - std::vector buf_compute_meta; + llm_graph_result_ptr gf_res_prev; + llm_graph_result_ptr gf_res_reserve; // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; bool has_evaluated_once = false; + // env: LLAMA_SET_ROWS (temporary) + // ref: https://github.com/ggml-org/llama.cpp/pull/14285 + bool supports_set_rows = false; + // perf mutable int64_t t_start_us = 0; mutable int64_t t_load_us = 0; @@ -294,4 +302,6 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls + + mutable int32_t n_reused = 0; // number of times the previous graph was reused }; diff --git a/examples/talk-llama/llama-cparams.h b/examples/talk-llama/llama-cparams.h index 118615d5bd2..38750affc50 100644 --- a/examples/talk-llama/llama-cparams.h +++ b/examples/talk-llama/llama-cparams.h @@ -11,8 +11,8 @@ struct llama_cparams { uint32_t n_batch; uint32_t n_ubatch; uint32_t n_seq_max; - int n_threads; // number of threads to use for generation - int n_threads_batch; // number of threads to use for batch processing + int32_t n_threads; // number of threads to use for generation + int32_t n_threads_batch; // number of threads to use for batch processing float rope_freq_base; float rope_freq_scale; @@ -33,6 +33,7 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; + bool kv_unified; enum llama_pooling_type pooling_type; diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index a248a7ec223..b63a41053b4 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { + bool res = true; + + res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; @@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { + bool res = true; + + res &= pos->ne[0] == params.ubatch.n_tokens; + + return res; +} + void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -71,7 +88,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { const int64_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); - GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing int32_t * data = (int32_t *) pos_bucket->data; @@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) { + bool res = true; + + res &= n_outputs == params.n_outputs; + + return res; +} + void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = ubatch->n_tokens; @@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= self_kq_mask->ne[0] == mctx->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); @@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv(); + res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cross_kq_mask); @@ -306,7 +373,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { const int64_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer)); - GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing float * data = (float *) cross_kq_mask->data; @@ -340,6 +407,91 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { inp_rs->set_input(ubatch); } +// +// llm_graph_result +// + +llm_graph_result::llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) { + reset(); + + const char * LLAMA_GRAPH_RESULT_DEBUG = getenv("LLAMA_GRAPH_RESULT_DEBUG"); + debug = LLAMA_GRAPH_RESULT_DEBUG ? atoi(LLAMA_GRAPH_RESULT_DEBUG) : 0; +} + +int64_t llm_graph_result::get_max_nodes() const { + return max_nodes; +} + +void llm_graph_result::reset() { + t_tokens = nullptr; + t_logits = nullptr; + t_embd = nullptr; + t_embd_pooled = nullptr; + + params = {}; + + inputs.clear(); + + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_compute.reset(ggml_init(params)); + + gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false); +} + +void llm_graph_result::set_inputs(const llama_ubatch * ubatch) { + for (auto & input : inputs) { + input->set_input(ubatch); + } +} + +bool llm_graph_result::can_reuse(const llm_graph_params & params) { + if (!this->params.allow_reuse(params)) { + if (debug > 1) { + LLAMA_LOG_DEBUG("%s: cannot reuse graph due to incompatible graph parameters\n", __func__); + } + + return false; + } + + if (debug > 1) { + LLAMA_LOG_DEBUG("%s: checking compatibility of %d inputs:\n", __func__, (int) inputs.size()); + } + + bool res = true; + + for (auto & input : inputs) { + const bool cur = input->can_reuse(params); + + if (debug > 1) { + LLAMA_LOG_DEBUG("%s: can_reuse = %d\n", "placeholder", cur); + } + + res = res && cur; + } + + if (debug > 0) { + LLAMA_LOG_DEBUG("%s: can reuse graph = %d\n", __func__, res); + } + + return res; +} + +llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) { + inputs.emplace_back(std::move(input)); + return inputs.back().get(); +} + +void llm_graph_result::set_params(const llm_graph_params & params) { + this->params = params; +} + // // llm_graph_context // @@ -374,7 +526,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : n_ctx_orig (cparams.n_ctx_orig_yarn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx0 (params.ctx), sched (params.sched), backend_cpu (params.backend_cpu), cvec (params.cvec), @@ -382,7 +533,10 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : mctx (params.mctx), cross (params.cross), cb_func (params.cb), - res (std::make_unique()) { + res (params.res), + ctx0 (res->get_ctx()), + gf (res->get_gf()) { + res->set_params(params); } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -753,20 +907,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_weighted", il); } + ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr }; + + assert(n_expert_used > 0); + + // order the views before the adds + for (uint32_t i = 0; i < hparams.n_expert_used; ++i) { + cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]); + + ggml_build_forward_expand(gf, cur_experts[i]); + } + // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); + // note: here we explicitly use hparams.n_expert_used instead of n_expert_used + // to avoid potentially a large number of add nodes during warmup + // ref: https://github.com/ggml-org/llama.cpp/pull/14753 + ggml_tensor * moe_out = cur_experts[0]; - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - } + for (uint32_t i = 1; i < hparams.n_expert_used; ++i) { + moe_out = ggml_add(ctx0, moe_out, cur_experts[i]); } - if (n_expert_used == 1) { + if (hparams.n_expert_used == 1) { // avoid returning a non-contiguous tensor moe_out = ggml_cont(ctx0, moe_out); } @@ -972,7 +1134,6 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t } ggml_tensor * llm_graph_context::build_attn_mha( - ggml_cgraph * gf, ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -982,13 +1143,16 @@ ggml_tensor * llm_graph_context::build_attn_mha( float kq_scale) const { const bool v_trans = v->nb[1] > v->nb[2]; + // split the batch into streams if needed + const auto n_stream = k->ne[3]; + + q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream); + q = ggml_permute(ctx0, q, 0, 2, 1, 3); k = ggml_permute(ctx0, k, 0, 2, 1, 3); v = ggml_permute(ctx0, v, 0, 2, 1, 3); - const auto n_tokens = q->ne[1]; - const auto n_head = q->ne[2]; - const auto n_kv = k->ne[1]; + const auto n_kv = k->ne[1]; ggml_tensor * cur; @@ -1030,7 +1194,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( #endif } - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); } else { ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); @@ -1075,7 +1239,8 @@ ggml_tensor * llm_graph_context::build_attn_mha( cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + // recombine streams + cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); if (!cparams.offload_kqv) { // all nodes between the KV store and the attention output are run on the CPU @@ -1102,7 +1267,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_no_cache * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1122,11 +1286,15 @@ ggml_tensor * llm_graph_context::build_attn( const auto & kq_mask = inp->get_kq_mask(); + // [TAG_NO_CACHE_PAD] + // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams + assert(!ubatch.equal_seqs()); + ggml_tensor * q = q_cur; ggml_tensor * k = k_cur; ggml_tensor * v = v_cur; - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1156,13 +1324,14 @@ static std::unique_ptr build_attn_inp_kv_unifie { GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); - const auto n_kv = mctx_cur->get_n_kv(); + const auto n_kv = mctx_cur->get_n_kv(); const auto n_tokens = ubatch.n_tokens; + const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); + inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1181,7 +1350,6 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_kv_unified * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1214,7 +1382,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1234,7 +1402,6 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_kv_unified_iswa * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1281,7 +1448,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1314,7 +1481,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1336,7 +1502,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = k_cur; ggml_tensor * v = v_cur; - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1362,13 +1528,15 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif auto inp = std::make_unique(hparams, cparams, mctx_cur); + const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; + { const auto n_kv = mctx_cur->get_base()->get_n_kv(); inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); + inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1382,7 +1550,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1); + inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream); ggml_set_input(inp->self_kq_mask_swa); inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; @@ -1392,7 +1560,6 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif } ggml_tensor * llm_graph_context::build_rs( - ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, int32_t state_size, @@ -1450,21 +1617,19 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const { ggml_tensor * llm_graph_context::build_rs( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * s, int32_t state_size, int32_t n_seqs, const llm_graph_get_rows_fn & get_state_rows) const { const auto * kv_state = inp->mctx; - return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows); + return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows); } ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( llm_graph_input_rs * inp, - ggml_cgraph * gf, const llama_ubatch & ubatch, - int il) const { + int il) const { const auto * mctx_cur = static_cast(mctx); const auto token_shift_count = hparams.token_shift_count; @@ -1474,7 +1639,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( ggml_tensor * token_shift_all = mctx_cur->get_r_l(il); ggml_tensor * token_shift = build_rs( - inp, gf, token_shift_all, + inp, token_shift_all, hparams.n_embd_r(), n_seqs); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -1514,7 +1679,6 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { } void llm_graph_context::build_pooling( - ggml_cgraph * gf, ggml_tensor * cls, ggml_tensor * cls_b, ggml_tensor * cls_out, diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index fbf8e288956..a28a8c4bdda 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -1,6 +1,7 @@ #pragma once #include "llama-arch.h" +#include "llama-batch.h" #include "llama-hparams.h" #include "llama-adapter.h" @@ -14,7 +15,6 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; -struct llama_ubatch; struct llama_cparams; struct llama_memory_context_i; @@ -69,6 +69,8 @@ struct llama_cross { std::vector> seq_ids_enc; }; +struct llm_graph_params; + // // llm_graph_input // @@ -78,11 +80,19 @@ class llm_graph_input_i { virtual ~llm_graph_input_i() = default; virtual void set_input(const llama_ubatch * ubatch) = 0; + + // return true if the resulting input tensors using the provided graph parameters would be + // the same as the previous input tensors that we have currently stored in the object + virtual bool can_reuse(const llm_graph_params & params) { + // returning false here by default will prevent from reusing the graph if the check + // for the input type has not been implemented yet + GGML_UNUSED(params); + return false; + } }; using llm_graph_input_ptr = std::unique_ptr; - class llm_graph_input_embd : public llm_graph_input_i { public: llm_graph_input_embd() = default; @@ -90,6 +100,8 @@ class llm_graph_input_embd : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] }; @@ -101,6 +113,8 @@ class llm_graph_input_pos : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + ggml_tensor * pos = nullptr; // I32 [n_batch] const uint32_t n_pos_per_embd = 1; @@ -154,17 +168,19 @@ class llm_graph_input_out_ids : public llm_graph_input_i { llm_graph_input_out_ids( const llama_hparams & hparams, const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} virtual ~llm_graph_input_out_ids() = default; void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + ggml_tensor * out_ids; // I32 [n_outputs] const llama_hparams & hparams; const llama_cparams & cparams; - const int32_t n_outputs; + const uint32_t n_outputs; }; class llm_graph_input_mean : public llm_graph_input_i { @@ -249,16 +265,18 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] - ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] + ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1] + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] const llama_hparams & hparams; const llama_cparams & cparams; @@ -280,6 +298,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; } @@ -289,14 +309,14 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] - ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] + ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch] - ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] + ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1] - ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch, 1, 1] - ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch, 1, 1] + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] const llama_hparams & hparams; const llama_cparams & cparams; @@ -351,40 +371,108 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i { // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc. // these are used by the llama_context to extact the relevant data, based on the compute parameters -class llm_graph_result_i { -public: - virtual ~llm_graph_result_i() = default; +// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) +using llm_graph_cb = std::function; - virtual ggml_tensor * get_tokens() = 0; - virtual ggml_tensor * get_logits() = 0; - virtual ggml_tensor * get_embd() = 0; - virtual ggml_tensor * get_embd_pooled() = 0; +class llm_graph_result; - virtual void set_inputs(const llama_ubatch * ubatch) = 0; -}; +struct llm_graph_params { + llm_arch arch = LLM_ARCH_UNKNOWN; -using llm_graph_result_ptr = std::unique_ptr; + llama_hparams hparams; + llama_cparams cparams; + llama_ubatch ubatch; // note: intentionally make a copy -class llm_graph_result : public llm_graph_result_i { -public: - virtual ~llm_graph_result() = default; + llm_graph_type gtype; - ggml_tensor * get_tokens() override { return t_tokens; } - ggml_tensor * get_logits() override { return t_logits; } - ggml_tensor * get_embd() override { return t_embd; } - ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } + ggml_backend_sched_t sched; + ggml_backend_t backend_cpu; - void set_inputs(const llama_ubatch * ubatch) override { - for (auto & input : inputs) { - input->set_input(ubatch); + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_context_i * mctx; + const llama_cross * cross; + + uint32_t n_outputs; + + llm_graph_cb cb; + + llm_graph_result * res; + + // return true if the "other" params would result in a graph with the same topology as with the current params + // having the same topology allows us to reuse the graph in some cases + bool allow_reuse(const llm_graph_params & other) const { + // first check the ubatch + bool can_reuse_ubatch = + ubatch.equal_seqs() == other.ubatch.equal_seqs() && + ubatch.n_tokens == other.ubatch.n_tokens && + ubatch.n_seq_tokens == other.ubatch.n_seq_tokens && + ubatch.n_seqs == other.ubatch.n_seqs && + ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && + ( + (!ubatch.token && !other.ubatch.token) || + (!ubatch.embd && !other.ubatch.embd) + ); + + if (can_reuse_ubatch && !ubatch.equal_seqs()) { + if (!ubatch.data) { + // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and + // therefore we cannot perform the sequence id check. normally should never happen + can_reuse_ubatch = false; + } else { + for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { + can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s]; + } + } } - } - llm_graph_input_i * add_input(llm_graph_input_ptr input) { - inputs.emplace_back(std::move(input)); - return inputs.back().get(); + if (!can_reuse_ubatch) { + return false; + } + + return + cparams.embeddings == other.cparams.embeddings && + cparams.causal_attn == other.cparams.causal_attn && + arch == other.arch && + gtype == other.gtype && + cvec == other.cvec && + loras == other.loras && + cross == other.cross && + n_outputs == other.n_outputs; } +}; + +class llm_graph_result { +public: + llm_graph_result(int64_t max_nodes); + + virtual ~llm_graph_result() = default; + + ggml_tensor * get_tokens() const { return t_tokens; } + ggml_tensor * get_logits() const { return t_logits; } + ggml_tensor * get_embd() const { return t_embd; } + ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } + + ggml_cgraph * get_gf() const { return gf; } + ggml_context * get_ctx() const { return ctx_compute.get(); } + + int64_t get_max_nodes() const; + + void reset(); + + void set_inputs(const llama_ubatch * ubatch); + + // try to update the existing graph result using the new graph parameters in order to reuse it + // this can only be done if we determine that the resulting graph using the new graph parameters + // would be identical to the existing graph. in that case, we simply have to update the memory + // contexts of the input tensors of the graph and we can reuse it for another computation + // return true if the graph was updated and can be reused + bool can_reuse(const llm_graph_params & params); + + llm_graph_input_i * add_input(llm_graph_input_ptr input); + + void set_params(const llm_graph_params & params); // important graph nodes ggml_tensor * t_tokens = nullptr; @@ -393,36 +481,31 @@ class llm_graph_result : public llm_graph_result_i { ggml_tensor * t_embd_pooled = nullptr; std::vector inputs; -}; -// -// llm_graph_context -// + ggml_context_ptr ctx_compute; -// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) -using llm_graph_cb = std::function; + // memory buffers used to evaluate the model + std::vector buf_compute_meta; -struct llm_graph_params { - ggml_context * ctx; + ggml_cgraph * gf; - const llm_arch arch; + int64_t max_nodes; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; +private: + // keep a copy of the previous graph parameters + // we will use this to determine whether the graph can be reused by comparing them with the new parameters + // note: these are updated after constructing the new graph + llm_graph_params params; - ggml_backend_sched_t sched; - ggml_backend_t backend_cpu; - - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_context_i * mctx; - const llama_cross * cross; + // env: LLAMA_GRAPH_RESULT_DEBUG + int debug = 0; +}; - uint32_t n_outputs; +using llm_graph_result_ptr = std::unique_ptr; - const llm_graph_cb & cb; -}; +// +// llm_graph_context +// // used in build_rs to properly order writes and avoid unnecessary copies using llm_graph_get_rows_fn = std::function; @@ -463,8 +546,6 @@ struct llm_graph_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - ggml_context * ctx0 = nullptr; - ggml_backend_sched_t sched; ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? @@ -476,7 +557,10 @@ struct llm_graph_context { const llm_graph_cb & cb_func; - std::unique_ptr res; + llm_graph_result * res; + + ggml_context * ctx0 = nullptr; + ggml_cgraph * gf = nullptr; llm_graph_context(const llm_graph_params & params); virtual ~llm_graph_context() = default; @@ -562,7 +646,6 @@ struct llm_graph_context { // ggml_tensor * build_attn_mha( - ggml_cgraph * gf, ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false) @@ -575,7 +658,6 @@ struct llm_graph_context { ggml_tensor * build_attn( llm_graph_input_attn_no_cache * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -590,7 +672,6 @@ struct llm_graph_context { ggml_tensor * build_attn( llm_graph_input_attn_kv_unified * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -606,7 +687,6 @@ struct llm_graph_context { // note: if k_cur or v_cur are not provided, they will not be stored in the memory ggml_tensor * build_attn( llm_graph_input_attn_kv_unified_iswa * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -621,7 +701,6 @@ struct llm_graph_context { ggml_tensor * build_attn( llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -643,7 +722,6 @@ struct llm_graph_context { // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in // `llama_memory_recurrent` ggml_tensor * build_rs( - ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, int32_t state_size, @@ -658,7 +736,6 @@ struct llm_graph_context { ggml_tensor * build_rs( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * s, int32_t state_size, int32_t n_seqs, @@ -666,9 +743,8 @@ struct llm_graph_context { ggml_tensor * build_rwkv_token_shift_load( llm_graph_input_rs * inp, - ggml_cgraph * gf, const llama_ubatch & ubatch, - int il) const; + int il) const; ggml_tensor * build_rwkv_token_shift_store( ggml_tensor * token_shift, @@ -685,7 +761,6 @@ struct llm_graph_context { // void build_pooling( - ggml_cgraph * gf, ggml_tensor * cls, ggml_tensor * cls_b, ggml_tensor * cls_out, diff --git a/examples/talk-llama/llama-hparams.cpp b/examples/talk-llama/llama-hparams.cpp index 7aa736e2f39..c6c67d26f93 100644 --- a/examples/talk-llama/llama-hparams.cpp +++ b/examples/talk-llama/llama-hparams.cpp @@ -65,6 +65,46 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { return n_embd_head_v * n_head_kv; } +bool llama_hparams::is_n_embd_k_gqa_variable() const { + const uint32_t val = n_embd_k_gqa(); + for (uint32_t il = 0; il < n_layer; ++il) { + if (val != n_embd_k_gqa(il)) { + return true; + } + } + + return false; +} + +bool llama_hparams::is_n_embd_v_gqa_variable() const { + const uint32_t val = n_embd_v_gqa(); + for (uint32_t il = 0; il < n_layer; ++il) { + if (val != n_embd_v_gqa(il)) { + return true; + } + } + + return false; +} + +uint32_t llama_hparams::n_embd_k_gqa_max() const { + uint32_t val = n_embd_k_gqa(); + for (uint32_t il = 0; il < n_layer; ++il) { + val = std::max(val, n_embd_k_gqa(il)); + } + + return val; +} + +uint32_t llama_hparams::n_embd_v_gqa_max() const { + uint32_t val = n_embd_v_gqa(); + for (uint32_t il = 0; il < n_layer; ++il) { + val = std::max(val, n_embd_v_gqa(il)); + } + + return val; +} + uint32_t llama_hparams::n_embd_r() const { if (wkv_head_size != 0) { // for RWKV models diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index d0500e4d0fd..ec7fd6a42bf 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -6,7 +6,7 @@ // bump if necessary #define LLAMA_MAX_LAYERS 512 -#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3 +#define LLAMA_MAX_EXPERTS 384 // Kimi-K2 enum llama_expert_gating_func_type { LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0, @@ -98,7 +98,7 @@ struct llama_hparams { float rope_freq_scale_train; float rope_freq_scale_train_swa; uint32_t n_ctx_orig_yarn; - float rope_yarn_log_mul; + float rope_yarn_log_mul = 0.0f; std::array rope_sections; @@ -191,6 +191,14 @@ struct llama_hparams { // dimension of value embeddings across all k-v heads uint32_t n_embd_v_gqa(uint32_t il = 0) const; + // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa + bool is_n_embd_k_gqa_variable() const; + bool is_n_embd_v_gqa_variable() const; + + // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers + uint32_t n_embd_k_gqa_max() const; + uint32_t n_embd_v_gqa_max() const; + // dimension of the rolling state embeddings // corresponds to Mamba's conv_states size or RWKV's token_shift states size uint32_t n_embd_r() const; diff --git a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp index fe207ad5360..01d27fb4db9 100644 --- a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +++ b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp @@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( bool v_trans, bool offload, bool swa_full, + bool unified, uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, - uint32_t n_pad) : hparams(model.hparams) { + uint32_t n_pad) : hparams(model.hparams), unified(unified) { llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; const uint32_t size_base = kv_size; - uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad)); + uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad)); // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size if (swa_full) { @@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( kv_base = std::make_unique( model, std::move(filter_base), type_k, type_v, - v_trans, offload, size_base, n_seq_max, n_pad, + v_trans, offload, unified, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, std::move(filter_swa), type_k, type_v, - v_trans, offload, size_swa, n_seq_max, n_pad, + v_trans, offload, unified, size_swa, n_seq_max, n_pad, hparams.n_swa, hparams.swa_type); } @@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all // first try simple split do { + if (!unified) { + // requires equal splits, so we skip the simple split + break; + } + balloc.split_reset(); std::vector ubatches; @@ -140,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all std::vector ubatches; while (true) { - auto ubatch = balloc.split_equal(n_ubatch, false); + auto ubatch = balloc.split_equal(n_ubatch, !unified); if (ubatch.n_tokens == 0) { break; diff --git a/examples/talk-llama/llama-kv-cache-unified-iswa.h b/examples/talk-llama/llama-kv-cache-unified-iswa.h index 23205d826b2..d2650dadd35 100644 --- a/examples/talk-llama/llama-kv-cache-unified-iswa.h +++ b/examples/talk-llama/llama-kv-cache-unified-iswa.h @@ -20,6 +20,7 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { bool v_trans, bool offload, bool swa_full, + bool unified, uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, @@ -68,6 +69,8 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { private: const llama_hparams & hparams; + const bool unified; + std::unique_ptr kv_base; std::unique_ptr kv_swa; }; diff --git a/examples/talk-llama/llama-kv-cache-unified.cpp b/examples/talk-llama/llama-kv-cache-unified.cpp index d3129cc5328..321dc79fc36 100644 --- a/examples/talk-llama/llama-kv-cache-unified.cpp +++ b/examples/talk-llama/llama-kv-cache-unified.cpp @@ -23,13 +23,14 @@ llama_kv_cache_unified::llama_kv_cache_unified( ggml_type type_v, bool v_trans, bool offload, + bool unified, uint32_t kv_size, uint32_t n_seq_max, uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), - n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { + n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { GGML_ASSERT(kv_size % n_pad == 0); @@ -45,7 +46,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ size_t(2u*n_layer_cache*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -64,9 +65,33 @@ llama_kv_cache_unified::llama_kv_cache_unified( return it->second; }; - head = 0; + GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max); - cells.resize(kv_size); + v_heads.resize(n_stream); + for (uint32_t s = 0; s < n_stream; ++s) { + v_heads[s] = 0; + } + + v_cells.resize(n_stream); + for (uint32_t s = 0; s < n_stream; ++s) { + v_cells[s].resize(kv_size); + } + + // by default, all sequence ids are mapped to the 0th stream + seq_to_stream.resize(LLAMA_MAX_SEQ, 0); + + if (n_stream > 1) { + seq_to_stream.resize(n_stream, 0); + for (uint32_t s = 0; s < n_stream; ++s) { + seq_to_stream[s] = s; + } + } + + // [TAG_V_CACHE_VARIABLE] + if (v_trans && hparams.is_n_embd_v_gqa_variable()) { + LLAMA_LOG_WARN("%s: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to %d\n", + __func__, hparams.n_embd_v_gqa_max()); + } for (uint32_t il = 0; il < n_layer_cache; il++) { if (filter && !filter(il)) { @@ -74,8 +99,9 @@ llama_kv_cache_unified::llama_kv_cache_unified( continue; } - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + // [TAG_V_CACHE_VARIABLE] + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max(); const char * dev_name = "CPU"; @@ -98,14 +124,23 @@ llama_kv_cache_unified::llama_kv_cache_unified( ggml_tensor * k; ggml_tensor * v; - k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size); - v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size); + k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream); + v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream); ggml_format_name(k, "cache_k_l%d", il); ggml_format_name(v, "cache_v_l%d", il); + std::vector k_stream; + std::vector v_stream; + + for (uint32_t s = 0; s < n_stream; ++s) { + k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2])); + v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2])); + } + map_layer_ids[il] = layers.size(); - layers.push_back({ il, k, v }); + + layers.push_back({ il, k, v, k_stream, v_stream, }); } // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] @@ -148,8 +183,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( const size_t memory_size_k = size_k_bytes(); const size_t memory_size_v = size_v_bytes(); - LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, + LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream, ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } @@ -158,7 +193,12 @@ llama_kv_cache_unified::llama_kv_cache_unified( debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0; const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); - supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0; + supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0; + + if (!supports_set_rows) { + // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support"); + } if (!supports_set_rows) { LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__); @@ -166,9 +206,10 @@ llama_kv_cache_unified::llama_kv_cache_unified( } void llama_kv_cache_unified::clear(bool data) { - cells.reset(); - - head = 0; + for (uint32_t s = 0; s < n_stream; ++s) { + v_cells[s].reset(); + v_heads[s] = 0; + } if (data) { for (auto & buf : bufs) { @@ -178,6 +219,11 @@ void llama_kv_cache_unified::clear(bool data) { } bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; + uint32_t new_head = cells.size(); if (p0 < 0) { @@ -224,30 +270,94 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { + GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); + GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); + + const auto s0 = seq_to_stream[seq_id_src]; + const auto s1 = seq_to_stream[seq_id_dst]; + + if (s0 == s1) { + // since both sequences are in the same stream, no data copy is necessary + // we just have to update the cells meta data + + auto & cells = v_cells[s0]; + + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } + + if (cells.seq_has(i, seq_id_src)) { + cells.seq_add(i, seq_id_dst); + } + } + return; } - if (p0 < 0) { - p0 = 0; + // cross-stream sequence copies require to copy the actual buffer data + + bool is_full = true; + + if (p0 > 0 && p0 + 1 < (int) get_size()) { + is_full = false; } - if (p1 < 0) { - p1 = std::numeric_limits::max(); + if (p1 > 0 && p1 + 1 < (int) get_size()) { + is_full = false; } - for (uint32_t i = 0; i < cells.size(); ++i) { - if (!cells.pos_in(i, p0, p1)) { - continue; - } + GGML_ASSERT(is_full && "seq_cp() is only supported for full KV buffers"); + + // enqueue the copy operation - the buffer copy will be performed during the next update + sc_info.ssrc.push_back(s0); + sc_info.sdst.push_back(s1); - if (cells.seq_has(i, seq_id_src)) { - cells.seq_add(i, seq_id_dst); + v_cells[s1].reset(); + for (uint32_t i = 0; i < v_cells[s0].size(); ++i) { + if (v_cells[s0].seq_has(i, seq_id_src)) { + llama_pos pos = v_cells[s0].pos_get(i); + llama_pos shift = v_cells[s0].get_shift(i); + + if (shift != 0) { + pos -= shift; + assert(pos >= 0); + } + + v_cells[s1].pos_set(i, pos); + v_cells[s1].seq_add(i, seq_id_dst); + + if (shift != 0) { + v_cells[s1].pos_add(i, shift); + } } } + + v_heads[s1] = v_heads[s0]; + + //for (uint32_t s = 0; s < n_stream; ++s) { + // LLAMA_LOG_WARN("%s: seq %d: min = %d, max = %d\n", __func__, s, v_cells[s].seq_pos_min(s), v_cells[s].seq_pos_max(s)); + //} } void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; + uint32_t new_head = cells.size(); for (uint32_t i = 0; i < cells.size(); ++i) { @@ -265,6 +375,11 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { } void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; + if (shift == 0) { return; } @@ -304,6 +419,10 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po } void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + + auto & cells = v_cells[seq_to_stream[seq_id]]; + if (d == 1) { return; } @@ -333,10 +452,18 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po } llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + + const auto & cells = v_cells[seq_to_stream[seq_id]]; + return cells.seq_pos_min(seq_id); } llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + + const auto & cells = v_cells[seq_to_stream[seq_id]]; + return cells.seq_pos_max(seq_id); } @@ -351,7 +478,7 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch( std::vector ubatches; while (true) { - auto ubatch = balloc.split_simple(n_ubatch); + auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true); if (ubatch.n_tokens == 0) { break; @@ -387,7 +514,10 @@ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lct defrag_info dinfo; // see if we need to defrag - { + if (n_stream == 1) { + // note : for now do not consider defrag for n_stream > 1 + const auto & cells = v_cells[seq_to_stream[0]]; + bool do_defrag = optimize; const auto thold = lctx->get_cparams().defrag_thold; @@ -411,22 +541,22 @@ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lct } } - return std::make_unique(this, lctx, do_shift, std::move(dinfo)); + return std::make_unique(this, lctx, do_shift, std::move(dinfo), std::move(sc_info)); } llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector & ubatches) { llama_kv_cache_unified::slot_info_vec_t res; - struct state { - uint32_t head_old; // old position of the head, before placing the ubatch - + struct state_t { slot_info sinfo; // slot info for the ubatch - llama_kv_cells_unified cells; // copy of the old cells, before placing the ubatch + std::vector v_heads_old; // old positions of the heads, before placing the ubatch + + std::vector v_cells; // copy of the old cells, before placing the ubatch }; // remember the old state of the cells so we can restore it in the end - std::vector states; + std::vector states; bool success = true; @@ -445,16 +575,35 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st res.push_back(sinfo_new); // store the old state of the cells in the recovery stack - states.push_back({head, sinfo_new, cells.cp(sinfo_new.idxs)}); + { + state_t state = { sinfo_new, v_heads, {} }; + + for (uint32_t s = 0; s < sinfo_new.n_stream(); ++s) { + auto & cells = v_cells[sinfo_new.strm[s]]; + + state.v_cells.push_back(cells.cp(sinfo_new.idxs[s])); + } + + states.push_back(std::move(state)); + } // now emplace the ubatch apply_ubatch(sinfo_new, ubatch); } + GGML_ASSERT(!states.empty() || !success); + // iterate backwards and restore the cells to their original state for (auto it = states.rbegin(); it != states.rend(); ++it) { - cells.set(it->sinfo.idxs, it->cells); - head = it->head_old; + const auto & sinfo = it->sinfo; + + for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { + auto & cells = v_cells[sinfo.strm[s]]; + auto & head = v_heads[sinfo.strm[s]]; + + cells.set(sinfo.idxs[s], it->v_cells[s]); + head = it->v_heads_old[s]; + } } if (!success) { @@ -464,11 +613,38 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st return res; } -bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo) { +bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) { bool updated = false; auto * sched = lctx->get_sched(); + if (!sc_info.empty()) { + assert(n_stream > 1 && "stream copy should never happen with a single stream"); + + llama_synchronize(lctx); + + const size_t n_copy = sc_info.ssrc.size(); + + for (size_t i = 0; i < n_copy; ++i) { + const auto ssrc = sc_info.ssrc[i]; + const auto sdst = sc_info.sdst[i]; + + assert(ssrc < n_stream); + assert(sdst < n_stream); + + LLAMA_LOG_DEBUG("%s: copying KV buffer: stream %d to stream %d\n", __func__, ssrc, sdst); + + assert(ssrc != sdst); + + for (uint32_t il = 0; il < layers.size(); ++il) { + const auto & layer = layers[il]; + + ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]); + ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]); + } + } + } + if (do_shift) { if (!get_can_shift()) { GGML_ABORT("The current KV cache / model configuration does not support K-shift"); @@ -480,14 +656,11 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = lctx->get_gf_res_reserve(); - auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__); - return updated; - } + res->reset(); + auto * gf = build_graph_shift(res, lctx); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__); return updated; @@ -503,12 +676,20 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d updated = true; } - cells.reset_shift(); + for (uint32_t s = 0; s < n_stream; ++s) { + auto & cells = v_cells[s]; + + cells.reset_shift(); + } } if (!dinfo.empty()) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); + // note: for now do not consider defrag for n_stream > 1 + auto & cells = v_cells[seq_to_stream[0]]; + auto & head = v_heads[seq_to_stream[0]]; + // apply moves: { const auto n_kv = dinfo.ids.size(); @@ -529,14 +710,11 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = lctx->get_gf_res_reserve(); - auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__); - return updated; - } + res->reset(); + auto * gf = build_graph_defrag(res, lctx, dinfo); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); return updated; @@ -556,23 +734,13 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d } llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const { - const uint32_t n_tokens = ubatch.n_tokens; + if (debug > 0) { + const auto & cells = v_cells[seq_to_stream[1]]; - uint32_t head_cur = this->head; + const uint32_t head_cur = v_heads[1]; - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (head_cur > cells.get_used() + 2*ubatch.n_tokens) { - head_cur = 0; - } - - if (n_tokens > cells.size()) { - LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size()); - return { }; - } - - if (debug > 0) { - LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa); + LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", + __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa); if ((debug == 2 && n_swa > 0) || debug > 2) { std::string ss; @@ -629,86 +797,133 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ } } - uint32_t n_tested = 0; + uint32_t n_tokens = ubatch.n_tokens; + uint32_t n_seqs = 1; + + if (n_stream > 1) { + GGML_ASSERT(n_tokens % ubatch.n_seqs_unq == 0); - // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head - // for non-continuous slots, we test the tokens one by one - const uint32_t n_test = cont ? n_tokens : 1; + n_seqs = ubatch.n_seqs_unq; + n_tokens = n_tokens / n_seqs; + } - slot_info res; + slot_info res = { + /*.s0 =*/ LLAMA_MAX_SEQ, + /*.s1 =*/ 0, + /*.strm =*/ { }, + /*.idxs =*/ { }, + }; - auto & idxs = res.idxs; + res.resize(n_seqs); - idxs.reserve(n_tokens); + for (uint32_t s = 0; s < n_seqs; ++s) { + const auto seq_id = ubatch.seq_id_unq[s]; - while (true) { - if (head_cur + n_test > cells.size()) { - n_tested += cells.size() - head_cur; + if (n_stream > 1) { + GGML_ASSERT(ubatch.n_seq_id[s*n_tokens] == 1); + GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id); + } + + res.s0 = std::min(res.s0, seq_to_stream[seq_id]); + res.s1 = std::max(res.s1, seq_to_stream[seq_id]); + + res.strm[s] = seq_to_stream[seq_id]; + res.idxs[s].reserve(n_tokens); + + const auto & cells = v_cells[seq_to_stream[seq_id]]; + + uint32_t head_cur = v_heads[seq_to_stream[seq_id]]; + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head_cur > cells.get_used() + 2*n_tokens) { head_cur = 0; - continue; } - for (uint32_t i = 0; i < n_test; i++) { - const auto idx = head_cur; + if (n_tokens > cells.size()) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size()); + return { }; + } + + uint32_t n_tested = 0; + + // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head + // for non-continuous slots, we test the tokens one by one + const uint32_t n_test = cont ? n_tokens : 1; - //const llama_pos pos = ubatch.pos[i]; - //const llama_seq_id seq_id = ubatch.seq_id[i][0]; + while (true) { + if (head_cur + n_test > cells.size()) { + n_tested += cells.size() - head_cur; + head_cur = 0; + continue; + } - // can we use this cell? either: - // - the cell is empty - // - the cell is occupied only by one sequence: - // - (disabled) mask causally, if the sequence is the same as the one we are inserting - // - mask SWA, using current max pos for that sequence in the cache - // always insert in the cell with minimum pos - bool can_use = cells.is_empty(idx); + for (uint32_t i = 0; i < n_test; i++) { + const auto idx = head_cur; - if (!can_use && cells.seq_count(idx) == 1) { - const llama_pos pos_cell = cells.pos_get(idx); + head_cur++; + n_tested++; - // (disabled) causal mask - // note: it's better to purge any "future" tokens beforehand - //if (cells.seq_has(idx, seq_id)) { - // can_use = pos_cell >= pos; - //} + //const llama_pos pos = ubatch.pos[i]; + //const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (!can_use) { - const llama_seq_id seq_id_cell = cells.seq_get(idx); + // can we use this cell? either: + // - the cell is empty + // - the cell is occupied only by one sequence: + // - (disabled) mask causally, if the sequence is the same as the one we are inserting + // - mask SWA, using current max pos for that sequence in the cache + // always insert in the cell with minimum pos + bool can_use = cells.is_empty(idx); - // SWA mask - if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) { - can_use = true; + if (!can_use && cells.seq_count(idx) == 1) { + const llama_pos pos_cell = cells.pos_get(idx); + + // (disabled) causal mask + // note: it's better to purge any "future" tokens beforehand + //if (cells.seq_has(idx, seq_id)) { + // can_use = pos_cell >= pos; + //} + + if (!can_use) { + const llama_seq_id seq_id_cell = cells.seq_get(idx); + + // SWA mask + if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) { + can_use = true; + } } } - } - head_cur++; - n_tested++; + if (can_use) { + res.idxs[s].push_back(idx); + } else { + if (cont) { + break; + } + } + } - if (can_use) { - idxs.push_back(idx); - } else { + if (res.idxs[s].size() == n_tokens) { break; } - } - if (idxs.size() == n_tokens) { - break; - } + if (cont) { + res.idxs[s].clear(); + } - if (cont) { - idxs.clear(); + if (n_tested >= cells.size()) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return { }; + } } - if (n_tested >= cells.size()) { - //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + // we didn't find a suitable slot - return empty result + if (res.idxs[s].size() < n_tokens) { return { }; } } - // we didn't find a suitable slot - return empty result - if (idxs.size() < n_tokens) { - res.clear(); - } + assert(res.s1 >= res.s0); return res; } @@ -717,41 +932,51 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; - for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { seq_pos_max_rm[s] = -1; } - assert(ubatch.n_tokens == sinfo.idxs.size()); + assert(ubatch.n_tokens == sinfo.n_stream()*sinfo.size()); - for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { - const auto idx = sinfo.idxs.at(i); + for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { + for (uint32_t ii = 0; ii < sinfo.size(); ++ii) { + const uint32_t i = s*sinfo.size() + ii; - if (!cells.is_empty(idx)) { - assert(cells.seq_count(idx) == 1); + auto & cells = v_cells[sinfo.strm[s]]; - const llama_seq_id seq_id = cells.seq_get(idx); - const llama_pos pos = cells.pos_get(idx); + const auto idx = sinfo.idxs[s][ii]; - seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos); + if (!cells.is_empty(idx)) { + assert(cells.seq_count(idx) == 1); - cells.rm(idx); - } + const llama_seq_id seq_id = cells.seq_get(idx); + const llama_pos pos = cells.pos_get(idx); - cells.pos_set(idx, ubatch.pos[i]); + seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos); + + cells.rm(idx); + } - for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) { - cells.seq_add(idx, ubatch.seq_id[i][s]); + cells.pos_set(idx, ubatch.pos[i]); + + for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) { + cells.seq_add(idx, ubatch.seq_id[i][s]); + } } } // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence // will be present in the cache. so we have to purge any position which is less than those we would overwrite // ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092 - for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { + for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { if (seq_pos_max_rm[s] == -1) { continue; } + GGML_ASSERT(s < seq_to_stream.size()); + + auto & cells = v_cells[seq_to_stream[s]]; + if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) { LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n", __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s); @@ -761,7 +986,11 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u } // move the head at the end of the slot - head = sinfo.idxs.back() + 1; + for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { + auto & head = v_heads[sinfo.strm[s]]; + + head = sinfo.idxs[s].back() + 1; + } } bool llama_kv_cache_unified::get_can_shift() const { @@ -769,49 +998,91 @@ bool llama_kv_cache_unified::get_can_shift() const { } uint32_t llama_kv_cache_unified::get_size() const { + const auto & cells = v_cells[seq_to_stream[0]]; + return cells.size(); } +uint32_t llama_kv_cache_unified::get_n_stream() const { + return n_stream; +} + bool llama_kv_cache_unified::get_has_shift() const { - return cells.get_has_shift(); + bool result = false; + + for (uint32_t s = 0; s < n_stream; ++s) { + result |= v_cells[s].get_has_shift(); + } + + return result; } uint32_t llama_kv_cache_unified::get_n_kv() const { - return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))); + uint32_t result = 0; + + for (uint32_t s = 0; s < n_stream; ++s) { + const auto & cells = v_cells[s]; + + result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result); + } + + return result; } -ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const { +bool llama_kv_cache_unified::get_supports_set_rows() const { + return supports_set_rows; +} + +ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * k = layers[ikv].k; - return ggml_view_3d(ctx, k, - hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, + const uint64_t kv_size = get_size(); + const uint64_t n_embd_k_gqa = k->ne[0]; + + assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il)); + + const uint32_t ns = sinfo.s1 - sinfo.s0 + 1; + + return ggml_view_4d(ctx, k, + hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns, ggml_row_size(k->type, hparams.n_embd_head_k), - ggml_row_size(k->type, hparams.n_embd_k_gqa(il)), - 0); + ggml_row_size(k->type, n_embd_k_gqa), + ggml_row_size(k->type, n_embd_k_gqa*kv_size), + ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0); } -ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const { +ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * v = layers[ikv].v; + const uint64_t kv_size = get_size(); + const uint64_t n_embd_v_gqa = v->ne[0]; + + // [TAG_V_CACHE_VARIABLE] + assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il)); + + const uint32_t ns = sinfo.s1 - sinfo.s0 + 1; + if (!v_trans) { // note: v->nb[1] <= v->nb[2] - return ggml_view_3d(ctx, v, - hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, - ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1] - ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2] - 0); + return ggml_view_4d(ctx, v, + hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns, + ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1] + ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2] + ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3] + ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0); } // note: v->nb[1] > v->nb[2] - return ggml_view_3d(ctx, v, - n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, - ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1] - ggml_row_size(v->type, v->ne[1]), // v->nb[2] - 0); + return ggml_view_4d(ctx, v, + n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns, + ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1] + ggml_row_size(v->type, kv_size), // v->nb[2] + ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3] + ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0); } ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const { @@ -825,12 +1096,18 @@ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_ k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens); if (k_idxs && supports_set_rows) { + if (k->ne[2] > 1) { + k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]); + } + return ggml_set_rows(ctx, k, k_cur, k_idxs); } // TODO: fallback to old ggml_cpy() method for backwards compatibility // will be removed when ggml_set_rows() is adopted by all backends + GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS"); + ggml_tensor * k_view = ggml_view_1d(ctx, k, n_tokens*n_embd_k_gqa, ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head()); @@ -843,37 +1120,38 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_ auto * v = layers[ikv].v; - const int64_t n_embd_v_gqa = v->ne[0]; - const int64_t n_tokens = v_cur->ne[2]; + const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1]; + const int64_t n_tokens = v_cur->ne[2]; v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens); if (v_idxs && supports_set_rows) { if (!v_trans) { + if (v->ne[2] > 1) { + v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]); + } + return ggml_set_rows(ctx, v, v_cur, v_idxs); } - // the row becomes a single element - ggml_tensor * v_view = ggml_reshape_3d(ctx, v, 1, v->ne[1], v->ne[0]); + // [TAG_V_CACHE_VARIABLE] + if (n_embd_v_gqa < v->ne[0]) { + v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0); + } - // note: the V cache is transposed when not using flash attention - v_cur = ggml_permute(ctx, ggml_reshape_3d(ctx, v_cur, v_cur->ne[0], 1, v_cur->ne[1]), 2, 0, 1, 3); + // the row becomes a single element + ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]); - // note: we can be more explicit here at the cost of extra cont - // however, above we take advantage that a row of single element is always continuous regardless of the row stride - //v_cur = ggml_transpose(ctx, v_cur); - //v_cur = ggml_cont_3d(ctx, v_cur, 1, v_cur->ne[0], v_cur->ne[1]); + v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]); - // we broadcast the KV indices n_embd_v_gqa times - // v [1, n_kv, n_embd_v_gqa] - // v_cur [1, n_tokens, n_embd_v_gqa] - // v_idxs [n_tokens, 1, 1] return ggml_set_rows(ctx, v_view, v_cur, v_idxs); } // TODO: fallback to old ggml_cpy() method for backwards compatibility // will be removed when ggml_set_rows() is adopted by all backends + GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS"); + ggml_tensor * v_view = nullptr; if (!v_trans) { @@ -904,7 +1182,13 @@ ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, con ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { const uint32_t n_tokens = ubatch.n_tokens; - ggml_tensor * v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens); + ggml_tensor * v_idxs; + + if (!v_trans) { + v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens); + } else { + v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens*hparams.n_embd_v_gqa_max()); + } ggml_set_input(v_idxs); @@ -917,12 +1201,17 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba } const uint32_t n_tokens = ubatch->n_tokens; + GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); int64_t * data = (int64_t *) dst->data; - for (int64_t i = 0; i < n_tokens; ++i) { - data[i] = sinfo.idxs.at(i); + for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { + const int64_t offs = sinfo.strm[s]*get_size(); + + for (uint32_t i = 0; i < sinfo.size(); ++i) { + data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i]; + } } } @@ -932,12 +1221,48 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba } const uint32_t n_tokens = ubatch->n_tokens; + GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); int64_t * data = (int64_t *) dst->data; - for (int64_t i = 0; i < n_tokens; ++i) { - data[i] = sinfo.idxs.at(i); + if (!v_trans) { + for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { + const int64_t offs = sinfo.strm[s]*get_size(); + + for (uint32_t i = 0; i < sinfo.size(); ++i) { + data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i]; + } + } + } else { + // note: the V cache is transposed when not using flash attention + const int64_t kv_size = get_size(); + + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa_max(); + + for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { + const int64_t offs = sinfo.strm[s]*kv_size*n_embd_v_gqa; + + for (uint32_t i = 0; i < sinfo.size(); ++i) { + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + data[s*sinfo.size()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs[s][i]; + } + } + } + } +} + +void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + + int32_t * data = (int32_t *) dst->data; + + for (uint32_t s = 0; s < n_stream; ++s) { + const auto & cells = v_cells[s]; + + for (uint32_t i = 0; i < cells.size(); ++i) { + data[s*cells.size() + i] = cells.is_empty(i) ? 0 : cells.get_shift(i); + } } } @@ -947,7 +1272,16 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); float * data = (float *) dst->data; - const int64_t n_kv = dst->ne[0]; + const int64_t n_kv = dst->ne[0]; + const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch + + GGML_ASSERT(n_tokens%n_stream == 0); + + // n_tps == n_tokens_per_stream + const int64_t n_tps = n_tokens/n_stream; + const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD); + + std::fill(data, data + ggml_nelements(dst), -INFINITY); // Use only the previous KV cells of the correct sequence for each token of the ubatch. // It's assumed that if a token in the batch has multiple sequences, they are equivalent. @@ -961,70 +1295,57 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub // xxxxx----- // xxxxx----- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 + // TODO: optimize this section for (uint32_t h = 0; h < 1; ++h) { - for (uint32_t i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = ubatch->seq_id[i][0]; + for (uint32_t s = 0; s < n_stream; ++s) { + for (uint32_t ii = 0; ii < n_tps; ++ii) { + const uint32_t i = s*n_tps + ii; - const llama_pos p1 = ubatch->pos[i]; + const llama_seq_id seq_id = ubatch->seq_id[i][0]; - for (uint32_t j = 0; j < n_kv; ++j) { - float f = 0.0f; + const auto & cells = v_cells[seq_to_stream[seq_id]]; - bool masked = false; + const llama_pos p1 = ubatch->pos[i]; - if (cells.is_empty(j)) { - masked = true; - } else { - const llama_pos p0 = cells.pos_get(j); + const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii); + + for (uint32_t j = 0; j < n_kv; ++j) { + if (cells.is_empty(j)) { + continue; + } // mask the token if not the same sequence - masked = masked || (!cells.seq_has(j, seq_id)); + if (!cells.seq_has(j, seq_id)) { + continue; + } + + const llama_pos p0 = cells.pos_get(j); // mask future tokens - masked = masked || (causal_attn && p0 > p1); + if (causal_attn && p0 > p1) { + continue; + } // apply SWA if any - masked = masked || (is_masked_swa(p0, p1)); - - if (!masked && hparams.use_alibi) { - f = -std::abs(p0 - p1); + if (is_masked_swa(p0, p1)) { + continue; } - } - - if (masked) { - f = -INFINITY; - } - - data[h*(n_kv*n_tokens) + i*n_kv + j] = f; - } - } - // mask padded tokens - if (data) { - for (uint32_t i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (uint32_t j = 0; j < n_kv; ++j) { - data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f; } } } } } -void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - - int32_t * data = (int32_t *) dst->data; - - for (uint32_t i = 0; i < cells.size(); ++i) { - data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i); - } -} - void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { const int64_t n_tokens = ubatch->n_tokens; + GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams"); + const auto & cells = v_cells[0]; + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing int32_t * data = (int32_t *) dst->data; @@ -1129,7 +1450,7 @@ class llm_graph_input_k_shift : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * k_shift; // I32 [kv_size] + ggml_tensor * k_shift; // I32 [kv_size*n_stream] const llama_kv_cache_unified * kv_self; }; @@ -1142,20 +1463,20 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; auto inp = std::make_unique(this); - inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size()); + inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); ggml_set_input(inp->k_shift); + const auto & cparams = lctx->get_cparams(); + for (const auto & layer : layers) { const uint32_t il = layer.il; @@ -1169,7 +1490,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( ggml_tensor * k = ggml_view_3d(ctx, layer.k, - n_embd_head_k, n_head_kv, cells.size(), + n_embd_head_k, n_head_kv, get_size()*n_stream, ggml_row_size(layer.k->type, n_embd_head_k), ggml_row_size(layer.k->type, n_embd_k_gqa), 0); @@ -1181,18 +1502,24 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( res->add_input(std::move(inp)); - return res; + return gf; } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, - const defrag_info & dinfo) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, + const defrag_info & dinfo) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); + + GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag"); + + const auto & cells = v_cells[0]; const auto & ids = dinfo.ids; + const auto & cparams = lctx->get_cparams(); + #if 0 // CPU defrag // @@ -1329,10 +1656,14 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); #endif - return res; + return gf; } llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const { + GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag"); + + const auto & cells = v_cells[0]; + const uint32_t n_layer = layers.size(); const uint32_t n_kv = cells.used_max_p1(); @@ -1478,64 +1809,94 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const { } void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; + io.write(&n_stream, sizeof(n_stream)); - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = cells.size(); + for (uint32_t s = 0; s < n_stream; ++s) { + cell_ranges_t cr { s, {} }; - for (uint32_t i = 0; i < cells.size(); ++i) { - if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) { - ++cell_count; - if (cell_range_begin == cells.size()) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != cells.size()) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = cells.size(); + uint32_t cell_count = 0; + + const auto & cells = v_cells[s]; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = cells.size(); + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) { + ++cell_count; + if (cell_range_begin == cells.size()) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != cells.size()) { + cr.data.emplace_back(cell_range_begin, i); + cell_range_begin = cells.size(); + } } } - } - if (cell_range_begin != cells.size()) { - cell_ranges.emplace_back(cell_range_begin, cells.size()); - } + if (cell_range_begin != cells.size()) { + cr.data.emplace_back(cell_range_begin, cells.size()); + } - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cr.data) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); - io.write(&cell_count, sizeof(cell_count)); + io.write(&cell_count, sizeof(cell_count)); - state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges); + // skip empty streams + if (cell_count == 0) { + continue; + } + + state_write_meta(io, cr, seq_id); + state_write_data(io, cr); + } } void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { - uint32_t cell_count; - io.read_to(&cell_count, sizeof(cell_count)); + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); - bool res = true; - res = res && state_read_meta(io, cell_count, seq_id); - res = res && state_read_data(io, cell_count); + uint32_t n_stream_cur; + io.read_to(&n_stream_cur, sizeof(n_stream_cur)); + if (n_stream_cur != n_stream) { + throw std::runtime_error("n_stream mismatch"); + } + + for (uint32_t s = 0; s < n_stream; ++s) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + if (cell_count == 0) { + continue; + } - if (!res) { - if (seq_id == -1) { - clear(true); - } else { - seq_rm(seq_id, -1, -1); + const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id]; + + bool res = true; + res = res && state_read_meta(io, strm, cell_count, seq_id); + res = res && state_read_data(io, strm, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(true); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); } - throw std::runtime_error("failed to restore kv cache"); } } -void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { - for (const auto & range : cell_ranges) { +void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const { + const auto & cells = v_cells[cr.strm]; + + for (const auto & range : cr.data) { for (uint32_t i = range.first; i < range.second; ++i) { std::vector seq_ids; @@ -1560,7 +1921,9 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std:: } } -void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { +void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const { + const auto & cells = v_cells[cr.strm]; + const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = layers.size(); @@ -1576,19 +1939,21 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std:: const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + auto * k = layer.k_stream[cr.strm]; + // Write key type - const int32_t k_type_i = (int32_t)layer.k->type; + const int32_t k_type_i = (int32_t) k->type; io.write(&k_type_i, sizeof(k_type_i)); // Write row size of key - const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa); + const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa); io.write(&k_size_row, sizeof(k_size_row)); // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { + for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; - io.write_tensor(layer.k, range.first * k_size_row, buf_size); + io.write_tensor(k, range.first * k_size_row, buf_size); } } @@ -1598,19 +1963,21 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std:: const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + auto * v = layer.v_stream[cr.strm]; + // Write value type - const int32_t v_type_i = (int32_t)layer.v->type; + const int32_t v_type_i = (int32_t) v->type; io.write(&v_type_i, sizeof(v_type_i)); // Write row size of value - const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa); + const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa); io.write(&v_size_row, sizeof(v_size_row)); // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { + for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; - io.write_tensor(layer.v, range.first * v_size_row, buf_size); + io.write_tensor(v, range.first * v_size_row, buf_size); } } } else { @@ -1622,12 +1989,14 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std:: const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + auto * v = layer.v_stream[cr.strm]; + // Write value type - const int32_t v_type_i = (int32_t)layer.v->type; + const int32_t v_type_i = (int32_t) v->type; io.write(&v_type_i, sizeof(v_type_i)); // Write element size - const uint32_t v_size_el = ggml_type_size(layer.v->type); + const uint32_t v_size_el = ggml_type_size(v->type); io.write(&v_size_el, sizeof(v_size_el)); // Write GQA embedding size @@ -1636,27 +2005,31 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std:: // For each row, we get the element values of each cell for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { + for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; const size_t buf_size = range_size * v_size_el; - io.write_tensor(layer.v, src_offset, buf_size); + io.write_tensor(v, src_offset, buf_size); } } } } } -bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { +bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) { + auto & cells = v_cells[strm]; + auto & head = v_heads[strm]; + if (dest_seq_id != -1) { // single sequence - seq_rm(dest_seq_id, -1, -1); llama_batch_allocr balloc(hparams.n_pos_per_embd()); llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1); + ubatch.seq_id_unq[0] = dest_seq_id; + for (uint32_t i = 0; i < cell_count; ++i) { llama_pos pos; uint32_t n_seq_id; @@ -1693,6 +2066,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell // keep the head at the old position because we will read the KV data into it in state_read_data() head = head_cur; + LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id); + // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells GGML_ASSERT(head_cur + cell_count <= cells.size()); @@ -1738,7 +2113,10 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell return true; } -bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { +bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) { + auto & cells = v_cells[strm]; + auto & head = v_heads[strm]; + uint32_t v_trans; uint32_t n_layer; @@ -1766,10 +2144,12 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + auto * k = layer.k_stream[strm]; + // Read type of key int32_t k_type_i_ref; io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t) layer.k->type; + const int32_t k_type_i = (int32_t) k->type; if (k_type_i != k_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); return false; @@ -1778,7 +2158,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell // Read row size of key uint64_t k_size_row_ref; io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa); + const size_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa); if (k_size_row != k_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); return false; @@ -1786,7 +2166,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell if (cell_count) { // Read and set the keys for the whole cell range - ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); } } @@ -1796,10 +2176,12 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + auto * v = layer.v_stream[strm]; + // Read type of value int32_t v_type_i_ref; io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)layer.v->type; + const int32_t v_type_i = (int32_t) v->type; if (v_type_i != v_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1808,7 +2190,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell // Read row size of value uint64_t v_size_row_ref; io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa); + const size_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa); if (v_size_row != v_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); return false; @@ -1816,7 +2198,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell if (cell_count) { // Read and set the values for the whole cell range - ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); } } } else { @@ -1826,10 +2208,12 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + auto * v = layer.v_stream[strm]; + // Read type of value int32_t v_type_i_ref; io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)layer.v->type; + const int32_t v_type_i = (int32_t) v->type; if (v_type_i != v_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1838,7 +2222,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell // Read element size of value uint32_t v_size_el_ref; io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(layer.v->type); + const size_t v_size_el = ggml_type_size(v->type); if (v_size_el != v_size_el_ref) { LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); return false; @@ -1856,7 +2240,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell // For each row in the transposed matrix, read the values for the whole cell range for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { const size_t dst_offset = (head + j * cells.size()) * v_size_el; - ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); } } } @@ -1875,18 +2259,26 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) { n_kv = kv->get_size(); + const uint32_t n_stream = kv->get_n_stream(); + // create a dummy slot info - the actual data is irrelevant. we just need to build the graph sinfos.resize(1); - sinfos[0].idxs.resize(1); - sinfos[0].idxs[0] = 0; + sinfos[0].s0 = 0; + sinfos[0].s1 = n_stream - 1; + sinfos[0].idxs.resize(n_stream); + for (uint32_t s = 0; s < n_stream; ++s) { + sinfos[0].strm.push_back(s); + sinfos[0].idxs[s].resize(1, 0); + } } llama_kv_cache_unified_context::llama_kv_cache_unified_context( llama_kv_cache_unified * kv, llama_context * lctx, bool do_shift, - defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) { - if (!do_shift && this->dinfo.empty()) { + defrag_info dinfo, + stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) { + if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) { status = LLAMA_MEMORY_STATUS_NO_UPDATE; } } @@ -1914,7 +2306,7 @@ bool llama_kv_cache_unified_context::apply() { // no ubatches -> this is a KV cache update if (ubatches.empty()) { - kv->update(lctx, do_shift, dinfo); + kv->update(lctx, do_shift, dinfo, sc_info); return true; } @@ -1940,12 +2332,16 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const { return n_kv; } +bool llama_kv_cache_unified_context::get_supports_set_rows() const { + return kv->get_supports_set_rows(); +} + ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { - return kv->get_k(ctx, il, n_kv); + return kv->get_k(ctx, il, n_kv, sinfos[i_cur]); } ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const { - return kv->get_v(ctx, il, n_kv); + return kv->get_v(ctx, il, n_kv, sinfos[i_cur]); } ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { diff --git a/examples/talk-llama/llama-kv-cache-unified.h b/examples/talk-llama/llama-kv-cache-unified.h index b8b0356e830..3e28e346c3f 100644 --- a/examples/talk-llama/llama-kv-cache-unified.h +++ b/examples/talk-llama/llama-kv-cache-unified.h @@ -35,16 +35,50 @@ class llama_kv_cache_unified : public llama_memory_i { std::vector ids; }; + struct stream_copy_info { + bool empty() const { + assert(ssrc.size() == sdst.size()); + return ssrc.empty(); + } + + std::vector ssrc; + std::vector sdst; + }; + // for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the // KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]] struct slot_info { // data for ggml_set_rows using idx_vec_t = std::vector; - idx_vec_t idxs; + // number of streams: ns = s1 - s0 + 1 + llama_seq_id s0; + llama_seq_id s1; + + std::vector strm; // [ns] + std::vector idxs; // [ns] uint32_t head() const { - return idxs.at(0); + GGML_ASSERT(idxs.size() == 1); + GGML_ASSERT(!idxs[0].empty()); + + return idxs[0][0]; + } + + void resize(size_t n) { + strm.resize(n); + idxs.resize(n); + } + + size_t size() const { + GGML_ASSERT(idxs.size() == strm.size()); + GGML_ASSERT(!idxs.empty()); + + return idxs[0].size(); + } + + size_t n_stream() const { + return strm.size(); } bool empty() const { @@ -54,9 +88,6 @@ class llama_kv_cache_unified : public llama_memory_i { void clear() { idxs.clear(); } - - // TODO: implement - //std::vector seq_idxs; }; using slot_info_vec_t = std::vector; @@ -68,6 +99,7 @@ class llama_kv_cache_unified : public llama_memory_i { ggml_type type_v, bool v_trans, bool offload, + bool unified, uint32_t kv_size, uint32_t n_seq_max, uint32_t n_pad, @@ -111,7 +143,8 @@ class llama_kv_cache_unified : public llama_memory_i { // llama_kv_cache_unified specific API // - uint32_t get_size() const; + uint32_t get_size() const; + uint32_t get_n_stream() const; bool get_has_shift() const; @@ -121,9 +154,12 @@ class llama_kv_cache_unified : public llama_memory_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache - ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const; - ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const; + ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; + ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; // store k_cur and v_cur in the cache based on the provided head location ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const; @@ -137,7 +173,7 @@ class llama_kv_cache_unified : public llama_memory_i { // return empty vector on failure slot_info_vec_t prepare(const std::vector & ubatches); - bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo); + bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info); // find a slot of kv cells that can hold the ubatch // if cont == true, then the slot must be continuous @@ -157,8 +193,9 @@ class llama_kv_cache_unified : public llama_memory_i { void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const; void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const; + void set_input_k_shift(ggml_tensor * dst) const; + void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; - void set_input_k_shift (ggml_tensor * dst) const; void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; private: @@ -172,15 +209,15 @@ class llama_kv_cache_unified : public llama_memory_i { ggml_tensor * k; ggml_tensor * v; + + std::vector k_stream; + std::vector v_stream; }; bool v_trans = true; // the value tensor is transposed - // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot()) - // note: this is not part of the KV state and it's only used to speed-up the find_slot() method - uint32_t head = 0; - const uint32_t n_seq_max = 1; + const uint32_t n_stream = 1; // required padding const uint32_t n_pad = 1; @@ -193,14 +230,24 @@ class llama_kv_cache_unified : public llama_memory_i { // env: LLAMA_SET_ROWS (temporary) // ref: https://github.com/ggml-org/llama.cpp/pull/14285 - int supports_set_rows = false; + bool supports_set_rows = false; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; std::vector bufs; - llama_kv_cells_unified cells; + // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot()) + // note: this is not part of the KV state and it's only used to speed-up the find_slot() method + std::vector v_heads; + + std::vector v_cells; + + // maps from a sequence id to a stream id + std::vector seq_to_stream; + + // pending stream copies that will be applied during the next update + stream_copy_info sc_info; std::vector layers; @@ -226,29 +273,34 @@ class llama_kv_cache_unified : public llama_memory_i { float freq_base, float freq_scale) const; - llm_graph_result_ptr build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const; + ggml_cgraph * build_graph_shift( + llm_graph_result * res, + llama_context * lctx) const; - llm_graph_result_ptr build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, + ggml_cgraph * build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, const defrag_info & dinfo) const; - void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; + struct cell_ranges_t { + uint32_t strm; - bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, uint32_t cell_count); + std::vector> data; // ranges, from inclusive, to exclusive + }; + + void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const; + void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const; + + bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count); }; class llama_kv_cache_unified_context : public llama_memory_context_i { public: // some shorthands - using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; - using defrag_info = llama_kv_cache_unified::defrag_info; + using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; + using defrag_info = llama_kv_cache_unified::defrag_info; + using stream_copy_info = llama_kv_cache_unified::stream_copy_info; // used for errors llama_kv_cache_unified_context(llama_memory_status status); @@ -262,7 +314,8 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { llama_kv_cache_unified * kv, llama_context * lctx, bool do_shift, - defrag_info dinfo); + defrag_info dinfo, + stream_copy_info sc_info); // used to create a batch procesing context from a batch llama_kv_cache_unified_context( @@ -288,6 +341,9 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; @@ -320,6 +376,8 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { defrag_info dinfo; + stream_copy_info sc_info; + // // batch processing context // diff --git a/examples/talk-llama/llama-memory-hybrid.cpp b/examples/talk-llama/llama-memory-hybrid.cpp index 6cd10db06b7..d8e2086c875 100644 --- a/examples/talk-llama/llama-memory-hybrid.cpp +++ b/examples/talk-llama/llama-memory-hybrid.cpp @@ -38,6 +38,7 @@ llama_memory_hybrid::llama_memory_hybrid( type_v, v_trans, offload, + 1, kv_size, n_seq_max, n_pad, diff --git a/examples/talk-llama/llama-memory-recurrent.cpp b/examples/talk-llama/llama-memory-recurrent.cpp index 2c1ae67098c..c0c2ec084dc 100644 --- a/examples/talk-llama/llama-memory-recurrent.cpp +++ b/examples/talk-llama/llama-memory-recurrent.cpp @@ -446,7 +446,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) { // A slot should be always be contiguous. // can only process batches with an equal number of new tokens in each sequence - GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.equal_seqs()); int32_t min = size - 1; int32_t max = 0; @@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // Iterate and write all the keys first, each row is a cell // Get whole range at a time for (uint32_t il = 0; il < n_layer; ++il) { + // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) + if (r_l[il] == nullptr) continue; // Write key type const int32_t r_type_i = (int32_t)r_l[il]->type; @@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: if (!s_trans) { for (uint32_t il = 0; il < n_layer; ++il) { + // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) + if (s_l[il] == nullptr) continue; // Write value type const int32_t s_type_i = (int32_t)s_l[il]->type; @@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // When v is transposed, we also need the element size and get the element ranges from each row const uint32_t mem_size = size; for (uint32_t il = 0; il < n_layer; ++il) { + // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) + if (s_l[il] == nullptr) continue; + const uint32_t n_embd_s = hparams.n_embd_s(); // Write value type @@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block for (uint32_t il = 0; il < n_layer; ++il) { + // skip null layers + if (r_l[il] == nullptr) continue; // Read type of key int32_t r_type_i_ref; @@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell if (!s_trans) { for (uint32_t il = 0; il < n_layer; ++il) { + // skip null layers + if (s_l[il] == nullptr) continue; // Read type of value int32_t s_type_i_ref; io.read_to(&s_type_i_ref, sizeof(s_type_i_ref)); const int32_t s_type_i = (int32_t)s_l[il]->type; + if (s_type_i != s_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il); return false; @@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell } else { // For each layer, read the values for each cell (transposed) for (uint32_t il = 0; il < n_layer; ++il) { + // skip null layers + if (s_l[il] == nullptr) continue; + const uint32_t n_embd_s = hparams.n_embd_s(); // Read type of value diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index a322fc39352..71f89e19072 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -107,8 +107,10 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; case LLM_TYPE_A13B: return "A13B"; + case LLM_TYPE_21B_A3B: return "21B.A3B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; + case LLM_TYPE_300B_A47B: return "300B.A47B"; case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E4B: return "E4B"; default: return "?B"; @@ -644,6 +646,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + // MiniCPM uses rope by default, unlike Granite which uses it as a switch + hparams.rope_finetuned = true; + switch (hparams.n_layer) { case 52: type = LLM_TYPE_1B; break; case 40: type = LLM_TYPE_2B; break; @@ -849,6 +854,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_DREAM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // Dream models are primarily 7B with 28 layers + switch (hparams.n_layer) { + case 28: + type = LLM_TYPE_7B; + break; + default: + type = LLM_TYPE_UNKNOWN; + } + // Set non-causal attention for diffusion models + hparams.causal_attn = false; + } + break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -935,6 +955,33 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_PLAMO2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // Load Mamba SSM parameters + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + } + + switch (hparams.n_layer) { + case 16: type = LLM_TYPE_1B; break; + case 32: + if (hparams.n_embd == 2048) { + type = LLM_TYPE_2B; + } else if (hparams.n_embd == 4096) { + type = LLM_TYPE_8B; + } + break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_GPT2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -1322,7 +1369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // that have no expert_gating_func model parameter set hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); switch (hparams.n_layer) { case 27: type = LLM_TYPE_16B; break; @@ -1446,6 +1493,23 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_EXAONE4: + { + if (hparams.n_layer == 64) { // 32B + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.n_swa = 4096; + hparams.set_swa_pattern(4); + } + + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 30: type = LLM_TYPE_1_2B; break; + case 64: type = LLM_TYPE_32B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: { @@ -1483,7 +1547,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); switch (hparams.n_layer) { - case 12: type = LLM_TYPE_190M; break; + case 12: + switch (hparams.n_embd) { + case 768: type = LLM_TYPE_190M; break; + default: type = LLM_TYPE_UNKNOWN; + } break; case 24: switch (hparams.n_embd) { case 1024: type = LLM_TYPE_450M; break; @@ -1496,7 +1564,17 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 3584: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; } break; - case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World + case 32: + switch (hparams.n_embd) { + case 2560: type = LLM_TYPE_2_9B; break; + case 4096: type = LLM_TYPE_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } break; + case 61: + switch (hparams.n_embd) { + case 4096: type = LLM_TYPE_14B; break; + default: type = LLM_TYPE_UNKNOWN; + } break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -1607,10 +1685,20 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_ERNIE4_5: + case LLM_ARCH_ERNIE4_5_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + if (arch == LLM_ARCH_ERNIE4_5_MOE) { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + } + switch (hparams.n_layer) { case 18: type = LLM_TYPE_0_3B; break; + case 28: type = LLM_TYPE_21B_A3B; break; + case 54: type = LLM_TYPE_300B_A47B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -2643,12 +2731,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2VL: + case LLM_ARCH_DREAM: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (output == NULL) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); @@ -2938,6 +3028,73 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_PLAMO2: + { + const uint32_t d_conv = hparams.ssm_d_conv; + const uint32_t d_state = hparams.ssm_d_state; + const uint32_t num_heads = hparams.ssm_dt_rank; + const uint32_t intermediate_size = hparams.ssm_d_inner; + const uint32_t head_dim = intermediate_size / num_heads; + const uint32_t qk_dim = head_dim; + const uint32_t v_dim = head_dim; + const int64_t num_attention_heads = hparams.n_head(); + const int64_t q_num_heads = num_attention_heads; + const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16)); + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + bool is_mamba_layer = hparams.is_recurrent(i); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (is_mamba_layer) { + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0); + + layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0); + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0); + + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0); + + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0); + + layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0); + layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0); + layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0); + } else { + const int64_t num_key_value_heads = hparams.n_head_kv(i); + const int64_t k_num_heads = num_key_value_heads; + const int64_t v_num_heads = num_key_value_heads; + const int64_t q_proj_dim = q_num_heads * qk_dim; + const int64_t k_proj_dim = k_num_heads * qk_dim; + const int64_t v_proj_dim = v_num_heads * v_dim; + + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0); + } + + // All layers have post-attention norm, FFN norm, and FFN tensors + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0); + } + } break; case LLM_ARCH_GPT2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4232,6 +4389,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_EXAONE4: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_RWKV6: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4747,6 +4937,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; case LLM_ARCH_ERNIE4_5: + case LLM_ARCH_ERNIE4_5_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4775,9 +4966,27 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast(i) >= hparams.n_layer_dense_lead) { // MoE layers + int n_ff_exp = hparams.n_ff_exp; + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + + // Shared expert (if present) + if (hparams.n_ff_shexp > 0) { + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0); + } + } else { // Dense layers + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } } } break; case LLM_ARCH_FALCON_H1: @@ -5209,6 +5418,7 @@ void llama_model::print_info() const { arch == LLM_ARCH_MAMBA2 || arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_FALCON_H1 || + arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_GRANITE_HYBRID) { LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); @@ -5381,7 +5591,7 @@ ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int i } struct llm_build_llama : public llm_graph_context { - llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5457,7 +5667,7 @@ struct llm_build_llama : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -5537,7 +5747,7 @@ struct llm_build_llama : public llm_graph_context { }; struct llm_build_llama_iswa : public llm_graph_context { - llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5631,7 +5841,7 @@ struct llm_build_llama_iswa : public llm_graph_context { cb(Kcur, "Kcur_normed", il); } - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -5720,7 +5930,7 @@ struct llm_build_llama_iswa : public llm_graph_context { }; struct llm_build_deci : public llm_graph_context { - llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5808,7 +6018,7 @@ struct llm_build_deci : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } @@ -5876,7 +6086,7 @@ struct llm_build_deci : public llm_graph_context { }; struct llm_build_baichuan : public llm_graph_context { - llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5940,7 +6150,7 @@ struct llm_build_baichuan : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5998,7 +6208,7 @@ struct llm_build_baichuan : public llm_graph_context { }; struct llm_build_xverse : public llm_graph_context { - llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6055,7 +6265,7 @@ struct llm_build_xverse : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6111,7 +6321,7 @@ struct llm_build_xverse : public llm_graph_context { }; struct llm_build_falcon : public llm_graph_context { - llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6178,7 +6388,7 @@ struct llm_build_falcon : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6233,7 +6443,7 @@ struct llm_build_falcon : public llm_graph_context { }; struct llm_build_grok : public llm_graph_context { - llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6308,7 +6518,7 @@ struct llm_build_grok : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -6395,7 +6605,7 @@ struct llm_build_grok : public llm_graph_context { }; struct llm_build_dbrx : public llm_graph_context { - llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6457,7 +6667,7 @@ struct llm_build_dbrx : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6520,7 +6730,7 @@ struct llm_build_dbrx : public llm_graph_context { }; struct llm_build_starcoder : public llm_graph_context { - llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6571,7 +6781,7 @@ struct llm_build_starcoder : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6629,7 +6839,7 @@ struct llm_build_starcoder : public llm_graph_context { }; struct llm_build_refact : public llm_graph_context { - llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6670,7 +6880,7 @@ struct llm_build_refact : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6728,7 +6938,7 @@ struct llm_build_refact : public llm_graph_context { }; struct llm_build_bert : public llm_graph_context { - llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6827,7 +7037,7 @@ struct llm_build_bert : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); @@ -6914,7 +7124,7 @@ struct llm_build_bert : public llm_graph_context { }; struct llm_build_neo_bert : public llm_graph_context { - llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6972,7 +7182,7 @@ struct llm_build_neo_bert : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); @@ -7024,7 +7234,7 @@ struct llm_build_neo_bert : public llm_graph_context { }; struct llm_build_bloom : public llm_graph_context { - llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -7072,7 +7282,7 @@ struct llm_build_bloom : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7130,7 +7340,7 @@ struct llm_build_bloom : public llm_graph_context { }; struct llm_build_mpt : public llm_graph_context { - llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -7219,7 +7429,7 @@ struct llm_build_mpt : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7278,7 +7488,7 @@ struct llm_build_mpt : public llm_graph_context { }; struct llm_build_stablelm : public llm_graph_context { - llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7365,7 +7575,7 @@ struct llm_build_stablelm : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7430,7 +7640,7 @@ struct llm_build_stablelm : public llm_graph_context { }; struct llm_build_qwen : public llm_graph_context { - llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7486,7 +7696,7 @@ struct llm_build_qwen : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7544,7 +7754,7 @@ struct llm_build_qwen : public llm_graph_context { }; struct llm_build_qwen2 : public llm_graph_context { - llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7606,7 +7816,7 @@ struct llm_build_qwen2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7654,6 +7864,113 @@ struct llm_build_qwen2 : public llm_graph_context { // lm_head cur = build_lora_mm(model.output, cur); + if (model.output_b != nullptr) { + cur = ggml_add(ctx0, cur, model.output_b); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +struct llm_build_dream : public llm_graph_context { + llm_build_dream(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + //copied from qwen2 + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, + nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); res->t_logits = cur; @@ -7662,7 +7979,7 @@ struct llm_build_qwen2 : public llm_graph_context { }; struct llm_build_qwen2vl : public llm_graph_context { - llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7727,7 +8044,7 @@ struct llm_build_qwen2vl : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7783,7 +8100,7 @@ struct llm_build_qwen2vl : public llm_graph_context { }; struct llm_build_qwen2moe : public llm_graph_context { - llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7854,7 +8171,7 @@ struct llm_build_qwen2moe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7942,7 +8259,7 @@ struct llm_build_qwen2moe : public llm_graph_context { }; struct llm_build_qwen3 : public llm_graph_context { - llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8007,7 +8324,7 @@ struct llm_build_qwen3 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8063,7 +8380,7 @@ struct llm_build_qwen3 : public llm_graph_context { }; struct llm_build_qwen3moe : public llm_graph_context { - llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8128,7 +8445,7 @@ struct llm_build_qwen3moe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8191,7 +8508,7 @@ struct llm_build_qwen3moe : public llm_graph_context { }; struct llm_build_phi2 : public llm_graph_context { - llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8268,7 +8585,7 @@ struct llm_build_phi2 : public llm_graph_context { // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -8322,7 +8639,7 @@ struct llm_build_phi2 : public llm_graph_context { template struct llm_build_phi3 : public llm_graph_context { - llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8405,7 +8722,7 @@ struct llm_build_phi3 : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -8480,7 +8797,7 @@ struct llm_build_phi3 : public llm_graph_context { }; struct llm_build_plamo : public llm_graph_context { - llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8539,7 +8856,7 @@ struct llm_build_plamo : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8595,7 +8912,7 @@ struct llm_build_plamo : public llm_graph_context { }; struct llm_build_gpt2 : public llm_graph_context { - llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8647,7 +8964,7 @@ struct llm_build_gpt2 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8705,7 +9022,7 @@ struct llm_build_gpt2 : public llm_graph_context { }; struct llm_build_codeshell : public llm_graph_context { - llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8761,7 +9078,7 @@ struct llm_build_codeshell : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8819,7 +9136,7 @@ struct llm_build_codeshell : public llm_graph_context { }; struct llm_build_orion : public llm_graph_context { - llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8890,7 +9207,7 @@ struct llm_build_orion : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8946,7 +9263,7 @@ struct llm_build_orion : public llm_graph_context { }; struct llm_build_internlm2 : public llm_graph_context { - llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9017,7 +9334,7 @@ struct llm_build_internlm2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9073,7 +9390,7 @@ struct llm_build_internlm2 : public llm_graph_context { }; struct llm_build_minicpm3 : public llm_graph_context { - llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { //TODO: if the model varies, these parameters need to be read from the model const int64_t n_embd_base = 256; const float scale_embd = 12.0f; @@ -9205,7 +9522,7 @@ struct llm_build_minicpm3 : public llm_graph_context { ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } @@ -9277,7 +9594,7 @@ struct llm_build_minicpm3 : public llm_graph_context { }; struct llm_build_gemma : public llm_graph_context { - llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; ggml_tensor * cur; @@ -9335,7 +9652,7 @@ struct llm_build_gemma : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); cb(Qcur, "Qcur_scaled", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -9393,7 +9710,7 @@ struct llm_build_gemma : public llm_graph_context { }; struct llm_build_gemma2_iswa : public llm_graph_context { - llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k; ggml_tensor * cur; @@ -9450,7 +9767,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -9523,7 +9840,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { }; struct llm_build_gemma3_iswa : public llm_graph_context { - llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k; ggml_tensor * cur; @@ -9592,7 +9909,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315 Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -9661,7 +9978,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context { struct llm_build_gemma3n_iswa : public llm_graph_context { const llama_model & model; - ggml_cgraph * gf; const int64_t n_embd_head; const int64_t n_embd_altup; @@ -9671,10 +9987,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { const int n_layer_sparsity = 10; // number of layers using activation sparsity const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) - llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) + llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model), - gf(gf), n_embd_head(model.hparams.n_embd_head_k), n_embd_altup(model.hparams.n_embd_altup), n_altup(model.hparams.n_altup), @@ -9775,7 +10090,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(Qcur, "Qcur_pos", il); cb(Kcur, "Kcur_pos", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); } else { @@ -9793,7 +10108,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur_pos", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); } @@ -10087,7 +10402,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { // TODO: move up next to build_starcoder struct llm_build_starcoder2 : public llm_graph_context { - llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10158,7 +10473,7 @@ struct llm_build_starcoder2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10219,7 +10534,6 @@ struct llm_graph_context_mamba : public llm_graph_context { ggml_tensor * build_mamba_layer( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, @@ -10244,13 +10558,13 @@ struct llm_graph_context_mamba : public llm_graph_context { const int64_t n_seq_tokens = ubatch.n_seq_tokens; GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} @@ -10331,7 +10645,7 @@ struct llm_graph_context_mamba : public llm_graph_context { return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); }; - ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); // store last states ggml_build_forward_expand(gf, @@ -10358,11 +10672,10 @@ struct llm_graph_context_mamba : public llm_graph_context { ggml_tensor * build_mamba2_layer( llm_graph_input_rs * inp, - ggml_cgraph * gf, - ggml_tensor * cur, - const llama_model & model, - const llama_ubatch & ubatch, - int il) const { + ggml_tensor * cur, + const llama_model & model, + const llama_ubatch & ubatch, + int il) const { const auto * mctx_cur = inp->mctx; @@ -10379,13 +10692,13 @@ struct llm_graph_context_mamba : public llm_graph_context { const int64_t n_seq_tokens = ubatch.n_seq_tokens; GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} @@ -10455,7 +10768,7 @@ struct llm_graph_context_mamba : public llm_graph_context { return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); }; - ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); // store last states ggml_build_forward_expand(gf, @@ -10491,7 +10804,7 @@ struct llm_graph_context_mamba : public llm_graph_context { }; struct llm_build_mamba : public llm_graph_context_mamba { - llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -10510,9 +10823,9 @@ struct llm_build_mamba : public llm_graph_context_mamba { cb(cur, "attn_norm", il); if (model.arch == LLM_ARCH_MAMBA2) { - cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il); + cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il); } else { - cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il); + cur = build_mamba_layer(rs_inp, cur, model, ubatch, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10548,7 +10861,7 @@ struct llm_build_mamba : public llm_graph_context_mamba { }; struct llm_build_jamba : public llm_graph_context_mamba { - llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { const int64_t n_embd_head = hparams.n_embd_head_v; ggml_tensor * cur; @@ -10568,7 +10881,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { cb(cur, "attn_norm", il); if (n_head_kv == 0) { - cur = build_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il); + cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il); } else { // Attention @@ -10589,7 +10902,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { cb(Vcur, "Vcur", il); // No RoPE :) - cur = build_attn(inp_hybrid->get_attn(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); + cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10657,7 +10970,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { }; struct llm_build_command_r : public llm_graph_context { - llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10745,7 +11058,7 @@ struct llm_build_command_r : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10804,7 +11117,7 @@ struct llm_build_command_r : public llm_graph_context { }; struct llm_build_cohere2_iswa : public llm_graph_context { - llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10880,7 +11193,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10940,7 +11253,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { // * removed bias // * removed MoE struct llm_build_olmo : public llm_graph_context { - llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11011,7 +11324,7 @@ struct llm_build_olmo : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11068,7 +11381,7 @@ struct llm_build_olmo : public llm_graph_context { }; struct llm_build_olmo2 : public llm_graph_context { - llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11131,7 +11444,7 @@ struct llm_build_olmo2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11197,7 +11510,7 @@ struct llm_build_olmo2 : public llm_graph_context { // * removed bias // * added q, k norm struct llm_build_olmoe : public llm_graph_context { - llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11264,7 +11577,7 @@ struct llm_build_olmoe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11325,7 +11638,7 @@ struct llm_build_olmoe : public llm_graph_context { }; struct llm_build_openelm : public llm_graph_context { - llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11397,7 +11710,7 @@ struct llm_build_openelm : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Qcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11454,7 +11767,7 @@ struct llm_build_openelm : public llm_graph_context { }; struct llm_build_gptneox : public llm_graph_context { - llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11509,7 +11822,7 @@ struct llm_build_gptneox : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11600,7 +11913,7 @@ struct llm_build_gptneox : public llm_graph_context { }; struct llm_build_arctic : public llm_graph_context { - llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11659,7 +11972,7 @@ struct llm_build_arctic : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11738,7 +12051,7 @@ struct llm_build_arctic : public llm_graph_context { }; struct llm_build_deepseek : public llm_graph_context { - llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11814,7 +12127,7 @@ struct llm_build_deepseek : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } @@ -11900,7 +12213,7 @@ struct llm_build_deepseek : public llm_graph_context { }; struct llm_build_deepseek2 : public llm_graph_context { - llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { bool is_lite = (hparams.n_layer == 27); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); @@ -12042,7 +12355,7 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(Vcur, "Vcur", il); // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il); } else { @@ -12076,7 +12389,7 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(Kcur, "Kcur", il); // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } @@ -12163,7 +12476,7 @@ struct llm_build_deepseek2 : public llm_graph_context { }; struct llm_build_bitnet : public llm_graph_context { - llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12243,7 +12556,7 @@ struct llm_build_bitnet : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, NULL, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); @@ -12323,7 +12636,7 @@ struct llm_build_bitnet : public llm_graph_context { }; struct llm_build_t5_enc : public llm_graph_context { - llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12366,7 +12679,7 @@ struct llm_build_t5_enc : public llm_graph_context { ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo_enc, nullptr, Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); cb(cur, "kqv_out", il); @@ -12424,7 +12737,7 @@ struct llm_build_t5_enc : public llm_graph_context { }; struct llm_build_t5_dec : public llm_graph_context { - llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; //const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12472,7 +12785,7 @@ struct llm_build_t5_dec : public llm_graph_context { ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b); - cur = build_attn(inp_attn_self, gf, + cur = build_attn(inp_attn_self, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); cb(cur, "kqv_out", il); @@ -12504,7 +12817,7 @@ struct llm_build_t5_dec : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc); - cur = build_attn(inp_attn_cross, gf, + cur = build_attn(inp_attn_cross, model.layers[il].wo_cross, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); cb(cur, "kqv_out", il); @@ -12594,7 +12907,7 @@ struct llm_build_t5_dec : public llm_graph_context { }; struct llm_build_jais : public llm_graph_context { - llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12636,7 +12949,7 @@ struct llm_build_jais : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il); } @@ -12689,7 +13002,7 @@ struct llm_build_jais : public llm_graph_context { }; struct llm_build_chatglm : public llm_graph_context { - llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12768,7 +13081,7 @@ struct llm_build_chatglm : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12822,7 +13135,7 @@ struct llm_build_chatglm : public llm_graph_context { }; struct llm_build_glm4 : public llm_graph_context { - llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12901,7 +13214,7 @@ struct llm_build_glm4 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12973,7 +13286,7 @@ struct llm_build_glm4 : public llm_graph_context { }; struct llm_build_nemotron : public llm_graph_context { - llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13045,7 +13358,7 @@ struct llm_build_nemotron : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -13102,7 +13415,7 @@ struct llm_build_nemotron : public llm_graph_context { }; struct llm_build_exaone : public llm_graph_context { - llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13176,7 +13489,7 @@ struct llm_build_exaone : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -13232,32 +13545,168 @@ struct llm_build_exaone : public llm_graph_context { } }; -struct llm_build_rwkv6_base : public llm_graph_context { - const llama_model & model; +template +struct llm_build_exaone4 : public llm_graph_context { + llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_k; - llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) { - } + GGML_ASSERT(n_embd_head == hparams.n_embd_head_v); + GGML_ASSERT(n_embd_head == hparams.n_rot); - ggml_tensor * build_rwkv6_channel_mix( - const llama_layer * layer, - ggml_tensor * cur, - ggml_tensor * x_prev, - llm_arch arch) const { - ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - switch (arch) { - case LLM_ARCH_RWKV6: - { - ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + ggml_tensor * cur; + ggml_tensor * inpL; - ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - ggml_tensor * k = ggml_sqr( - ctx0, - ggml_relu( - ctx0, - build_lora_mm(layer->channel_mix_key, xk) - ) - ); + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + using inp_attn_type = std::conditional_t; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_unified_iswa(); + } else { + inp_attn = build_attn_inp_kv_unified(); + } + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // use RoPE for SWA layers or non-SWA models + const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE; + + cur = inpL; + + // self-attention + { + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + cb(Kcur, "Kcur_normed", il); + + if (use_rope) { + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + cb(cur, "attn_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_ffn(ffn_inp, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +struct llm_build_rwkv6_base : public llm_graph_context { + const llama_model & model; + + llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) { + } + + ggml_tensor * build_rwkv6_channel_mix( + const llama_layer * layer, + ggml_tensor * cur, + ggml_tensor * x_prev, + llm_arch arch) const { + ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); } break; default: @@ -13269,7 +13718,6 @@ struct llm_build_rwkv6_base : public llm_graph_context { ggml_tensor * build_rwkv6_time_mix( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, const llama_ubatch & ubatch, @@ -13396,7 +13844,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { } ggml_tensor * wkv_state = build_rs( - inp, gf, mctx_cur->get_s_l(il), + inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); ggml_tensor * wkv_output; @@ -13442,7 +13890,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { }; struct llm_build_rwkv6 : public llm_build_rwkv6_base { - llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) { + llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) { GGML_ASSERT(hparams.token_shift_count == 2); ggml_tensor * cur; @@ -13463,7 +13911,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); @@ -13478,7 +13926,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { 1 ); - cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il); + cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -13543,7 +13991,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { - llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) { + llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) { GGML_ASSERT(n_embd == hparams.n_embd_r()); ggml_tensor * cur; @@ -13563,7 +14011,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); cb(att_norm, "attn_norm", il); @@ -13575,7 +14023,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { 1 ); - cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il); + cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); @@ -13665,7 +14113,6 @@ struct llm_build_rwkv7_base : public llm_graph_context { ggml_tensor * build_rwkv7_time_mix( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor *& first_layer_value, @@ -13751,7 +14198,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens); ggml_tensor * wkv_state = build_rs( - inp, gf, mctx_cur->get_s_l(il), + inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state); @@ -13798,7 +14245,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { }; struct llm_build_rwkv7 : public llm_build_rwkv7_base { - llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) { + llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) { GGML_ASSERT(hparams.token_shift_count == 2); ggml_tensor * cur; @@ -13820,7 +14267,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); @@ -13835,7 +14282,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base { 1 ); - cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il); + cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -13894,7 +14341,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base { struct llm_build_arwkv7 : public llm_build_rwkv7_base { - llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) { + llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) { GGML_ASSERT(n_embd == hparams.n_embd_r()); ggml_tensor * cur; @@ -13915,7 +14362,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); cb(att_norm, "attn_norm", il); @@ -13927,7 +14374,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { 1 ); - cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il); + cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); @@ -13984,8 +14431,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { struct llm_build_granite : public llm_graph_context { llm_build_granite( const llama_model & model, - const llm_graph_params & params, - ggml_cgraph * gf) + const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -14019,7 +14465,7 @@ struct llm_build_granite : public llm_graph_context { // self-attention cur = build_attention_layer( - gf, cur, inp_pos, inp_attn, + cur, inp_pos, inp_attn, model, n_embd_head, il); if (il == n_layer - 1 && inp_out_ids) { @@ -14055,7 +14501,6 @@ struct llm_build_granite : public llm_graph_context { } ggml_tensor * build_attention_layer( - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv_unified * inp_attn, @@ -14110,7 +14555,7 @@ struct llm_build_granite : public llm_graph_context { cb(Vcur, "Vcur", il); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -14198,11 +14643,9 @@ struct llm_build_granite : public llm_graph_context { }; struct llm_build_granite_hybrid : public llm_graph_context_mamba { - llm_build_granite_hybrid( const llama_model & model, - const llm_graph_params & params, - ggml_cgraph * gf) : + const llm_graph_params & params) : llm_graph_context_mamba(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -14234,11 +14677,11 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { if (hparams.is_recurrent(il)) { // ssm layer // - cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il); + cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); } else { // attention layer // cur = build_attention_layer( - gf, cur, inp_pos, inp->get_attn(), model, + cur, inp_pos, inp->get_attn(), model, n_embd_head, il); } @@ -14277,7 +14720,6 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { } ggml_tensor * build_attention_layer( - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv_unified * inp_attn, @@ -14332,7 +14774,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { cb(Vcur, "Vcur", il); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -14426,7 +14868,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { // * removed bias // * removed MoE struct llm_build_chameleon : public llm_graph_context { - llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14517,7 +14959,7 @@ struct llm_build_chameleon : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -14603,7 +15045,7 @@ struct llm_build_chameleon : public llm_graph_context { }; struct llm_build_wavtokenizer_dec : public llm_graph_context { - llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -14755,7 +15197,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context { }; struct llm_build_plm : public llm_graph_context { - llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k)); const uint32_t n_embd_head_qk_rope = hparams.n_rot; @@ -14873,7 +15315,7 @@ struct llm_build_plm : public llm_graph_context { ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } @@ -14927,7 +15369,7 @@ struct llm_build_plm : public llm_graph_context { }; struct llm_build_bailingmoe : public llm_graph_context { - llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -14996,7 +15438,7 @@ struct llm_build_bailingmoe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); } @@ -15071,7 +15513,7 @@ struct llm_build_bailingmoe : public llm_graph_context { }; struct llm_build_dots1 : public llm_graph_context { - llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15136,7 +15578,7 @@ struct llm_build_dots1 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -15221,7 +15663,7 @@ struct llm_build_dots1 : public llm_graph_context { }; struct llm_build_ernie4_5 : public llm_graph_context { - llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15291,7 +15733,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -15350,8 +15792,178 @@ struct llm_build_ernie4_5 : public llm_graph_context { } }; +struct llm_build_ernie4_5_moe : public llm_graph_context { + llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0"); + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + // norm + { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + // self-attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + cb(cur, "attn_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + bool is_moe_layer = static_cast(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0; + + if (!is_moe_layer) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); + + // Shared expert (if present) + if (hparams.n_ff_shexp > 0) { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + } else { + cur = moe_out; + } + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_falcon_h1 : public llm_graph_context_mamba { - llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { const int64_t n_embd_head = hparams.n_embd_head_v; ggml_tensor * cur; @@ -15407,7 +16019,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { cb(Kcur, "Kcur-post-rope", il); cb(Vcur, "Vcur-post-rope", il); - ggml_tensor * attn_out = build_attn(inp->get_attn(), gf, + ggml_tensor * attn_out = build_attn(inp->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(attn_out, "attn_out", il); @@ -15418,7 +16030,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { // Mamba2 layer cb(cur, "ssm_in", il); - ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il); + ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); cb(ssm_out, "ssm_out", il); // // Aggregation @@ -15476,8 +16088,321 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { } }; +struct llm_build_plamo2 : public llm_graph_context_mamba { + llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { + ggml_tensor * cur; + ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "embedding_output", -1); + + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_hybrid = build_inp_mem_hybrid(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * residual = inpL; + + // ggml_graph_add_node(gf, model.layers[il].attn_norm); + // cb(model.layers[il].attn_norm, "attn_norm", il); + + // pre_mixer_norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + + // check if this layer is Mamba or Attention + bool is_mamba_layer = hparams.is_recurrent(il); + + if (is_mamba_layer) { + // PLaMo-2 Mamba layer + cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il); + } else { + // PLaMo-2 Attention layer + cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il); + } + + // post_mixer_norm + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + // residual connection + cur = ggml_add(ctx0, cur, residual); + cb(cur, "attn_residual", il); + residual = cur; + + // pre-ffn norm + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_pre_norm", il); + + // feed-forward network + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + // post ffn norm + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_post_norm", il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + // residual connection + cur = ggml_add(ctx0, cur, residual); + cb(cur, "ffn_residual", il); + + inpL = cur; + } + + cur = inpL; + + // final norm + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + // Explicitly mark as output tensor to ensure proper backend assignment + ggml_set_output(cur); + + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + +private: + ggml_tensor * build_plamo2_attn_layer( + llm_graph_input_attn_kv_unified * inp, + ggml_tensor * inp_pos, + ggml_tensor * cur, + const llama_model & model, + int il) { + + // self-attention + { + // PLaMo-2 uses combined QKV tensor + ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur); + cb(qkv, "wqkv", il); + + // split QKV tensor into Q, K, V + const int64_t n_embd_head_q = hparams.n_embd_head_k; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_head_v = hparams.n_embd_head_v; + int32_t n_head_kv = hparams.n_head_kv(il); + + const int64_t q_offset = 0; + const int64_t k_offset = n_embd_head_q * n_head; + const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv; + + ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv)); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il); + } + + cb(cur, "attn_out", il); + + return cur; + } + + ggml_tensor * build_plamo2_mamba_layer( + llm_graph_input_rs * inp, + ggml_tensor * cur, + const llama_model & model, + const llama_ubatch & ubatch, + int il) { + + const auto * mctx_cur = inp->mctx; + + const auto kv_head = mctx_cur->get_head(); + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_heads = hparams.ssm_dt_rank; + const int64_t head_dim = d_inner / n_heads; + const int64_t n_group = hparams.ssm_n_group; + const int64_t n_seqs = ubatch.n_seqs; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); + + ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur); + cb(zx, "mamba_in_proj", il); + // {8192, 5, 1, 1} -> {8192, 1, 5, 1} + zx = ggml_permute(ctx0, zx, 0, 2, 1, 3); + zx = ggml_cont(ctx0, zx); + zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs); + cb(zx, "mamba_in_proj_out", il); + + // split into z and x + // => {head_dim * n_heads, n_seq_tokens, n_seqs} + ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx)); + x = ggml_cont(ctx0, x); + x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs); + // x = ggml_permute(ctx0, x, 0, 2, 1, 3); + cb(x, "mamba_x_split", il); + + ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0); + cb(z, "mamba_z_split", il); + + // conv1d + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); + cb(conv_x, "mamba_conv1d_input", il); + + // copy last (d_conv - 1) columns back into the state cache + ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, + conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); + cb(conv_states_all, "mamba_conv1d_state", il); + + // 1D convolution + x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + cb(x, "mamba_conv1d", il); + + x = ggml_silu(ctx0, x); + cb(x, "mamba_conv1d_silu", il); + } + + // SSM + { + // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x); + cb(x_bcdt, "mamba_bcdt_proj", il); + + // split into dt, B, C + const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16)); + ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0); + ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state); + ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state)); + cb(B, "mamba_B_raw", il); + cb(C, "mamba_C_raw", il); + cb(dt, "mamba_dt_raw", il); + + // Apply RMS norm to dt, B, C (PLaMo-2 specific) + B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il); + C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il); + dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il); + cb(B, "mamba_B_normed", il); + cb(C, "mamba_C_normed", il); + cb(dt, "mamba_dt_normed", il); + + // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + cb(dt, "mamba_dt_proj", il); + + ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads); + cb(A, "mamba_A", il); + + x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0); + B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0); + C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0); + + // use the states and the indices provided by build_recurrent_state + // (this is necessary in order to properly use the states before they are overwritten, + // while avoiding to make unnecessary copies of the states) + auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size()); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. + // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); + }; + + ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + cb(y_ssm, "mamba_ssm_scan", il); + + // store last states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)), + ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all)))); + cb(ssm_states_all, "mamba_ssm_states", il); + + ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0); + cb(y, "mamba_y_view", il); + + // Add D parameter and apply gating with z + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads); + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D)); + cb(y, "mamba_y_add_d", il); + + y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); + cb(y, "mamba_y_swiglu_z", il); + + // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0); + cur = build_lora_mm(model.layers[il].ssm_out, y); + cb(cur, "mamba_out_proj", il); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + cb(cur, "mamba_out", il); + + return cur; + } +}; + struct llm_build_arcee : public llm_graph_context { - llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15553,7 +16478,7 @@ struct llm_build_arcee : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -15612,7 +16537,7 @@ struct llm_build_arcee : public llm_graph_context { }; struct llm_build_hunyuan_moe : public llm_graph_context { - llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15698,7 +16623,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { LLM_NORM_RMS, il); cb(Qcur, "Qcur_norm", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -15773,7 +16698,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { }; struct llm_build_smollm3 : public llm_graph_context { - llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15850,7 +16775,7 @@ struct llm_build_smollm3 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -15912,7 +16837,7 @@ struct llm_build_smollm3 : public llm_graph_context { struct llm_build_lfm2 : public llm_graph_context { const llama_model & model; - llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { + llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) { ggml_tensor * cur = build_inp_embd(model.tok_embd); cb(cur, "model.embed_tokens", -1); @@ -15927,8 +16852,8 @@ struct llm_build_lfm2 : public llm_graph_context { cb(cur, "model.layers.{}.operator_norm", il); cur = hparams.is_recurrent(il) ? - build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) : - build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ; + build_shortconv_block(cur, inp_hybrid->get_recr(), il) : + build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ; if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); @@ -15971,8 +16896,7 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor * build_attn_block(ggml_cgraph * gf, - ggml_tensor * cur, + ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv_unified * inp_attn, int il) const { @@ -16009,7 +16933,7 @@ struct llm_build_lfm2 : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "model.layers.{}.self_attn.out_proj", il); @@ -16017,11 +16941,22 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor * build_shortconv_block(ggml_cgraph * gf, - ggml_tensor * cur, + ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) { - const auto * mctx_cur = static_cast(mctx)->get_recr(); + const auto * mctx_cur = static_cast(mctx)->get_recr(); + const uint32_t kv_head = mctx_cur->get_head(); + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + GGML_ASSERT(hparams.n_shortconv_l_cache > 1); + const uint32_t d_conv = hparams.n_shortconv_l_cache - 1; + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur); cb(bcx, "model.layers.{}.conv.in_proj", il); @@ -16029,38 +16964,48 @@ struct llm_build_lfm2 : public llm_graph_context { constexpr auto n_chunks = 3; GGML_ASSERT(bcx->ne[0] % n_chunks == 0); auto const chunk_size = bcx->ne[0] / n_chunks; - auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx)); - auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx)); - auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx)); + auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx)); + auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx)); + auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx)); auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); - // read conv state directly, with build_rs generation is slower - ggml_tensor * conv_state = mctx_cur->get_r_l(il); - const int64_t n_seqs = ubatch.n_seqs; - ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs); - conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs); + // read conv state + auto * conv_state = mctx_cur->get_r_l(il); + auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs); + auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs); bx = ggml_concat(ctx0, conv, bx, 0); GGML_ASSERT(bx->ne[0] > conv->ne[0]); - auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); + // last d_conv columns is a new conv state + auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx)); GGML_ASSERT(ggml_are_same_shape(conv, new_conv)); - // write conv state - ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state)); + // write new conv conv state + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + new_conv, + ggml_view_1d( + ctx0, + conv_state, + ggml_nelements(new_conv), + kv_head*d_conv*n_embd*ggml_element_size(new_conv) + ) + ) + ); auto * conv_kernel = model.layers[il].shortconv.conv; - GGML_ASSERT(hparams.n_shortconv_l_cache > 0); - - // construct ssm_conv op - ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); + auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); cb(conv_out, "model.layers.{}.conv.conv", il); auto * y = ggml_mul(ctx0, c, conv_out); - y = build_lora_mm(model.layers[il].shortconv.out_proj, y); cb(y, "model.layers.{}.conv.out_proj", il); + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs); return y; } @@ -16078,6 +17023,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_NEO_BERT: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_DREAM: { res = nullptr; } break; @@ -16118,7 +17064,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } else { const auto padding = llama_kv_cache_unified::get_padding(cparams); - cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); + uint32_t n_ctx_per_stream = cparams.n_ctx; + + if (!cparams.kv_unified) { + n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max; + n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding); + + cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max; + } else { + n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding); + + cparams.n_ctx = n_ctx_per_stream; + } LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); @@ -16132,7 +17089,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, !cparams.flash_attn, cparams.offload_kqv, params.swa_full, - cparams.n_ctx, + cparams.kv_unified, + n_ctx_per_stream, cparams.n_seq_max, cparams.n_ubatch, padding); @@ -16146,7 +17104,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, params.type_v, !cparams.flash_attn, cparams.offload_kqv, - cparams.n_ctx, + cparams.kv_unified, + n_ctx_per_stream, cparams.n_seq_max, padding, hparams.n_swa, @@ -16159,227 +17118,233 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, return res; } -llm_graph_result_ptr llama_model::build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const { +ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { std::unique_ptr llm; switch (arch) { case LLM_ARCH_LLAMA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_LLAMA4: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DECI: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BAICHUAN: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_FALCON: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GROK: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_STARCODER: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_REFACT: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_NEO_BERT: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BLOOM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_MPT: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_STABLELM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; + case LLM_ARCH_DREAM: + { + llm = std::make_unique(*this, params); + } + break; case LLM_ARCH_QWEN2VL: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN2MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN3MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_PHI2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: { if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { - llm = std::make_unique> (*this, params, gf); + llm = std::make_unique> (*this, params); } else { - llm = std::make_unique>(*this, params, gf); + llm = std::make_unique>(*this, params); } } break; case LLM_ARCH_PLAMO: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_PLAMO2: + { + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GPT2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_CODESHELL: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ORION: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_INTERNLM2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_MINICPM3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA3N: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_STARCODER2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_MAMBA: case LLM_ARCH_MAMBA2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_JAMBA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_XVERSE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_COMMAND_R: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_COHERE2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DBRX: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OLMO: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OLMO2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OLMOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OPENELM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GPTNEOX: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ARCTIC: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_CHATGLM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GLM4: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BITNET: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_T5: { - switch (type) { + switch (params.gtype) { case LLM_GRAPH_TYPE_ENCODER: - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); break; case LLM_GRAPH_TYPE_DEFAULT: case LLM_GRAPH_TYPE_DECODER: - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); break; default: GGML_ABORT("invalid graph type"); @@ -16387,99 +17352,111 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_T5ENCODER: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_JAIS: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_NEMOTRON: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_EXAONE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_EXAONE4: + { + if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) { + llm = std::make_unique>(*this, params); + } else { + llm = std::make_unique>(*this, params); + } } break; case LLM_ARCH_RWKV6: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_RWKV6QWEN2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_RWKV7: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ARWKV7: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_MINICPM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GRANITE_HYBRID: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_CHAMELEON: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_PLM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BAILINGMOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DOTS1: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ARCEE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ERNIE4_5: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_ERNIE4_5_MOE: + { + llm = std::make_unique(*this, params); } break; case LLM_ARCH_HUNYUAN_MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_SMOLLM3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_FALCON_H1: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_LFM2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; default: GGML_ABORT("fatal error"); } // add on pooling layer - llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); + llm->build_pooling(cls, cls_b, cls_out, cls_out_b); - return std::move(llm->res); + return llm->res->get_gf(); } // @@ -16628,6 +17605,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_SMOLLM3: case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: + case LLM_ARCH_ERNIE4_5_MOE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -16642,6 +17620,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: + case LLM_ARCH_DREAM: case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3MOE: @@ -16651,6 +17630,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: case LLM_ARCH_PLAMO: + case LLM_ARCH_PLAMO2: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: @@ -16662,6 +17642,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ORION: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: + case LLM_ARCH_EXAONE4: case LLM_ARCH_MINICPM3: case LLM_ARCH_DOTS1: case LLM_ARCH_HUNYUAN_MOE: diff --git a/examples/talk-llama/llama-model.h b/examples/talk-llama/llama-model.h index 027a7f0c3e2..094e23808a8 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -99,8 +99,10 @@ enum llm_type { LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick LLM_TYPE_A13B, + LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, LLM_TYPE_235B_A22B, + LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_E2B, LLM_TYPE_E4B, }; @@ -452,10 +454,7 @@ struct llama_model { llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; // TODO: move this to new llm_arch_model_i interface - llm_graph_result_ptr build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const; + ggml_cgraph * build_graph(const llm_graph_params & params) const; private: struct impl; diff --git a/examples/talk-llama/llama-quant.cpp b/examples/talk-llama/llama-quant.cpp index 4dbd1e30991..a00af7a1d17 100644 --- a/examples/talk-llama/llama-quant.cpp +++ b/examples/talk-llama/llama-quant.cpp @@ -884,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); - new_type = qtype; - break; // if two or more types are specified for the tensor, first match wins + new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } } diff --git a/examples/talk-llama/llama-vocab.cpp b/examples/talk-llama/llama-vocab.cpp index e0e578d6394..e8bae645088 100644 --- a/examples/talk-llama/llama-vocab.cpp +++ b/examples/talk-llama/llama-vocab.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -404,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_KIMI_K2: + regex_exprs = { + // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp + // The custom handler implements all K2 patterns with proper Han character exclusion + "\\p{Han}+", + }; + break; case LLAMA_VOCAB_PRE_TYPE_SUPERBPE: regex_exprs = { "\\p{N}+", @@ -1196,6 +1204,284 @@ struct llm_tokenizer_rwkv_session { const llm_tokenizer_rwkv & tokenizer; }; +struct llm_tokenizer_plamo2 : llm_tokenizer { + llm_tokenizer_plamo2(const llama_vocab & vocab) { + build(vocab); + } + + void build(const llama_vocab & vocab) { + // Reset internal structures + tokens_.clear(); + bytes_.assign(256, 0); + to_suffix_id_.clear(); + table_.clear(); + + // Build token list and byte mapping + std::unordered_map suffix_to_score; + std::unordered_map token_to_id; + + for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) { + const auto & entry = vocab.get_token_data(token_id); + tokens_.push_back(entry.text); + token_to_id[entry.text] = static_cast(token_id); + + // Handle byte tokens + if (vocab.is_byte(token_id)) { + if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') { + std::string hex_str = entry.text.substr(3, 2); + int byte_val = std::stoi(hex_str, nullptr, 16); + bytes_[byte_val] = static_cast(token_id); + } + continue; + } + + // Add token and all its suffixes to suffix_to_score + suffix_to_score[entry.text] = entry.score; + + // Extract suffixes character by character (UTF-8 aware) + std::vector cpts = unicode_cpts_from_utf8(entry.text); + for (size_t i = 1; i < cpts.size(); ++i) { + std::string suffix; + for (size_t j = i; j < cpts.size(); ++j) { + suffix += unicode_cpt_to_utf8(cpts[j]); + } + if (suffix_to_score.find(suffix) == suffix_to_score.end()) { + suffix_to_score[suffix] = std::numeric_limits::quiet_NaN(); + } + } + } + + // Check that all byte tokens are set + for (int i = 0; i < 256; ++i) { + if (bytes_[i] == 0) { + throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set"); + } + } + + // Build suffix list in lexicographical order of reversed strings + std::vector suffixes; + for (const auto & pair : suffix_to_score) { + suffixes.push_back(pair.first); + } + suffixes.push_back(""); // Empty suffix + + std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) { + std::string rev_a(a.rbegin(), a.rend()); + std::string rev_b(b.rbegin(), b.rend()); + return rev_a < rev_b; + }); + + // Build suffix_to_id and to_suffix_id_ + std::unordered_map suffix_to_id; + int32_t num_pieces = 0; + + for (const auto & suffix : suffixes) { + suffix_to_id[suffix] = num_pieces; + if (!suffix.empty()) { + std::vector cpts = unicode_cpts_from_utf8(suffix); + + std::string remaining; + for (size_t i = 1; i < cpts.size(); ++i) { + remaining += unicode_cpt_to_utf8(cpts[i]); + } + + int64_t piece_code = (static_cast(cpts[0]) << 32) | suffix_to_id[remaining]; + to_suffix_id_[piece_code] = num_pieces; + + // Count number of pieces for this suffix + int32_t pieces_for_suffix = 1; // sentinel row + for (int32_t piece_length = static_cast(cpts.size()); piece_length > 0; --piece_length) { + std::string piece; + for (int32_t i = 0; i < piece_length; ++i) { + piece += unicode_cpt_to_utf8(cpts[i]); + } + if (suffix_to_score.find(piece) != suffix_to_score.end()) { + pieces_for_suffix++; + } + } + num_pieces += pieces_for_suffix; + } else { + num_pieces++; // Empty suffix contributes one piece (sentinel row) + } + } + + // Build flattened table + table_.resize(num_pieces, std::vector(4, 0)); + int32_t table_idx = 0; + + for (const auto & suffix : suffixes) { + // Add all prefixes of the suffix to the table (in decreasing order of length) + std::vector cpts = unicode_cpts_from_utf8(suffix); + for (int32_t piece_length = static_cast(cpts.size()); piece_length > 0; --piece_length) { + std::string piece; + for (int32_t i = 0; i < piece_length; ++i) { + piece += unicode_cpt_to_utf8(cpts[i]); + } + + auto score_it = suffix_to_score.find(piece); + if (score_it == suffix_to_score.end()) { + continue; + } + + table_[table_idx][TABLE_PIECE_LENGTH] = piece_length; + auto token_it = token_to_id.find(piece); + table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1; + + float score = score_it->second; + table_[table_idx][TABLE_SCORE] = std::isfinite(score) ? + static_cast(std::round(score * 1e4)) : INVALID_SCORE; + table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece]; + + table_idx++; + } + + // Add sentinel row + table_[table_idx][TABLE_PIECE_LENGTH] = 1; + table_[table_idx][TABLE_TOKEN_ID] = -1; + table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE; + table_idx++; + } + } + + std::vector encode(const std::string & text) const { + std::vector unicode_data = unicode_cpts_from_utf8(text); + // Skip the first code point if it is a BOM (Byte Order Mark) + if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) { + unicode_data.erase(unicode_data.begin()); + } + + if (unicode_data.empty()) { + return {}; + } + + const size_t data_len = unicode_data.size(); + + // Initialize scores array (dynamic programming) + std::vector scores(data_len + 1, static_cast(1) << 60); + scores[data_len] = 0; + + // Path array to track best tokenization + std::vector> path(data_len + 1, std::vector(3, 0)); + + int32_t suffix_id = 0; + + // Process from end to beginning + for (int i = static_cast(data_len) - 1; i >= 0; --i) { + uint32_t c = unicode_data[i]; + + // Find next suffix ID + for (size_t p = suffix_id; p < table_.size(); ++p) { + int64_t piece_code = (static_cast(c) << 32) | table_[p][TABLE_PIECE_ID]; + auto it = to_suffix_id_.find(piece_code); + suffix_id = (it != to_suffix_id_.end()) ? it->second : 0; + + if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) { + break; + } + } + + // Update best path + for (size_t p = suffix_id; p < table_.size(); ++p) { + int32_t score = table_[p][TABLE_SCORE]; + if (score > INVALID_SCORE) { + int32_t piece_length = table_[p][TABLE_PIECE_LENGTH]; + int64_t s = scores[i + piece_length] - score; + + if (s < scores[i]) { + scores[i] = s; + path[i][PATH_TOKEN_LENGTH] = piece_length; + path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID]; + path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1; + + if (score == UNKNOWN_SCORE) { + // Add UTF-8 byte count + path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); + } + } + } + + if (score == UNKNOWN_SCORE) { + break; + } + } + } + + // Decode the best path + std::vector token_ids; + token_ids.reserve(path[0][PATH_NUM_TOKENS]); + + int pos = 0; + while (pos < static_cast(data_len)) { + if (path[pos][PATH_TOKEN_ID] >= 0) { + token_ids.push_back(path[pos][PATH_TOKEN_ID]); + } else { + // Fall back to byte tokens + uint32_t c = unicode_data[pos]; + int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); + + for (int i = 0; i < s; ++i) { + uint8_t b; + if (s == 1) { + b = c; + } else { + if (i == 0) { + b = (0xF00 >> s) & 0xFF; + } else { + b = 0x80; + } + } + token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]); + } + } + + assert(path[pos][PATH_TOKEN_LENGTH] > 0); + pos += path[pos][PATH_TOKEN_LENGTH]; + } + + return token_ids; + } +private: + // Constants for table structure + static constexpr int32_t TABLE_PIECE_LENGTH = 0; + static constexpr int32_t TABLE_TOKEN_ID = 1; + static constexpr int32_t TABLE_SCORE = 2; + static constexpr int32_t TABLE_PIECE_ID = 3; + + // Constants for path array + static constexpr int32_t PATH_TOKEN_LENGTH = 0; + static constexpr int32_t PATH_TOKEN_ID = 1; + static constexpr int32_t PATH_NUM_TOKENS = 2; + + // Score constants + static constexpr int32_t INVALID_SCORE = -20000000; + static constexpr int32_t UNKNOWN_SCORE = -10000000; + + // List of tokens in the vocabulary + std::vector tokens_; + + // Mapping from byte code point to token ID (for byte fallback) + std::vector bytes_; + + // Mapping from piece code to suffix ID + std::unordered_map to_suffix_id_; + + // Flattened table representing the Trie structure + // Each row contains: [piece_length, token_id, score, piece_id] + std::vector> table_; +}; + +struct llm_tokenizer_plamo2_session { + llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {} + + void tokenize(const std::string & text, std::vector & output) { + std::vector tokens = tokenizer.encode(text); + output.insert(output.end(), tokens.begin(), tokens.end()); + } + +private: + const llm_tokenizer_plamo2 & tokenizer; +}; + // // impl // @@ -1499,6 +1785,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { special_unk_id = LLAMA_TOKEN_NULL; special_sep_id = LLAMA_TOKEN_NULL; special_pad_id = LLAMA_TOKEN_NULL; + } else if (tokenizer_model == "plamo2") { + type = LLAMA_VOCAB_TYPE_PLAMO2; + + // PLaMo-2 default special tokens (these will be overridden by model config) + special_bos_id = 1; // <|plamo:bos|> + special_eos_id = 2; // <|plamo:eos|> + special_unk_id = 0; // <|plamo:unk|> + special_sep_id = LLAMA_TOKEN_NULL; + special_pad_id = 3; // <|plamo:pad|> + special_mask_id = LLAMA_TOKEN_NULL; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -1629,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "exaone") { pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE; + } else if ( + tokenizer_pre == "exaone4") { + pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "chameleon") { pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; @@ -1665,6 +1964,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "hunyuan") { pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; clean_spaces = false; + } else if ( + tokenizer_pre == "kimi-k2") { + pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -2145,13 +2448,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const { std::string llama_vocab::impl::type_name() const{ switch (type) { - case LLAMA_VOCAB_TYPE_NONE: return "no vocab"; - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - case LLAMA_VOCAB_TYPE_UGM: return "UGM"; - case LLAMA_VOCAB_TYPE_RWKV: return "RWKV"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_NONE: return "no vocab"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + case LLAMA_VOCAB_TYPE_UGM: return "UGM"; + case LLAMA_VOCAB_TYPE_RWKV: return "RWKV"; + case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2"; + default: return "unknown"; } } @@ -2234,6 +2538,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) { case LLAMA_VOCAB_TYPE_RWKV: tokenizer = std::make_unique(vocab); break; + case LLAMA_VOCAB_TYPE_PLAMO2: + tokenizer = std::make_unique(vocab); + break; default: GGML_ABORT("unsupported vocab type"); } @@ -2566,6 +2873,23 @@ std::vector llama_vocab::impl::tokenize( if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); +#ifdef PRETOKENIZERDEBUG + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); +#endif + + session.tokenize(text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + output.push_back(fragment.token); + } + } + } break; + case LLAMA_VOCAB_TYPE_PLAMO2: + { + llm_tokenizer_plamo2_session session(*static_cast(tokenizer.get())); + for (const auto & fragment : fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + std::string text = fragment.raw_text.substr(fragment.offset, fragment.length); + #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif @@ -2664,6 +2988,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t memcpy(buf, result.data(), result.size()); return (int)result.size(); } + case LLAMA_VOCAB_TYPE_PLAMO2: { + // PLaMo-2 uses similar token handling as BPE/SPM + if (vocab.is_byte(token)) { + // Handle byte tokens like <0xXX> + if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') { + int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16); + if (length < 1) { + return -1; + } + buf[0] = static_cast(hex_val); + return 1; + } + } + + // Normal token - just copy the text + std::string result = token_text; + return _try_copy(result.data(), result.size()); + } default: GGML_ABORT("fatal error"); } @@ -2908,6 +3250,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const { case LLAMA_VOCAB_TYPE_BPE: { return pimpl->token_to_id.at(unicode_byte_to_utf8(ch)); } + case LLAMA_VOCAB_TYPE_PLAMO2: { + // PLaMo-2 uses byte tokens in format <0xXX> + char hex_str[8]; + snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch); + return pimpl->token_to_id.at(hex_str); + } default: GGML_ABORT("fatal error"); } @@ -3009,6 +3357,10 @@ llama_token llama_vocab::token_fim_sep() const { return pimpl->special_fim_sep_id; } +llama_token llama_vocab::token_mask() const { + return pimpl->special_mask_id; +} + bool llama_vocab::get_add_space_prefix() const { return pimpl->add_space_prefix; } @@ -3249,6 +3601,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) { return vocab->token_fim_sep(); } +llama_token llama_vocab_mask(const struct llama_vocab* vocab) { + return vocab->token_mask(); +} + // deprecated const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) { return llama_vocab_get_text(vocab, token); @@ -3385,4 +3741,3 @@ int32_t llama_detokenize( bool unparse_special) { return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special); } - diff --git a/examples/talk-llama/llama-vocab.h b/examples/talk-llama/llama-vocab.h index 46a1ccecb51..842b129e861 100644 --- a/examples/talk-llama/llama-vocab.h +++ b/examples/talk-llama/llama-vocab.h @@ -45,6 +45,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, + LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, }; struct LLM_KV; @@ -100,6 +101,7 @@ struct llama_vocab { llama_token token_sep() const; llama_token token_nl () const; llama_token token_pad() const; + llama_token token_mask() const; llama_token token_prefix() const; llama_token token_middle() const; diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index f73b1ab65fe..6f454a508a0 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h @@ -71,12 +71,13 @@ extern "C" { typedef int32_t llama_seq_id; enum llama_vocab_type { - LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab - LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback - LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE - LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece - LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram - LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization + LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab + LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback + LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE + LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece + LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram + LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization + LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming }; enum llama_rope_type { @@ -334,6 +335,9 @@ extern "C" { bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 + bool kv_unified; // use a unified buffer across the input sequences when computing the attention + // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix + // ref: https://github.com/ggml-org/llama.cpp/pull/14363 }; // model quantization parameters @@ -724,7 +728,7 @@ extern "C" { // - lazily on next llama_decode() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - DEPRECATED(void llama_kv_self_seq_div( + DEPRECATED(LLAMA_API void llama_kv_self_seq_div( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, @@ -952,6 +956,7 @@ extern "C" { // in the order they have appeared in the batch. // Rows: number of tokens for which llama_batch.logits[i] != 0 // Cols: n_vocab + // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522) LLAMA_API float * llama_get_logits(struct llama_context * ctx); // Logits for the ith token. For positive indices, Equivalent to: @@ -966,6 +971,7 @@ extern "C" { // in the order they have appeared in the batch. // shape: [n_outputs*n_embd] // Otherwise, returns NULL. + // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Get the embeddings for the ith token. For positive indices, Equivalent to: @@ -1004,6 +1010,7 @@ extern "C" { LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding + LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab); @@ -1389,6 +1396,7 @@ extern "C" { int32_t n_p_eval; int32_t n_eval; + int32_t n_reused; // number of times a ggml compute graph had been reused }; struct llama_perf_sampler_data { diff --git a/examples/talk-llama/unicode.cpp b/examples/talk-llama/unicode.cpp index 43a4581b961..65f36651715 100644 --- a/examples/talk-llama/unicode.cpp +++ b/examples/talk-llama/unicode.cpp @@ -557,6 +557,178 @@ static std::vector unicode_regex_split_stl(const std::string & text, con return bpe_offsets; } +// K2 system regex patterns (from tokenization_kimi.py): +// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ +static std::vector unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector & offsets) { + std::vector bpe_offsets; + bpe_offsets.reserve(offsets.size()); + + const auto cpts = unicode_cpts_from_utf8(text); + + size_t start = 0; + for (auto offset : offsets) { + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; + auto _get_cpt = [&] (const size_t pos) -> uint32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; + }; + + auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{}; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const uint32_t cpt = _get_cpt(pos); + const auto flags = _get_flags(pos); + + // Pattern 1: [\p{Han}]+ (Chinese characters) + if (unicode_cpt_is_han(cpt)) { + while (unicode_cpt_is_han(_get_cpt(pos))) { + pos++; + } + _add_token(pos); + continue; + } + + // Pattern 2 & 3: Letter words excluding Han characters with optional contractions + // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)? + // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)? + // Check if current char is a letter OR if current char could be a leading char and next char is a letter + bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) || + (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) && + _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1))); + + if (is_letter_pattern) { + // Handle optional leading non-letter/non-number character + bool has_leading_char = false; + if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) { + has_leading_char = true; + pos++; + } + + // Match letter sequence (excluding Han characters) + bool has_letters = false; + while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) { + has_letters = true; + pos++; + } + + // Only proceed if we found letters (after potentially skipping leading char) + if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) { + if (!has_letters) pos++; // consume the first letter if we didn't already + + // Continue consuming letters + while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) { + pos++; + } + + // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d) + if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) { + uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1)); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += 2; + } else if (pos + 2 < offset_end) { + uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2)); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += 3; + } + } + } + + _add_token(pos); + continue; + } else if (has_leading_char) { + // We consumed a leading char but found no letters, backtrack + pos--; + } + } + + // Pattern 4: \p{N}{1,3} (numbers 1-3 digits) + if (flags.is_number) { + size_t ini = pos; + while (_get_flags(pos).is_number) { + if (++pos - ini >= 3) { + _add_token(pos); + ini = pos; + } + } + _add_token(pos); + continue; + } + + // Pattern 5: ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines) + auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags); + if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) { + pos += (cpt == ' '); + while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) { + flags2 = _get_flags(++pos); + } + // Match optional [\r\n]* + uint32_t cpt2 = _get_cpt(pos); + while (cpt2 == '\r' || cpt2 == '\n') { + cpt2 = _get_cpt(++pos); + } + _add_token(pos); + continue; + } + + // Count whitespace characters + size_t num_whitespaces = 0; + size_t last_end_r_or_n = 0; + while (_get_flags(pos + num_whitespaces).is_whitespace) { + uint32_t cpt2 = _get_cpt(pos + num_whitespaces); + if (cpt2 == '\r' || cpt2 == '\n') { + last_end_r_or_n = pos + num_whitespaces + 1; + } + num_whitespaces++; + } + + // Pattern 6: \s*[\r\n]+ (whitespace with newlines) + if (last_end_r_or_n > 0) { + pos = last_end_r_or_n; + _add_token(pos); + continue; + } + + // Pattern 7: \s+(?!\S) (trailing whitespace) + if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; + } + + // Pattern 8: \s+ (general whitespace) + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; + } + + // No matches - consume single character + _add_token(++pos); + } + } + + return bpe_offsets; +} + static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { std::vector bpe_offsets; @@ -567,6 +739,9 @@ static std::vector unicode_regex_split_custom(const std::string & text, regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + } else if (regex_expr == "\\p{Han}+") { + // K2's first pattern - handle all K2 patterns together + bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets); } return bpe_offsets; @@ -672,6 +847,38 @@ uint32_t unicode_tolower(uint32_t cpt) { return cpt; // Return the original code point if no lowercase mapping is found } +bool unicode_cpt_is_han(uint32_t cpt) { + // Han character ranges (Chinese/CJK characters) + // CJK Unified Ideographs (most common) + if (cpt >= 0x4E00 && cpt <= 0x9FFF) return true; + + // CJK Extension A + if (cpt >= 0x3400 && cpt <= 0x4DBF) return true; + + // CJK Extension B + if (cpt >= 0x20000 && cpt <= 0x2A6DF) return true; + + // CJK Extension C + if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true; + + // CJK Extension D + if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true; + + // CJK Extension E + if (cpt >= 0x2B820 && cpt <= 0x2CEAF) return true; + + // CJK Extension F + if (cpt >= 0x2CEB0 && cpt <= 0x2EBEF) return true; + + // CJK Compatibility Ideographs + if (cpt >= 0xF900 && cpt <= 0xFAFF) return true; + + // CJK Compatibility Ideographs Supplement + if (cpt >= 0x2F800 && cpt <= 0x2FA1F) return true; + + return false; +} + std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { // unicode categories static const std::map k_ucat_enum = { diff --git a/examples/talk-llama/unicode.h b/examples/talk-llama/unicode.h index c27098df7d4..0a5fa2a78ce 100644 --- a/examples/talk-llama/unicode.h +++ b/examples/talk-llama/unicode.h @@ -63,4 +63,6 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8); uint32_t unicode_tolower(uint32_t cpt); +bool unicode_cpt_is_han(uint32_t cpt); + std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs); From 28b39c624ecf1e09f90b93ac3ea3f90d3028f694 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Jul 2025 16:08:57 +0300 Subject: [PATCH 0165/1348] ggml : remove old kompute, cann (skip) (#3349) ggml-ci --- ggml/include/ggml-kompute.h | 50 - ggml/src/ggml-cann/kernels/CMakeLists.txt | 30 - ggml/src/ggml-cann/kernels/ascendc_kernels.h | 19 - ggml/src/ggml-cann/kernels/dup.cpp | 234 -- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 197 -- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 190 -- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 204 -- ggml/src/ggml-cann/kernels/get_row_q8_0.cpp | 191 -- .../ggml-cann/kernels/quantize_f16_q8_0.cpp | 218 -- .../ggml-cann/kernels/quantize_f32_q8_0.cpp | 216 -- .../kernels/quantize_float_to_q4_0.cpp | 295 --- ggml/src/ggml-kompute/CMakeLists.txt | 166 -- ggml/src/ggml-kompute/ggml-kompute.cpp | 2251 ----------------- .../ggml-kompute/kompute-shaders/common.comp | 112 - .../ggml-kompute/kompute-shaders/op_add.comp | 58 - .../kompute-shaders/op_addrow.comp | 25 - .../kompute-shaders/op_cpy_f16_f16.comp | 52 - .../kompute-shaders/op_cpy_f16_f32.comp | 52 - .../kompute-shaders/op_cpy_f32_f16.comp | 52 - .../kompute-shaders/op_cpy_f32_f32.comp | 52 - .../kompute-shaders/op_diagmask.comp | 30 - .../ggml-kompute/kompute-shaders/op_gelu.comp | 22 - .../kompute-shaders/op_getrows.comp | 17 - .../kompute-shaders/op_getrows_f16.comp | 31 - .../kompute-shaders/op_getrows_f32.comp | 31 - .../kompute-shaders/op_getrows_q4_0.comp | 38 - .../kompute-shaders/op_getrows_q4_1.comp | 39 - .../kompute-shaders/op_getrows_q6_k.comp | 44 - .../ggml-kompute/kompute-shaders/op_mul.comp | 52 - .../kompute-shaders/op_mul_mat_f16.comp | 69 - .../kompute-shaders/op_mul_mat_mat_f32.comp | 51 - .../kompute-shaders/op_mul_mat_q4_0.comp | 33 - .../kompute-shaders/op_mul_mat_q4_1.comp | 35 - .../kompute-shaders/op_mul_mat_q4_k.comp | 140 - .../kompute-shaders/op_mul_mat_q6_k.comp | 106 - .../kompute-shaders/op_mul_mat_q8_0.comp | 73 - .../kompute-shaders/op_mul_mv_q_n.comp | 52 - .../kompute-shaders/op_mul_mv_q_n_pre.comp | 28 - .../ggml-kompute/kompute-shaders/op_norm.comp | 84 - .../ggml-kompute/kompute-shaders/op_relu.comp | 21 - .../kompute-shaders/op_rmsnorm.comp | 53 - .../kompute-shaders/op_rope_neox_f16.comp | 52 - .../kompute-shaders/op_rope_neox_f32.comp | 52 - .../kompute-shaders/op_rope_norm_f16.comp | 52 - .../kompute-shaders/op_rope_norm_f32.comp | 52 - .../kompute-shaders/op_scale.comp | 19 - .../kompute-shaders/op_scale_8.comp | 23 - .../ggml-kompute/kompute-shaders/op_silu.comp | 22 - .../kompute-shaders/op_softmax.comp | 72 - .../kompute-shaders/rope_common.comp | 71 - 50 files changed, 6128 deletions(-) delete mode 100644 ggml/include/ggml-kompute.h delete mode 100644 ggml/src/ggml-cann/kernels/CMakeLists.txt delete mode 100644 ggml/src/ggml-cann/kernels/ascendc_kernels.h delete mode 100644 ggml/src/ggml-cann/kernels/dup.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_f16.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_f32.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp delete mode 100644 ggml/src/ggml-kompute/CMakeLists.txt delete mode 100644 ggml/src/ggml-kompute/ggml-kompute.cpp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/common.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_add.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_norm.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_relu.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_scale.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_silu.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp delete mode 100644 ggml/src/ggml-kompute/kompute-shaders/rope_common.comp diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h deleted file mode 100644 index 154aa56a742..00000000000 --- a/ggml/include/ggml-kompute.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_KOMPUTE_MAX_DEVICES 16 - -struct ggml_vk_device { - int index; - int type; // same as VkPhysicalDeviceType - size_t heapSize; - const char * name; - const char * vendor; - int subgroupSize; - uint64_t bufferAlignment; - uint64_t maxAlloc; -}; - -struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); -bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name); -bool ggml_vk_has_vulkan(void); -bool ggml_vk_has_device(void); -struct ggml_vk_device ggml_vk_current_device(void); - -// -// backend API -// - -// forward declaration -typedef struct ggml_backend * ggml_backend_t; - -GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device); - -GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt deleted file mode 100644 index d687220c3c5..00000000000 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -file(GLOB SRC_FILES - get_row_f32.cpp - get_row_f16.cpp - get_row_q4_0.cpp - get_row_q8_0.cpp - quantize_f32_q8_0.cpp - quantize_f16_q8_0.cpp - quantize_float_to_q4_0.cpp - dup.cpp -) - -set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) -set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") - -if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) -else() - message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") -endif() -include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) - -ascendc_library(ascendc_kernels STATIC - ${SRC_FILES} -) - -message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.") -ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") -# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h deleted file mode 100644 index 7e153208cfd..00000000000 --- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef ASCENDC_KERNELS_H -#define ASCENDC_KERNELS_H - -#include "aclrtlaunch_ascendc_get_row_f32.h" -#include "aclrtlaunch_ascendc_get_row_f16.h" -#include "aclrtlaunch_ascendc_get_row_q8_0.h" -#include "aclrtlaunch_ascendc_get_row_q4_0.h" - -#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" -#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" - -#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" - -#endif // ASCENDC_KERNELS_H diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp deleted file mode 100644 index d9b9574494b..00000000000 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ /dev/null @@ -1,234 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; - -#define BUFFER_NUM 2 -const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template - -template -class DupByRows { - public: - __aicore__ inline DupByRows() {} - __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, - size_t *input_nb_ub) { - /* Dup by rows when src is contigous on first dimension and dst is - contiguous, each kernel process one row. - */ - - // Input has four dims. - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // param - num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; - num_elem = input_ne_ub[0]; - - // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) - idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]); - idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) - / (input_ne_ub[1]); - idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) - - idx_ne2 * input_ne_ub[1]; - - // src may not contiguous in dim [1,2,3], so stride decited by ne&nb - src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2 - + input_nb_ub[1] * idx_ne1; - - // dst is contiguous - dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T)); - - src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + - src_stride)); - dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + - dst_stride)); - - pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + - 32 - 1) / 32 * 32); - pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + - 32 - 1) / 32 * 32); - } - - __aicore__ inline void copy_in() { - LocalTensor src_local = src_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(SRC_T); - size_t tail = num_elem % elem_per_block; - size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; - DataCopy(src_local, src_gm, cpy_elements_len); - src_queue.EnQue(src_local); - } - - __aicore__ inline void copy_out() { - LocalTensor dst_local = dst_queue.DeQue(); -#ifdef ASCEND_310P - const size_t elem_per_block = 32 / sizeof(DST_T); - size_t tail = num_elem % elem_per_block; - size_t len = num_elem & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(dst_gm, dst_local, len); - } - if(tail != 0) { - for (size_t i = tail; i < elem_per_block; i++) { - dst_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(dst_gm[len], dst_local[len], elem_per_block); - SetAtomicNone(); - } -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(DST_T); - DataCopyPad(dst_gm, dst_local, dataCopyParams); -#endif - dst_queue.FreeTensor(dst_local); - } - - __aicore__ inline void dup() { - // main process, copy one row data from src to dst. - copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - int32_t BLOCK_NUM = 32 / sizeof(DST_T); - DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) - / BLOCK_NUM * BLOCK_NUM); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - __aicore__ inline void dup_with_cast() { - // main process, copy one row data from src to dst. - // cast dtype from src to dst. - copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - private: - - TPipe pipe; - GlobalTensor src_gm; - GlobalTensor dst_gm; - - int64_t num_rows; - int64_t num_elem; - int64_t idx_ne3; - int64_t idx_ne2; - int64_t idx_ne1; - int64_t src_stride; - int64_t dst_stride; - - TQue src_queue; - TQue dst_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - // copy params from gm to ub. - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp deleted file mode 100644 index 416b45104de..00000000000 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F16 { - public: - __aicore__ inline GET_ROW_F16() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - // TODO, use template for F16/f32 - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ half *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) - & ~31); - uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) - & ~31); - - local_buffer_elems = input_local_buffer_size / sizeof(half); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - size_t origin_len = len; - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(half); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - Cast(output_local, input_local, RoundMode::CAST_NONE, - local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f16( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F16 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp deleted file mode 100644 index 02116905b18..00000000000 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F32 { - public: - __aicore__ inline GET_ROW_F32() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ float *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); - local_buffer_elems = local_buffer_size / sizeof(float); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - DataCopy(output_local, input_local, local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f32( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F32 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp deleted file mode 100644 index 4fbe722086c..00000000000 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ /dev/null @@ -1,204 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support 4bit get row - extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support 4bit get row.\n"); - } -#else - -#define BUFFER_NUM 2 - -#define QK4_0 32 - -class GET_ROW_Q4_0 { - public: - __aicore__ inline GET_ROW_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK4_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK4_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] / 2; - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? - DataCopy(input_local, input_gm[offset], QK4_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK4_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK4_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK4_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - - // Only mul need compile by group. - half scale = scale_gm.GetValue(scale_offset); - - Muls(output_local, output_local, (float)scale, QK4_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q4_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp deleted file mode 100644 index ba9ab3c0483..00000000000 --- a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -#define QK8_0 32 - -class GET_ROW_Q8_0 { - public: - __aicore__ inline GET_ROW_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK8_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK8_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] * sizeof(int8_t); - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int8_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK8_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK8_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); - - // Only mul need compile by group. - half scale = scale_gm.GetValue(scale_offset); - Muls(output_local, output_local, (float)scale, QK8_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q8_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp deleted file mode 100644 index 504b43afaa1..00000000000 --- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P - extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f16->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F16_Q8_0 { - public: - __aicore__ inline QUANTIZE_F16_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. - uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ half *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, 32); - pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Abs(abs_local, cast_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, QK8_0); - } - - Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue scale_queue; - TQue cast_queue; - -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F16_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp deleted file mode 100644 index 05b0bc1df59..00000000000 --- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support f32->8bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F32_Q8_0 { - public: - __aicore__ inline QUANTIZE_F32_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. - uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ float *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + - ir * group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); - pipe.InitBuffer(scale_queue, 1, 32); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Abs(abs_local, input_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(input_local, input_local, 1.0f / d, QK8_0); - } - - Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue cast_queue; - TQue scale_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F32_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp deleted file mode 100644 index 1188937b744..00000000000 --- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +++ /dev/null @@ -1,295 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support float->4bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->4bit quantization.\n"); - } - - extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f16->4bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define Group_Size 32 - -template -class QUANTIZE_FLOAT_TO_Q4_0 { - public: - __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], - // permute=[0,0,0,0]): - // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // input stride of data elements - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - output_ne[i] = output_ne_ub[i]; - } - - // output stride of data elements - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - // scale saved one by one after data:. [group1_scale, group2_scale, ...] - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / Group_Size; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. - uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t) / 2; - - input_gm.SetGlobalBuffer((__gm__ SRC_T *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); - pipe.InitBuffer(output_queue, BUFFER_NUM, - Group_Size * sizeof(int8_t) / 2); - pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); - pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); - pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); - pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], Group_Size); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t, - // and using DataCopyPad to avoid 32 bits align. - LocalTensor output_local = output_queue.DeQue(); - LocalTensor output_int8_local = - output_local.ReinterpretCast(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t); - DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams); - - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - DataCopy(cast_local, input_local, Group_Size); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + Group_Size * group; - - // output_offset is stride for output_gm which datatype is int8_t and - // divided by 2 is needed for int4b_t. - const int64_t output_offset = (i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + - Group_Size * group) / 2; - copy_in(input_offset); - - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor min_local = min_queue.AllocTensor(); - LocalTensor int8_local = int8_queue.AllocTensor(); - LocalTensor half_local = half_queue.AllocTensor(); - - input_to_cast(cast_local, input_local); - - ReduceMax(max_local, cast_local, work_local, Group_Size); - ReduceMin(min_local, cast_local, work_local, Group_Size); - const float max_value = max_local.GetValue(0); - const float min_value = min_local.GetValue(0); - float d = max_value; - if (min_value < 0 && (-1 * min_value) > max_value) { - d = min_value; - } - - d = d / (-8); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, Group_Size); - } - - // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] - float scalar = 8.5f; - Adds(cast_local, cast_local, scalar, Group_Size); - Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); - scalar = 15.0f; - Mins(cast_local, cast_local, scalar, Group_Size); - scalar = -8.0f; - Adds(cast_local, cast_local, scalar, Group_Size); - - // float->half->int4b - Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); - Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); - - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - max_queue.FreeTensor(max_local); - min_queue.FreeTensor(min_local); - int8_queue.FreeTensor(int8_local); - half_queue.FreeTensor(half_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - // Copy Group_Size/2 length data each time. - if (scale_local_offset == Group_Size / 2) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, - Group_Size / 2); - pipe_barrier(PIPE_ALL); - scale_global_offset += Group_Size / 2; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - scale_queue.FreeTensor(scale_local); - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue min_queue; - TQue scale_queue; - TQue cast_queue; - TQue int8_queue; - TQue half_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt deleted file mode 100644 index c9109d5e8ee..00000000000 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ /dev/null @@ -1,166 +0,0 @@ - -find_package(Vulkan COMPONENTS glslc REQUIRED) -find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) - -if (NOT glslc_executable) - message(FATAL_ERROR "glslc not found") -endif() - -ggml_add_backend_library(ggml-kompute - ggml-kompute.cpp - ../../include/ggml-kompute.h - ) - -target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) -target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - -add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) - -function(compile_shader) - set(options) - set(oneValueArgs) - set(multiValueArgs SOURCES) - cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(source ${compile_shader_SOURCES}) - get_filename_component(filename ${source} NAME) - set(spv_file ${filename}.spv) - add_custom_command( - OUTPUT ${spv_file} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} - COMMENT "Compiling ${source} to ${spv_file}" - ) - - get_filename_component(RAW_FILE_NAME ${spv_file} NAME) - set(FILE_NAME "shader${RAW_FILE_NAME}") - string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) - string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) - string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") - set(OUTPUT_HEADER_FILE "${HEADER_FILE}") - message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - if(CMAKE_GENERATOR MATCHES "Visual Studio") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" - ) - else() - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" - ) - endif() - endforeach() -endfunction() - -if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") - message(STATUS "Kompute found") - set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") - add_subdirectory(kompute) - - # Compile our shaders - compile_shader(SOURCES - kompute-shaders/op_scale.comp - kompute-shaders/op_scale_8.comp - kompute-shaders/op_add.comp - kompute-shaders/op_addrow.comp - kompute-shaders/op_mul.comp - kompute-shaders/op_silu.comp - kompute-shaders/op_relu.comp - kompute-shaders/op_gelu.comp - kompute-shaders/op_softmax.comp - kompute-shaders/op_norm.comp - kompute-shaders/op_rmsnorm.comp - kompute-shaders/op_diagmask.comp - kompute-shaders/op_mul_mat_mat_f32.comp - kompute-shaders/op_mul_mat_f16.comp - kompute-shaders/op_mul_mat_q8_0.comp - kompute-shaders/op_mul_mat_q4_0.comp - kompute-shaders/op_mul_mat_q4_1.comp - kompute-shaders/op_mul_mat_q4_k.comp - kompute-shaders/op_mul_mat_q6_k.comp - kompute-shaders/op_getrows_f32.comp - kompute-shaders/op_getrows_f16.comp - kompute-shaders/op_getrows_q4_0.comp - kompute-shaders/op_getrows_q4_1.comp - kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_norm_f16.comp - kompute-shaders/op_rope_norm_f32.comp - kompute-shaders/op_rope_neox_f16.comp - kompute-shaders/op_rope_neox_f32.comp - kompute-shaders/op_cpy_f16_f16.comp - kompute-shaders/op_cpy_f16_f32.comp - kompute-shaders/op_cpy_f32_f16.comp - kompute-shaders/op_cpy_f32_f32.comp - ) - - # Create a custom target for our generated shaders - add_custom_target(generated_shaders DEPENDS - shaderop_scale.h - shaderop_scale_8.h - shaderop_add.h - shaderop_addrow.h - shaderop_mul.h - shaderop_silu.h - shaderop_relu.h - shaderop_gelu.h - shaderop_softmax.h - shaderop_norm.h - shaderop_rmsnorm.h - shaderop_diagmask.h - shaderop_mul_mat_mat_f32.h - shaderop_mul_mat_f16.h - shaderop_mul_mat_q8_0.h - shaderop_mul_mat_q4_0.h - shaderop_mul_mat_q4_1.h - shaderop_mul_mat_q4_k.h - shaderop_mul_mat_q6_k.h - shaderop_getrows_f32.h - shaderop_getrows_f16.h - shaderop_getrows_q4_0.h - shaderop_getrows_q4_1.h - shaderop_getrows_q6_k.h - shaderop_rope_norm_f16.h - shaderop_rope_norm_f32.h - shaderop_rope_neox_f16.h - shaderop_rope_neox_f32.h - shaderop_cpy_f16_f16.h - shaderop_cpy_f16_f32.h - shaderop_cpy_f32_f16.h - shaderop_cpy_f32_f32.h - ) - - # Create a custom command that depends on the generated_shaders - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - DEPENDS generated_shaders - COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" - ) - - # Add the stamp to the main sources to ensure dependency tracking - target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) -else() - message(WARNING "Kompute not found") -endif() diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp deleted file mode 100644 index 50579227183..00000000000 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ /dev/null @@ -1,2251 +0,0 @@ -#include "ggml-impl.h" -#include "ggml-backend.h" -#include "ggml-backend-impl.h" -#include "ggml-kompute.h" - -// These are generated at build time by cmake custom command -#include "shaderop_scale.h" -#include "shaderop_scale_8.h" -#include "shaderop_add.h" -#include "shaderop_addrow.h" -#include "shaderop_mul.h" -#include "shaderop_silu.h" -#include "shaderop_relu.h" -#include "shaderop_gelu.h" -#include "shaderop_softmax.h" -#include "shaderop_norm.h" -#include "shaderop_rmsnorm.h" -#include "shaderop_diagmask.h" -#include "shaderop_mul_mat_f16.h" -#include "shaderop_mul_mat_q8_0.h" -#include "shaderop_mul_mat_q4_0.h" -#include "shaderop_mul_mat_q4_1.h" -#include "shaderop_mul_mat_q4_k.h" -#include "shaderop_mul_mat_q6_k.h" -#include "shaderop_mul_mat_mat_f32.h" -#include "shaderop_getrows_f32.h" -#include "shaderop_getrows_f16.h" -#include "shaderop_getrows_q4_0.h" -#include "shaderop_getrows_q4_1.h" -#include "shaderop_getrows_q6_k.h" -#include "shaderop_rope_norm_f16.h" -#include "shaderop_rope_norm_f32.h" -#include "shaderop_rope_neox_f16.h" -#include "shaderop_rope_neox_f32.h" -#include "shaderop_cpy_f16_f16.h" -#include "shaderop_cpy_f16_f32.h" -#include "shaderop_cpy_f32_f16.h" -#include "shaderop_cpy_f32_f32.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef __linux__ -#include // for setenv -#endif - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 -#define QK_NL 16 - -typedef ggml_fp16_t half; - -static std::string ggml_kompute_format_name(int device) { - return "Kompute" + std::to_string(device); -} - -struct ggml_kompute_context { - int device; - std::string name; - std::shared_ptr pool; - - ggml_kompute_context(int device) - : device(device), name(ggml_kompute_format_name(device)) {} -}; - -// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object -// and consolidate the init functions and simplify object lifetime management. As it currently stands, -// we *have* to have the kompute manager no matter what for device discovery, but the kompute context -// is only created when a device is set and vulkan is explicitly turned on. -static ggml_kompute_context *s_kompute_context = nullptr; - -class kompute_manager { - kp::Manager *s_mgr = nullptr; - -public: - kp::Manager *operator()() { - if (s_mgr && !s_mgr->hasInstance()) { - destroy(); - } - if (!s_mgr) { - s_mgr = new kp::Manager; - } - return s_mgr; - } - - void destroy() { - delete s_mgr; - s_mgr = nullptr; - } -}; - -static kompute_manager komputeManager; - -struct ggml_vk_memory { - void *data = nullptr; - size_t size = 0; - vk::DeviceMemory *primaryMemory = nullptr; - vk::Buffer *primaryBuffer = nullptr; - vk::DeviceMemory *stagingMemory = nullptr; - vk::Buffer *stagingBuffer = nullptr; -}; - -#ifdef __linux__ -__attribute__((constructor)) -static void enable_sam() { - setenv("RADV_PERFTEST", "sam", false); -} -#endif - -static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) { - vk::PhysicalDeviceFeatures availableFeatures; - physical_device.getFeatures(&availableFeatures); - - if (!availableFeatures.shaderInt16) - return false; - - vk::PhysicalDeviceVulkan11Features availableFeatures11; - vk::PhysicalDeviceVulkan12Features availableFeatures12; - - availableFeatures11.pNext = &availableFeatures12; - availableFeatures12.pNext = nullptr; - - vk::PhysicalDeviceFeatures2 features2; - features2.pNext = &availableFeatures11; - - physical_device.getFeatures2(&features2); - - if (!availableFeatures11.uniformAndStorageBuffer16BitAccess || - !availableFeatures11.storageBuffer16BitAccess) { - return false; - } - - if (!availableFeatures12.storageBuffer8BitAccess || - !availableFeatures12.uniformAndStorageBuffer8BitAccess || - !availableFeatures12.shaderFloat16 || - !availableFeatures12.shaderInt8) { - return false; - } - - return true; -} - -static const char * ggml_vk_getVendorName(uint32_t vendorID) { - switch (vendorID) { - case 0x10DE: - return "nvidia"; - case 0x1002: - return "amd"; - case 0x8086: - return "intel"; - default: - return "unknown"; - } -} - -static std::vector ggml_vk_available_devices_internal(size_t memoryRequired) { - std::vector results; - if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance()) - return results; - - std::vector physical_devices; - try { - physical_devices = komputeManager()->listDevices(); - } catch (vk::SystemError & err) { - std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n"; - return results; - } - - uint32_t deviceCount = physical_devices.size(); - if (deviceCount == 0) - return results; - - std::unordered_map count_by_name; - - for (uint32_t i = 0; i < deviceCount; i++) { - const auto & physical_device = physical_devices[i]; - - VkPhysicalDeviceProperties dev_props = physical_device.getProperties(); - VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties(); - const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion); - const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion); - if (major < 1 || minor < 2) - continue; - - if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device)) - continue; - - size_t heapSize = 0; - for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) { - VkMemoryHeap heap = memoryProperties.memoryHeaps[j]; - if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { - heapSize = heap.size; - break; - } - } - - if (heapSize < memoryRequired) - continue; - - auto ext_props = physical_device.enumerateDeviceExtensionProperties(); - bool has_maintenance4 = false; - - // Check if maintenance4 is supported - for (const auto & properties : ext_props) { - if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { - has_maintenance4 = true; - } - } - - vk::PhysicalDeviceSubgroupProperties subgroup_props; - vk::PhysicalDeviceProperties2 dev_props2; - vk::PhysicalDeviceMaintenance3Properties dev_props3; - vk::PhysicalDeviceMaintenance4Properties dev_props4; - dev_props2.pNext = &dev_props3; - dev_props3.pNext = &subgroup_props; - if (has_maintenance4) { - subgroup_props.pNext = &dev_props4; - } - physical_device.getProperties2(&dev_props2); - - if (subgroup_props.subgroupSize < 32) - continue; - - ggml_vk_device d; - d.index = i; - d.type = dev_props.deviceType; - d.heapSize = heapSize; - d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID)); - d.subgroupSize = subgroup_props.subgroupSize; - d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment; - - if (has_maintenance4) { - d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize); - } else { - d.maxAlloc = dev_props3.maxMemoryAllocationSize; - } - - std::string name(dev_props.deviceName); - size_t n_idx = ++count_by_name[name]; - if (n_idx > 1) { - name += " (" + std::to_string(n_idx) + ")"; - } - d.name = strdup(name.c_str()); - - results.push_back(d); - } - - std::stable_sort(results.begin(), results.end(), - [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool { - if (lhs.type != rhs.type) { - if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true; - if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false; - - if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true; - if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false; - } - return lhs.heapSize < rhs.heapSize; - } - ); - - return results; -} - -static std::vector& ggml_vk_available_devices() { - static std::vector devices = ggml_vk_available_devices_internal(0); - return devices; -} - -static void ggml_vk_filterByVendor(std::vector& devices, const std::string& targetVendor) { - devices.erase( - std::remove_if(devices.begin(), devices.end(), - [&targetVendor](const ggml_vk_device& device) { - return device.vendor != targetVendor; - }), - devices.end() - ); -} - -static void ggml_vk_filterByName(std::vector& devices, const std::string& targetName) { - devices.erase( - std::remove_if(devices.begin(), devices.end(), - [&targetName](const ggml_vk_device& device) { - return device.name != targetName; - }), - devices.end() - ); -} - -static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) { - if (name.empty()) - return false; - - auto devices = ggml_vk_available_devices_internal(memoryRequired); - if (name == "amd" || name == "nvidia" || name == "intel") { - ggml_vk_filterByVendor(devices, name); - } else if (name != "gpu") { - ggml_vk_filterByName(devices, name); - } - - if (devices.empty()) - return false; - - *device = devices.front(); - return true; -} - -bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) { - return ggml_vk_get_device(device, memoryRequired, std::string(name)); -} - -bool ggml_vk_has_vulkan() { - return komputeManager()->hasVulkan(); -} - -bool ggml_vk_has_device() { - return komputeManager()->hasDevice(); -} - -ggml_vk_device ggml_vk_current_device() { - if (!komputeManager()->hasDevice()) - return ggml_vk_device(); - - auto devices = ggml_vk_available_devices(); - ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data()); - GGML_ASSERT(!devices.empty()); - return devices.front(); -} - -static -void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) { - std::vector descriptorPoolSizes = { - vk::DescriptorPoolSize( - vk::DescriptorType::eStorageBuffer, - 4 * size // Descriptor count is number of possible tensors to pass into an algorithm - ) - }; - - vk::DescriptorPoolCreateInfo descriptorPoolInfo( - vk::DescriptorPoolCreateFlags(), - size, // Max sets - static_cast(descriptorPoolSizes.size()), - descriptorPoolSizes.data()); - - ctx->pool = std::make_shared(); - vk::Result r = komputeManager()->device()->createDescriptorPool( - &descriptorPoolInfo, nullptr, ctx->pool.get()); - if (r != vk::Result::eSuccess) - std::cerr << "Error allocating descriptor pool" << vk::to_string(r); -} - -static -void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) { - if (ctx->pool) { - komputeManager()->device()->destroy( - *ctx->pool, - (vk::Optional)nullptr); - ctx->pool = nullptr; - } -} - -static -vk::Buffer *ggml_vk_allocate_buffer(size_t size) { - vk::BufferCreateInfo bufferCreateInfo; - bufferCreateInfo.size = size; - bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer | - vk::BufferUsageFlagBits::eTransferSrc | - vk::BufferUsageFlagBits::eTransferDst; - bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive; - - vk::Buffer *vkBuffer = new vk::Buffer; - vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); - if (r != vk::Result::eSuccess) - std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl; - return vkBuffer; -} - -static -vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) { - - uint32_t memoryTypeIndex = -1; - bool memoryTypeIndexFound = false; - vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties(); - for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { - const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i]; - const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex]; - if (memoryHeap.size < size) { - continue; - } - - if (requirements.memoryTypeBits & (1 << i)) { - if (((memoryProperties.memoryTypes[i]).propertyFlags & - flags) == flags) { - memoryTypeIndex = i; - memoryTypeIndexFound = true; - if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) { - *isHostVisible = true; - } - break; - } - } - } - if (!memoryTypeIndexFound) { - throw std::runtime_error( - "Memory type index for buffer creation not found"); - } - - vk::MemoryAllocateInfo allocInfo; - allocInfo.allocationSize = size; - allocInfo.memoryTypeIndex = memoryTypeIndex; - vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory; - vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); - if (r != vk::Result::eSuccess) { - std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl; - throw std::runtime_error("Error allocating vulkan memory."); - } - return vkDeviceMemory; -} - -static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) { - size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer); - - // If offset is already aligned, return it directly - if (offset % minStorageBufferOffsetAlignment == 0) { - return offset; - } - - // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset - return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment; -} - -static ggml_vk_memory ggml_vk_allocate(size_t size) { - ggml_vk_memory memory; - bool isHostVisible = false; - { - memory.primaryBuffer = ggml_vk_allocate_buffer(size); - vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer); - vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal; - memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); - komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0); - if (isHostVisible) { - vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data); - if (r != vk::Result::eSuccess) - std::cerr << "Error mapping memory" << vk::to_string(r); - } - } - - if (!isHostVisible) { - memory.stagingBuffer = ggml_vk_allocate_buffer(size); - vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer); - vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible | - vk::MemoryPropertyFlagBits::eHostCoherent | - vk::MemoryPropertyFlagBits::eHostCached; - memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); - komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0); - vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data); - if (r != vk::Result::eSuccess) - std::cerr << "Error mapping memory" << vk::to_string(r); - } - - memory.size = size; - return memory; -} - -static void ggml_vk_free_memory(ggml_vk_memory &memory) -{ - komputeManager()->device()->destroy( - *memory.primaryBuffer, - (vk::Optional)nullptr); - if (memory.stagingBuffer) { - komputeManager()->device()->destroy( - *memory.stagingBuffer, - (vk::Optional)nullptr); - } - komputeManager()->device()->freeMemory( - *memory.primaryMemory, - (vk::Optional)nullptr); - if (memory.stagingMemory) { - komputeManager()->device()->freeMemory( - *memory.stagingMemory, - (vk::Optional)nullptr); - } -} - -static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft); - -static -ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) { - ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; - - // compatibility with ggml-backend - GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name); - - ggml_vk_memory * buf_ctx = static_cast(buffer->context); - - const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data); - - GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size)); - - offset = uint64_t(ioffs); - return buf_ctx; -} - -static -const std::shared_ptr ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) { - uint64_t originalOffset = 0; - auto * res = ggml_vk_find_tensor(t, originalOffset); - if (!res) { - static std::shared_ptr nullTensor = nullptr; - return nullTensor; - } - - // Create a tensor whose memory will be composed of our buffers at the correct offset - const size_t nelements = ggml_nelements(t); - size_t nbytes = ggml_nbytes(t); - - size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset); - if (alignedOffset) { - *alignedOffset = originalOffset - vulkanOffset; - nbytes += *alignedOffset; - } - - return komputeManager()->tensor( - t->data, - nelements, - nbytes, kp::Tensor::TensorDataTypes::eFloat, - res->primaryMemory, res->primaryBuffer, - res->stagingMemory, res->stagingBuffer, - vulkanOffset); -} - -static std::vector getSpirvShader(const unsigned char* rawData, size_t size) { - if (size % sizeof(uint32_t) != 0) { - throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)"); - } - - const uint32_t* data_ptr = reinterpret_cast(rawData); - size_t count = size / sizeof(uint32_t); - return std::vector(data_ptr, data_ptr + count); -} - -inline static -uint32_t safe_divide(uint32_t a, uint32_t b) { - if (b <= 1) { - return a; - } - if ((a % b) != 0) { - fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); - GGML_ABORT("safe_divide result would've had remainder"); - } - return a / b; -} - -static void ggml_vk_add( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, - int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13, - int32_t ne0, - int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv, - kp::shader_data::op_add_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00; - int32_t nb00, nb01, nb02, nb03; - int32_t ne10, ne11, ne12, ne13; - int32_t nb10, nb11, nb12, nb13; - int32_t ne0; - int32_t nb0, nb1, nb2, nb3; - } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_addrow(kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - uint32_t size, uint32_t row = 0) { - - const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv, - kp::shader_data::op_addrow_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - uint32_t row; - } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - row - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, - int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13, - int32_t ne0, - int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv, - kp::shader_data::op_mul_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00; - int32_t nb00, nb01, nb02, nb03; - int32_t ne10, ne11, ne12, ne13; - int32_t nb10, nb11, nb12, nb13; - int32_t ne0; - int32_t nb0, nb1, nb2, nb3; - } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_scale(kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t size, float scale) { - const static auto spirv_1 = getSpirvShader( - kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len - ); - const static auto spirv_8 = getSpirvShader( - kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len - ); - - struct PushConstants { - uint32_t inOff, outOff; - float scale; - } const pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - scale - }; - - const auto * spirv = &spirv_1; - std::string name(__func__); - if (size % 8 == 0) { - size /= 8; - name += "_8"; - spirv = &spirv_8; - } - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_xxlu( - const std::vector& spirv, const char * suffix, kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t size -) { - struct PushConstants { - uint32_t inOff, outOff; - } const pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_silu(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv, - kp::shader_data::op_silu_comp_spv_len); - - ggml_vk_xxlu(spirv, "silu", std::forward(args)...); -} - -template -static void ggml_vk_relu(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv, - kp::shader_data::op_relu_comp_spv_len); - - ggml_vk_xxlu(spirv, "relu", std::forward(args)...); -} - -template -static void ggml_vk_gelu(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv, - kp::shader_data::op_gelu_comp_spv_len); - - ggml_vk_xxlu(spirv, "gelu", std::forward(args)...); -} - -static void ggml_vk_soft_max( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03, - float scale, float max_bias, float m0, float m1, - uint32_t n_head_log2 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv, - kp::shader_data::op_softmax_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02; - float scale, max_bias, m0, m1; - uint32_t n_head_log2; - int32_t mask; - } pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, - scale, max_bias, m0, m1, - n_head_log2, - bool(inB) - }; - - auto & inB_ = inB ? inB : inA; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device - const uint32_t local_x = 32; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB_, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_norm_( - const std::vector& spirv, const char * suffix, kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - int32_t ne00, int32_t nb01, - int32_t nrows, float epsilon -) { - GGML_ASSERT(nb01%sizeof(float) == 0); - GGML_ASSERT(ne00%sizeof(float) == 0); - - struct PushConstants { - uint32_t inOff, outOff; - uint32_t ne00, nb01; - float eps; - } pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - (uint32_t)ne00, (uint32_t)nb01, epsilon - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({(uint32_t)nrows}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_norm(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv, - kp::shader_data::op_norm_comp_spv_len); - - ggml_vk_norm_(spirv, "norm", std::forward(args)...); -} - -template -static void ggml_vk_rms_norm(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv, - kp::shader_data::op_rmsnorm_comp_spv_len); - - ggml_vk_norm_(spirv, "rms", std::forward(args)...); -} - -static void ggml_vk_diag_mask_inf(kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t n_past, - int32_t ne00, int32_t ne01, int32_t ne02) { - const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv, - kp::shader_data::op_diagmask_comp_spv_len); - - struct PushConstants { - uint32_t inOff, outOff; - uint32_t n_past; - int32_t ne00, ne01; - } pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - n_past, - ne00, ne01 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_f16( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13, - int32_t ne0, int32_t ne1, - uint32_t r2, uint32_t r3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv, - kp::shader_data::op_mul_mat_f16_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02; - uint32_t nb00, nb01, nb02, nb03; - int32_t ne10, ne11, ne12; - uint32_t nb10, nb11, nb12, nb13; - int32_t ne0, ne1; - uint32_t r2, r3; - } pushConsts { - safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, - nb10, nb11, nb12, nb13, - ne0, ne1, - r2, r3 - }; - - const unsigned ny = unsigned((ne11 + 4 - 1)/4); - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb01, uint32_t nb02, - int32_t ne11, int32_t ne12, - uint32_t nb11, uint32_t nb12, - uint32_t nb1, uint32_t nb2) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv, - kp::shader_data::op_mul_mat_mat_f32_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02, ne11, ne12; - uint32_t nb01, nb02; - uint32_t nb11, nb12; - uint32_t nb1, nb2; - } pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, ne11, ne12, - nb01, nb02, nb11, nb12, - nb1, nb2 - }; - - const uint32_t local_x = ggml_vk_current_device().subgroupSize; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), - {inA, inB, out}, spirv, - {unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)) - }, - {local_x}, - {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)), - }); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_impl( - const std::vector& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t ne0, int32_t ne1, - uint32_t nb01, uint32_t nb02, uint32_t nb03, - uint32_t nb11, uint32_t nb12, uint32_t nb13, - uint32_t r2, uint32_t r3 -) { - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02; - int32_t ne10, ne12; - int32_t ne0, ne1; - uint32_t nb01, nb02, nb03; - uint32_t nb11, nb12, nb13; - uint32_t r2, r3; - } pushConsts { - safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, - ne10, ne12, - ne0, ne1, - nb01, nb02, nb03, - nb11, nb12, nb13, - r2, r3 - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8; - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_mul_mat_q4_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv, - kp::shader_data::op_mul_mat_q4_0_comp_spv_len); - - ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward(args)...); -} - -template -static void ggml_vk_mul_mat_q4_1(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv, - kp::shader_data::op_mul_mat_q4_1_comp_spv_len); - - ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward(args)...); -} - -template -static void ggml_vk_mul_mat_q8_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv, - kp::shader_data::op_mul_mat_q8_0_comp_spv_len); - - ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward(args)...); -} - -static void ggml_vk_mul_mat_q4_k( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t ne0, int32_t ne1, - uint32_t nb01, uint32_t nb02, uint32_t nb03, - uint32_t nb11, uint32_t nb12, uint32_t nb13, - uint32_t r2, uint32_t r3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv, - kp::shader_data::op_mul_mat_q4_k_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; - uint32_t nb01, nb02, nb03, nb11, nb12, nb13; - uint32_t r2, r3; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, ne1, ne01, ne02, ne12, - nb01, nb02, nb03, nb11, nb12, nb13, - r2, r3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_q6_k( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t ne0, int32_t ne1, - uint32_t nb01, uint32_t nb02, uint32_t nb03, - uint32_t nb11, uint32_t nb12, uint32_t nb13, - uint32_t r2, uint32_t r3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, - kp::shader_data::op_mul_mat_q6_k_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; - uint32_t nb01, nb02, nb03, nb11, nb12, nb13; - uint32_t r2, r3; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, ne1, ne01, ne02, ne12, - nb01, nb02, nb03, nb11, nb12, nb13, - r2, r3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = 2; - const uint32_t local_y = ggml_vk_current_device().subgroupSize; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_get_rows( - const std::vector& spirv, - const char * suffix, - unsigned element_size, unsigned qk, - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t nb01, int32_t nb1, - uint32_t size -) { - GGML_ASSERT(nb01%element_size == 0); - GGML_ASSERT(nb1%sizeof(float) == 0); - if (qk) GGML_ASSERT(ne00%qk == 0); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, nb01, nb1; - } pushConsts { - safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, nb01, nb1 - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_get_rows_f32(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv, - kp::shader_data::op_getrows_f32_comp_spv_len); - - ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_f16(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv, - kp::shader_data::op_getrows_f16_comp_spv_len); - - ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_q4_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv, - kp::shader_data::op_getrows_q4_0_comp_spv_len); - - ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_q4_1(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv, - kp::shader_data::op_getrows_q4_1_comp_spv_len); - - ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_q6_k(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv, - kp::shader_data::op_getrows_q6_k_comp_spv_len); - ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward(args)...); -} - -static void ggml_vk_rope( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& inC, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff, - ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig, - float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow, - int32_t ne01, int32_t ne02, int32_t ne03, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne0, - uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3 -) { - GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32); - - static const auto spirv_norm_f16 = getSpirvShader( - kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len - ); - static const auto spirv_norm_f32 = getSpirvShader( - kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len - ); - static const auto spirv_neox_f16 = getSpirvShader( - kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len - ); - static const auto spirv_neox_f32 = getSpirvShader( - kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len - ); - - int type_size = src0t == GGML_TYPE_F16 ? 2 : 4; - - GGML_ASSERT(nb03 % type_size == 0); - GGML_ASSERT(nb02 % type_size == 0); - GGML_ASSERT(nb01 % type_size == 0); - GGML_ASSERT(nb00 % type_size == 0); - GGML_ASSERT(nb3 % type_size == 0); - GGML_ASSERT(nb2 % type_size == 0); - GGML_ASSERT(nb1 % type_size == 0); - GGML_ASSERT(nb0 % type_size == 0); - - struct PushConstants { - uint32_t inAOff, inBOff, inCOff, outOff; - int32_t n_dims, mode, n_ctx_orig; - float freq_base, freq_scale; - bool has_freq_factors; - float ext_factor, attn_factor, beta_fast, beta_slow; - uint32_t nb00, nb01, nb02, nb03; - int32_t ne0; - uint32_t nb0, nb1, nb2, nb3; - } pushConsts { - safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size), - n_dims, mode, n_ctx_orig, - freq_base, freq_scale, - has_freq_factors, - ext_factor, attn_factor, beta_fast, beta_slow, - nb00, nb01, nb02, nb03, - ne0, - nb0, nb1, nb2, nb3 - }; - - auto & inC_ = inC ? inC : inA; - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_f16 = src0t == GGML_TYPE_F16; - - auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32"); - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32; - s_algo = komputeManager()->algorithm( - name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv, - {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts} - ); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, inC_, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_cpy( - const std::vector& spirv, - uint32_t in_element_size, uint32_t out_element_size, - kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne0, int32_t ne1, int32_t ne2, - uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3 -) { - struct PushConstants { - uint32_t inOff, outOff; - int32_t ne00, ne01, ne02; - uint32_t nb00, nb01, nb02, nb03; - int32_t ne0, ne1, ne2; - uint32_t nb0, nb1, nb2, nb3; - } pushConsts { - safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size), - ne00, ne01, ne02, - nb00, nb01, nb02, nb03, - ne0, ne1, ne2, - nb0, nb1, nb2, nb3 - }; - - std::string name = std::string(__func__) - + "_i_" + std::to_string(in_element_size) - + "_o_" + std::to_string(out_element_size); - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_cpy_f32_f16(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv, - kp::shader_data::op_cpy_f32_f16_comp_spv_len); - ggml_vk_cpy(spirv, 4, 2, std::forward(args)...); -} - -template -static void ggml_vk_cpy_f32_f32(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv, - kp::shader_data::op_cpy_f32_f32_comp_spv_len); - ggml_vk_cpy(spirv, 4, 4, std::forward(args)...); -} - -template -static void ggml_vk_cpy_f16_f16(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv, - kp::shader_data::op_cpy_f16_f16_comp_spv_len); - ggml_vk_cpy(spirv, 2, 2, std::forward(args)...); -} - -template -static void ggml_vk_cpy_f16_f32(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv, - kp::shader_data::op_cpy_f16_f32_comp_spv_len); - ggml_vk_cpy(spirv, 2, 4, std::forward(args)...); -} - -static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - int64_t n = ggml_nelements(op); - switch (op->op) { - case GGML_OP_UNARY: - if (n % 4 != 0) return false; - switch (ggml_get_unary_op(op)) { - case GGML_UNARY_OP_GELU: - if (n % 8 != 0) return false; - // fall through - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_SILU: - return ggml_is_contiguous(op->src[0]); - default: - ; - } - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SCALE: - case GGML_OP_SOFT_MAX: - case GGML_OP_RMS_NORM: - case GGML_OP_NORM: - return true; - case GGML_OP_ROPE: - { - const int mode = ((const int32_t *) op->op_params)[2]; - if (mode & GGML_ROPE_TYPE_MROPE) { - return false; - } - if (mode & GGML_ROPE_TYPE_VISION) { - return false; - } - return true; - } - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - break; - default: - return false; - } - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - break; - default: - return false; - } - return true; - case GGML_OP_DIAG_MASK_INF: - return op->ne[3] == 1; - case GGML_OP_GET_ROWS: - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q6_K: - return op->ne[2] == 1 && op->ne[3] == 1; - default: - ; - } - return false; - case GGML_OP_MUL_MAT: - if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1])) - return false; - - switch (op->src[0]->type) { - case GGML_TYPE_F32: - return op->ne[3] == 1; - case GGML_TYPE_Q6_K: - case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: - return true; - default: - ; - } - default: - ; - } - return false; - - GGML_UNUSED(dev); -} - -static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { - const int n_seq = 8; - - // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting - // it to the size of the graph, but I think it can be made smaller? - ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes); - - std::vector> sequences(n_seq); - - for (auto& sequence : sequences) { - sequence = komputeManager()->sequence(); - } - for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { - const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; - - auto& seq = *sequences[seq_idx]; - - const int node_start = (seq_idx + 0) * n_nodes_per_seq; - const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes); - - bool any_commands_recorded = false; - - for (int i = node_start; i < node_end; ++i) { - struct ggml_tensor * src0 = gf->nodes[i]->src[0]; - struct ggml_tensor * src1 = gf->nodes[i]->src[1]; - struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2); - struct ggml_tensor * dst = gf->nodes[i]; - GGML_ASSERT(dst->data != nullptr); - - if (ggml_is_empty(dst)) { - continue; - } - - switch (dst->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - continue; // noop -> next node - default: - break; - } - - any_commands_recorded = true; - - const int32_t ne00 = src0 ? src0->ne[0] : 0; - const int32_t ne01 = src0 ? src0->ne[1] : 0; - const int32_t ne02 = src0 ? src0->ne[2] : 0; - const int32_t ne03 = src0 ? src0->ne[3] : 0; - - const uint32_t nb00 = src0 ? src0->nb[0] : 0; - const uint32_t nb01 = src0 ? src0->nb[1] : 0; - const uint32_t nb02 = src0 ? src0->nb[2] : 0; - const uint32_t nb03 = src0 ? src0->nb[3] : 0; - - const int32_t ne10 = src1 ? src1->ne[0] : 0; - const int32_t ne11 = src1 ? src1->ne[1] : 0; - const int32_t ne12 = src1 ? src1->ne[2] : 0; - const int32_t ne13 = src1 ? src1->ne[3] : 0; - - const uint32_t nb10 = src1 ? src1->nb[0] : 0; - const uint32_t nb11 = src1 ? src1->nb[1] : 0; - const uint32_t nb12 = src1 ? src1->nb[2] : 0; - const uint32_t nb13 = src1 ? src1->nb[3] : 0; - - const int32_t ne0 = dst ? dst->ne[0] : 0; - const int32_t ne1 = dst ? dst->ne[1] : 0; - const int32_t ne2 = dst ? dst->ne[2] : 0; -// const int32_t ne3 = dst ? dst->ne[3] : 0; - - const uint32_t nb0 = dst ? dst->nb[0] : 0; - const uint32_t nb1 = dst ? dst->nb[1] : 0; - const uint32_t nb2 = dst ? dst->nb[2] : 0; - const uint32_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - const static std::shared_ptr nullTensor = nullptr; - uint32_t off_src0 = 0; - uint32_t off_src1 = 0; - uint32_t off_src2 = 0; - uint32_t off_dst = 0; - const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor; - const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor; - const std::shared_ptr& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor; - const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor; - - switch (dst->op) { - case GGML_OP_ADD: - { - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { - // src1 is a row - ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); - } else { - ggml_vk_add( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne03, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - ); - } - } break; - case GGML_OP_MUL: - { - ggml_vk_mul( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne03, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - ); - } break; - case GGML_OP_SCALE: - { - float scale; memcpy(&scale, dst->op_params, sizeof(float)); - - ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); - } break; - case GGML_OP_UNARY: - { - int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - switch (ggml_get_unary_op(gf->nodes[i])) { - case GGML_UNARY_OP_SILU: - { - ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4); - } break; - case GGML_UNARY_OP_RELU: - { - ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4); - } break; - case GGML_UNARY_OP_GELU: - { - GGML_ASSERT(n % 8 == 0); - ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8); - } break; - default: - { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ABORT("fatal error"); - } - } - } break; - case GGML_OP_SOFT_MAX: - { - float scale; - float max_bias; - - memcpy(&scale, (float *)dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float)); - -#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support") -#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") - GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); - - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; - - const uint32_t n_head = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2); - } break; - case GGML_OP_DIAG_MASK_INF: - { - const int n_past = ((int32_t *)(dst->op_params))[0]; - ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02); - } break; - case GGML_OP_NORM: - { - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); - } break; - case GGML_OP_RMS_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); - } break; - case GGML_OP_MUL_MAT: - { - GGML_ASSERT(ne00 == ne10); - - GGML_ASSERT(ne12 % ne02 == 0); - GGML_ASSERT(ne13 % ne03 == 0); - - const uint32_t r2 = ne12/ne02; - const uint32_t r3 = ne13/ne03; - - if (src1t != GGML_TYPE_F32) { - fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - - if (ggml_is_transposed(src0) || - ggml_is_transposed(src1)) { - fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - - switch (src0t) { - case GGML_TYPE_F32: - ggml_vk_mul_mat_mat_f32( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2 - ); - break; - case GGML_TYPE_F16: - ggml_vk_mul_mat_f16( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13, - ne0, ne1, r2, r3 - ); - break; - case GGML_TYPE_Q8_0: - ggml_vk_mul_mat_q8_0( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q4_0: - ggml_vk_mul_mat_q4_0( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q4_1: - ggml_vk_mul_mat_q4_1( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q4_K: - ggml_vk_mul_mat_q4_k( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q6_K: - ggml_vk_mul_mat_q6_k( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - default: { - fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - } - - } break; - case GGML_OP_GET_ROWS: - { - if (src0t == GGML_TYPE_F32) { - ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_F16) { - ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_Q4_0) { - ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_Q4_1) { - ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_Q6_K) { - ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else { - fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t); - goto not_implemented; - } - } break; - case GGML_OP_ROPE: - { - GGML_ASSERT(ne10 == ne02); - GGML_ASSERT(src0t == dstt); - // const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - const bool has_freq_factors = dst->src[2] != nullptr; - - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - ggml_vk_rope( - seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig, - freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow, - ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 - ); - } break; - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - { - switch (src0t) { - case GGML_TYPE_F32: - { - switch (dstt) { - case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - default: goto not_implemented; - } - } break; - case GGML_TYPE_F16: - { - switch (dstt) { - case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - default: goto not_implemented; - } break; - default: goto not_implemented; - } - } - } break; - default: goto not_implemented; - } - continue; - not_implemented: {} - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - //GGML_ABORT("fatal error"); - } - - // Evaluate sequence - if (any_commands_recorded) { - seq.evalAsync(); - } - } - - // Wait for all sequences to finish - for (auto& sequence : sequences) { - if (sequence->isRunning()) - sequence->evalAwait(); - } - - ggml_vk_free_descriptor_pool(ctx); -} - -template<> -kp::Tensor::TensorDataTypes -kp::TensorT::dataType() -{ - return TensorDataTypes::eFloat; -} - -template<> -kp::Tensor::TensorDataTypes -kp::TensorT::dataType() -{ - return TensorDataTypes::eUnsignedInt; -} - -//////////////////////////////////////////////////////////////////////////////// - -// backend interface - -struct ggml_backend_kompute_buffer_type_context { - int device; - int device_ref = 0; - uint64_t buffer_alignment; - uint64_t max_alloc; - std::string name; - - ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc) - : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {} -}; - -static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - - if (!ctx->device_ref) { - komputeManager()->initializeDevice( - ctx->device, {}, { - "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", - "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info" - } - ); - } - - assert(ggml_vk_has_device()); - ctx->device_ref++; -} - -static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - - assert(ctx->device_ref > 0); - - ctx->device_ref--; - - if (!ctx->device_ref) { - komputeManager.destroy(); - } -} - -static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { - auto * memory = (ggml_vk_memory *)buffer->context; - if (ggml_vk_has_device()) { - ggml_vk_free_memory(*memory); - } - delete memory; -} - -static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { - return ((ggml_vk_memory *)buffer->context)->data; -} - -static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_UNUSED(buffer); - - const auto res = ggml_vk_get_tensor(tensor); - GGML_ASSERT(res); - - memcpy((char *)tensor->data + offset, data, size); - - komputeManager()->sequence()->eval({res}); -} - -static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_UNUSED(buffer); - - const auto res = ggml_vk_get_tensor(tensor); - GGML_ASSERT(res); - - komputeManager()->sequence()->eval({res}); - - memcpy(data, (const char *)tensor->data + offset, size); -} - -static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - auto * memory = (ggml_vk_memory *)buffer->context; - memset(memory->data, value, buffer->size); - - if (memory->stagingBuffer) - komputeManager()->sequence()->eval(memory->primaryBuffer, memory->stagingBuffer, memory->size); -} - -static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { - /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, - /* .get_base = */ ggml_backend_kompute_buffer_get_base, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_kompute_buffer_clear, - /* .reset = */ NULL, -}; - -// default buffer type - -static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - return ctx->name.c_str(); -} - -static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_kompute_device_ref(buft); - auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size)); - return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size); -} - -static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - return ctx->buffer_alignment; -} - -static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - return ctx->max_alloc; -} - -static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { - /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, -}; - -ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - auto devices = ggml_vk_available_devices(); - int32_t device_count = (int32_t) devices.size(); - GGML_ASSERT(device < device_count); - GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES); - - static ggml_backend_buffer_type - ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES]; - - static bool ggml_backend_kompute_buffer_type_initialized = false; - - if (!ggml_backend_kompute_buffer_type_initialized) { - for (int32_t i = 0; i < device_count; i++) { - ggml_backend_kompute_buffer_types[i] = { - /* .iface = */ ggml_backend_kompute_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i), - /* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc }, - }; - } - ggml_backend_kompute_buffer_type_initialized = true; - } - - return &ggml_backend_kompute_buffer_types[device]; -} - -// backend - -static const char * ggml_backend_kompute_name(ggml_backend_t backend) { - auto * ctx = static_cast(backend->context); - return ctx->name.c_str(); -} - -static void ggml_backend_kompute_free(ggml_backend_t backend) { - auto * ctx = static_cast(backend->context); - - assert(ctx == s_kompute_context); - s_kompute_context = nullptr; - if (ctx != nullptr) { - delete ctx; - } - - delete backend; -} - -static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - auto * ctx = static_cast(backend->context); - ggml_vk_graph_compute(ctx, cgraph); - return GGML_STATUS_SUCCESS; -} - -static struct ggml_backend_i kompute_backend_i = { - /* .get_name = */ ggml_backend_kompute_name, - /* .free = */ ggml_backend_kompute_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_kompute_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_kompute_guid() { - static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca }; - return &guid; -} - -ggml_backend_t ggml_backend_kompute_init(int device) { - GGML_ASSERT(s_kompute_context == nullptr); - s_kompute_context = new ggml_kompute_context(device); - - ggml_backend_t kompute_backend = new ggml_backend { - /* .guid = */ ggml_backend_kompute_guid(), - /* .interface = */ kompute_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device), - /* .context = */ s_kompute_context, - }; - - return kompute_backend; -} - -bool ggml_backend_is_kompute(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid()); -} - -static size_t ggml_backend_kompute_get_device_count() { - auto devices = ggml_vk_available_devices(); - return devices.size(); -} - -static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) { - auto devices = ggml_vk_available_devices(); - GGML_ASSERT((size_t) device < devices.size()); - snprintf(description, description_size, "%s", devices[device].name); -} - -static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) { - auto devices = ggml_vk_available_devices(); - GGML_ASSERT((size_t) device < devices.size()); - *total = devices[device].heapSize; - *free = devices[device].heapSize; -} - -////////////////////////// - -struct ggml_backend_kompute_device_context { - int device; - std::string name; - std::string description; -}; - -static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ctx->name.c_str(); -} - -static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ctx->description.c_str(); -} - -static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - ggml_backend_kompute_get_device_memory(ctx->device, free, total); -} - -static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ggml_backend_kompute_buffer_type(ctx->device); -} - -static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) { - return false; - } - - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context; - - return buft_ctx->device == ctx->device; -} - -static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) { - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; -} - -static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_kompute_device_get_name(dev); - props->description = ggml_backend_kompute_device_get_description(dev); - props->type = ggml_backend_kompute_device_get_type(dev); - ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* async = */ false, - /* host_buffer = */ false, - /* .buffer_from_host_ptr = */ false, - /* events = */ false, - }; -} - -static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) { - GGML_UNUSED(params); - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ggml_backend_kompute_init(ctx->device); -} - -static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; - - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i ggml_backend_kompute_device_i = { - /* .get_name = */ ggml_backend_kompute_device_get_name, - /* .get_description = */ ggml_backend_kompute_device_get_description, - /* .get_memory = */ ggml_backend_kompute_device_get_memory, - /* .get_type = */ ggml_backend_kompute_device_get_type, - /* .get_props = */ ggml_backend_kompute_device_get_props, - /* .init_backend = */ ggml_backend_kompute_device_init, - /* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_kompute_device_supports_op, - /* .supports_buft = */ ggml_backend_kompute_device_supports_buft, - /* .offload_op = */ ggml_backend_kompute_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return "Kompute"; -} - -static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return ggml_backend_kompute_get_device_count(); -} - -static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - - static bool initialized = false; - - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) { - ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context; - char desc[256]; - ggml_backend_kompute_get_device_description(i, desc, sizeof(desc)); - ctx->device = i; - ctx->name = "Kompute" + std::to_string(i); - ctx->description = desc; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_kompute_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } - } - - GGML_ASSERT(device < devices.size()); - return devices[device]; -} - -static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { - /* .get_name = */ ggml_backend_kompute_reg_get_name, - /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count, - /* .get_device = */ ggml_backend_kompute_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_kompute_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_kompute_reg_i, - /* .context = */ nullptr, - }; - - return ® -} - -GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg) diff --git a/ggml/src/ggml-kompute/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp deleted file mode 100644 index dbe4cf804e6..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/common.comp +++ /dev/null @@ -1,112 +0,0 @@ -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int64: require -#extension GL_EXT_control_flow_attributes: enable -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -#define QK4_0 32 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 -#define TWOPI_F 6.283185307179586f - -#define QK_K 256 -#define K_SCALE_SIZE 12 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -mat4 dequantize_q4_0(const block_q4_0 xb, uint il) { - const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; - const float d2 = d1 / 256.f; - const float md = -8.f * xb.d; - const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); - const uint16_t mask1 = mask0 << 8; - - mat4 reg; - for (int i=0;i<8;i++) { - uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); - reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md; - } - return reg; -} - -#define sizeof_block_q4_1 0x14 -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; -mat4 dequantize_q4_1(const block_q4_1 xb, uint il) { - const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; - const float d2 = d1 / 256.f; - const float m = xb.m; - const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); - const uint16_t mask1 = mask0 << 8; - - mat4 reg; - for (int i=0;i<8;i++) { - uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); - reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m; - } - return reg; -} - -#define sizeof_block_q4_k 144 -struct block_q4_k { - float16_t d; - float16_t dmin; - uint8_t scales[K_SCALE_SIZE]; - uint8_t qs[QK_K/2]; -}; - -#define sizeof_block_q6_k 210 -struct block_q6_k { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -mat4 dequantize_q6_k(const block_q6_k xb, uint il) { - const float16_t d_all = xb.d; - - const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1); - const uint qhIndex = 32*(il/8) + 16*(il&1); - float16_t sc = xb.scales[(il%2) + 2 * ((il/2))]; - il = (il/2) & 3; - - const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3); - const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F); - const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f); - const float16_t ml = float16_t(d_all * sc * 32.f); - const float16_t dl = float16_t(d_all * sc * coef); - mat4 reg; - for (int i = 0; i < 16; ++i) { - const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2)) - : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4)); - reg[i/4][i%4] = dl * q - ml; - } - return reg; -} - - -#define QK8_0 32 -// struct block_q8_0 { -// float16_t d; // delta -// int8_t qs[QK8_0]; // quants -// }; -#define sizeof_block_q8_0 34 diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_add.comp b/ggml/src/ggml-kompute/kompute-shaders/op_add.comp deleted file mode 100644 index b7b76a79dbd..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +++ /dev/null @@ -1,58 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1024) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb00; - int nb01; - int nb02; - int nb03; - int ne10; - int ne11; - int ne12; - int ne13; - int nb10; - int nb11; - int nb12; - int nb13; - int ne0; - int nb0; - int nb1; - int nb2; - int nb3; - //int offs; // TODO: needed for GGML_OP_ACC, see metal code -} pcs; - -// general-purpose kernel for addition of two tensors -// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 -// cons: not very efficient -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const uint i13 = i03 % pcs.ne13; - const uint i12 = i02 % pcs.ne12; - const uint i11 = i01 % pcs.ne11; - - int offs = 0; // TMP (see above) - - uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4); - uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4); - uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4); - - for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { - const uint i10 = i0 % pcs.ne10; - out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp b/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp deleted file mode 100644 index 2376a6b8f03..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +++ /dev/null @@ -1,25 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - uint row; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; - - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp deleted file mode 100644 index d57247d2dcc..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float16_t -#define IN_TYPE_SIZE 2 -#define OUT_TYPE float16_t -#define OUT_TYPE_SIZE 2 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp deleted file mode 100644 index b568bcd7b26..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float16_t -#define IN_TYPE_SIZE 2 -#define OUT_TYPE float -#define OUT_TYPE_SIZE 4 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp deleted file mode 100644 index 99b22834308..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float -#define IN_TYPE_SIZE 4 -#define OUT_TYPE float16_t -#define OUT_TYPE_SIZE 2 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp deleted file mode 100644 index 2fc998492b7..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float -#define IN_TYPE_SIZE 4 -#define OUT_TYPE float -#define OUT_TYPE_SIZE 4 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp b/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp deleted file mode 100644 index 291c3fc1897..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +++ /dev/null @@ -1,30 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - uint n_past; - int ne00; - int ne01; -} pcs; - -void main() { - const uint i02 = gl_WorkGroupID.z; - const uint i01 = gl_WorkGroupID.y; - const uint i00 = gl_WorkGroupID.x; - - const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; - - if (i00 > pcs.n_past + i01) { - out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); - } else { - out_[index + pcs.outOff] = in_[index + pcs.inOff]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp deleted file mode 100644 index 9d8c53710af..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 8; - - for (uint x = 0; x < 8; x++) { - const uint i = baseIndex + x; - const float y = in_[i + pcs.inOff]; - out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0))); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp deleted file mode 100644 index 1a5581b23a9..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +++ /dev/null @@ -1,17 +0,0 @@ -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - int z = 0; - for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { - const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; - const mat4 result = dequantize_block(inIndex, ind%NL); - for (uint j = 0; j < 4; ++j) { - for (uint k = 0; k < 4; ++k) { - const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; - out_[outIndex] = result[j][k]; - ++z; - } - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp deleted file mode 100644 index 48c93610811..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +++ /dev/null @@ -1,31 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { - for (int j = 0; j < k; j++) { - out_[y + j] = inA[x + j]; - } -} - -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp deleted file mode 100644 index 9d7acdaf8a8..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +++ /dev/null @@ -1,31 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { - for (int j = 0; j < k; j++) { - out_[y + j] = inA[x + j]; - } -} - -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp deleted file mode 100644 index 32b2e891e8f..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +++ /dev/null @@ -1,38 +0,0 @@ -#version 450 - -#include "common.comp" - -#define NL 2 -#define BYTES_FOR_TYPE 4 /*bytes for float*/ -#define SIZE_OF_BLOCK sizeof_block_q4_0 - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -block_q4_0 get_unaligned_block_q4_0(uint index) { - block_q4_0 fres; - fres.d = u8BufToFloat16(inA, index); - [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { - fres.qs[it] = inA[index+2+it]; - } - return fres; -} - -mat4 dequantize_block(uint index, uint il) { - const block_q4_0 block = get_unaligned_block_q4_0(index); - return dequantize_q4_0(block, il); -} - -#include "op_getrows.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp deleted file mode 100644 index 87f2fbe17bb..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +++ /dev/null @@ -1,39 +0,0 @@ -#version 450 - -#include "common.comp" - -#define NL 2 -#define BYTES_FOR_TYPE 4 /*bytes for float*/ -#define SIZE_OF_BLOCK sizeof_block_q4_1 - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -block_q4_1 get_unaligned_block_q4_1(uint index) { - block_q4_1 fres; - fres.d = u8BufToFloat16(inA, index); - fres.m = u8BufToFloat16(inA, index+2); - [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { - fres.qs[it] = inA[index+4+it]; - } - return fres; -} - -mat4 dequantize_block(uint index, uint il) { - const block_q4_1 block = get_unaligned_block_q4_1(index); - return dequantize_q4_1(block, il); -} - -#include "op_getrows.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp deleted file mode 100644 index 9ce3545d1ec..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +++ /dev/null @@ -1,44 +0,0 @@ -#version 450 - -#include "common.comp" - -#define NL 16 -#define BYTES_FOR_TYPE 4 /*bytes for float*/ -#define SIZE_OF_BLOCK sizeof_block_q6_k - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -block_q6_k get_unaligned_block_q6_k(uint index) { - block_q6_k fres; - [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { - fres.ql[it] = inA[index + it]; - } - [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { - fres.qh[it] = inA[index + QK_K/2 + it]; - } - [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { - fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); - } - fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); - return fres; -} - -mat4 dequantize_block(uint index, uint il) { - const block_q6_k block = get_unaligned_block_q6_k(index); - return dequantize_q6_k(block, il); -} - -#include "op_getrows.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp deleted file mode 100644 index c92647c4db1..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1024) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb00; - int nb01; - int nb02; - int nb03; - int ne10; - int ne11; - int ne12; - int ne13; - int nb10; - int nb11; - int nb12; - int nb13; - int ne0; - int nb0; - int nb1; - int nb2; - int nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const uint i13 = i03 % pcs.ne13; - const uint i12 = i02 % pcs.ne12; - const uint i11 = i01 % pcs.ne11; - - uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4); - uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4); - uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4); - - for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { - const uint i10 = i0 % pcs.ne10; - out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp deleted file mode 100644 index 0ab1b2fc20e..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +++ /dev/null @@ -1,69 +0,0 @@ -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require - -layout(local_size_x_id = 0) in; - -layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne10; - int ne11; - int ne12; - uint nb10; - uint nb11; - uint nb12; - uint nb13; - int ne0; - int ne1; - uint r2; - uint r3; -} pcs; - -#define N_F16_F32 4 - -void main() { - const uint r0 = gl_WorkGroupID.x; - const uint rb = gl_WorkGroupID.y*N_F16_F32; - const uint im = gl_WorkGroupID.z; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03; - - const uint x = offset0 / 2 + pcs.inAOff; // Based from inA - - for (uint row = 0; row < N_F16_F32; ++row) { - uint r1 = rb + row; - if (r1 >= pcs.ne11) { - break; - } - - const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; - - float sumf = 0; - for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { - sumf += float(inA[x+i]) * float(inB[y+i]); - } - - const float all_sum = subgroupAdd(sumf); - if (subgroupElect()) { - out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp deleted file mode 100644 index d1ca4ad6c25..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +++ /dev/null @@ -1,51 +0,0 @@ -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -// device subgroup size -layout (local_size_x_id = 0) in; - -layout(binding = 0) readonly buffer tensorInA { float inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - - -void main() { - uvec3 gid = gl_WorkGroupID; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - float sum = 0.0f; - for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { - sum += float(inA[x+i]) * float(inB[y+i]); - } - - const float all_sum = subgroupAdd(sum); - if (subgroupElect()) { - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp deleted file mode 100644 index b0cea8bbe67..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +++ /dev/null @@ -1,33 +0,0 @@ -#version 450 - -#include "common.comp" - -#define BLOCKS_IN_QUANT QK4_0 -#define SIZE_OF_BLOCK sizeof_block_q4_0 -#define N_ROWS 4 - -#include "op_mul_mv_q_n_pre.comp" - -// The q4_0 version of this function -float block_q_n_dot_y(uint block_index, uint yb, uint il) { - vec2 acc = vec2(0.0, 0.0); - const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; - float d = float(u8BufToFloat16(inA, index)); - float sumy = 0.0f; - for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { - const uint16_t b = u8BufToU16(inA, index + 2 + il + i); - - const float yl0 = inB[yb + i]; - const float yl1 = inB[yb + i + 1]; - const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; - const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; - - sumy += yl0 + yl1 + yl8 + yl9; - - acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); - acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); - } - return d * (sumy * -8.f + acc[0] + acc[1]); -} - -#include "op_mul_mv_q_n.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp deleted file mode 100644 index 8582c61a3be..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +++ /dev/null @@ -1,35 +0,0 @@ -#version 450 - -#include "common.comp" - -#define BLOCKS_IN_QUANT QK4_1 -#define SIZE_OF_BLOCK sizeof_block_q4_1 -#define N_ROWS 4 - -#include "op_mul_mv_q_n_pre.comp" - -// The q4_1 version of this function -float block_q_n_dot_y(uint block_index, uint yb, uint il) { - vec2 acc = vec2(0.0, 0.0); - const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; - float d = float(u8BufToFloat16(inA, index)); - float m = float(u8BufToFloat16(inA, index+2)); - - float sumy = 0.0f; - for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { - const uint16_t b = u8BufToU16(inA, index + 4 + il + i); - - const float yl0 = inB[yb + i]; - const float yl1 = inB[yb + i + 1]; - const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; - const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; - - sumy += yl0 + yl1 + yl8 + yl9; - - acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); - acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); - } - return d * (acc[0] + acc[1]) + sumy * m; -} - -#include "op_mul_mv_q_n.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp deleted file mode 100644 index a5752a3a006..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +++ /dev/null @@ -1,140 +0,0 @@ -#version 450 - -#include "common.comp" - -#define N_DST 4 -#define SIZE_OF_BLOCK sizeof_block_q4_k - -layout(local_size_x = 4) in; -layout(local_size_y = 8) in; -layout(local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne10; - int ne0; - int ne1; - int ne01; - int ne02; - int ne12; - uint nb01; - uint nb02; - uint nb03; - uint nb11; - uint nb12; - uint nb13; - uint r2; - uint r3; -} pcs; - -void main() { - const uint16_t kmask1 = uint16_t(0x3f3f); - const uint16_t kmask2 = uint16_t(0x0f0f); - const uint16_t kmask3 = uint16_t(0xc0c0); - - const uint ix = gl_SubgroupInvocationID/8; // 0...3 - const uint it = gl_SubgroupInvocationID%8; // 0...7 - const uint iq = it/4; // 0 or 1 - const uint ir = it%4; // 0...3 - - const uint nb = pcs.ne00/QK_K; - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint first_row = r0 * N_DST; - const uint ib_row = first_row * nb; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); - const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13; - - const uint xblk = offset0 + pcs.inAOff; - const uint y = (offset1 / 4) + pcs.inBOff; - - float yl[16]; - float yh[16]; - float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f}; - float all_sum = 0.f; - - uint y4 = y + ix * QK_K + 64 * iq + 8 * ir; - - for (uint ib = ix; ib < nb; ib += 4) { - const uint blk_idx = ib + xblk; - - float sumy[4] = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; ++i) { - yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0]; - yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8]; - yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0]; - yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8]; - } - - for (int row = 0; row < N_DST; row++) { - uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK); - - uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0); - uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); - uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4); - uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6); - uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8); - - uint16_t sc16[4]; - sc16[0] = sc_0 & kmask1; - sc16[1] = sc_2 & kmask1; - sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2); - sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2); - - float acc1[4] = {0.f, 0.f, 0.f, 0.f}; - float acc2[4] = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i); - uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i); - acc1[0] += yl[i+0] * (q1 & 0x000F); - acc1[1] += yl[i+1] * (q1 & 0x0F00); - acc1[2] += yl[i+8] * (q1 & 0x00F0); - acc1[3] += yl[i+9] * (q1 & 0xF000); - acc2[0] += yh[i+0] * (q2 & 0x000F); - acc2[1] += yh[i+1] * (q2 & 0x0F00); - acc2[2] += yh[i+8] * (q2 & 0x00F0); - acc2[3] += yh[i+9] * (q2 & 0xF000); - } - - uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF); - uint8_t sc8_1 = uint8_t(sc16[0] >> 8 ); - uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF); - uint8_t sc8_3 = uint8_t(sc16[1] >> 8 ); - uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF); - uint8_t sc8_5 = uint8_t(sc16[2] >> 8 ); - uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF); - uint8_t sc8_7 = uint8_t(sc16[3] >> 8 ); - - float dall = float(inA[blk_idx + row_idx].d); - float dmin = float(inA[blk_idx + row_idx].dmin); - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 + - (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f + - (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 + - (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) - - dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7); - } - - y4 += 4 * QK_K; - } - - for (int row = 0; row < N_DST; ++row) { - all_sum = subgroupAdd(sumf[row]); - if (subgroupElect()) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp deleted file mode 100644 index d331d1a7057..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +++ /dev/null @@ -1,106 +0,0 @@ -#version 450 - -#include "common.comp" - -#define SIZE_OF_BLOCK sizeof_block_q6_k - -layout(local_size_x_id = 0) in; -layout(local_size_y_id = 1) in; -layout(local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne10; - int ne0; - int ne1; - int ne01; - int ne02; - int ne12; - uint nb01; - uint nb02; - uint nb03; - uint nb11; - uint nb12; - uint nb13; - uint r2; - uint r3; -} pcs; - -void main() { - const uint8_t kmask1 = uint8_t(0x03); - const uint8_t kmask2 = uint8_t(0x0C); - const uint8_t kmask3 = uint8_t(0x30); - const uint8_t kmask4 = uint8_t(0xC0); - - const uint nb = pcs.ne00/QK_K; - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); - const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; - - float sumf = 0; - - // bits of invocation ID for gl_SubgroupSize=32: - // x x x x x - // 4 3 2 1 0 - // ( tid ) ix - // ip ( il ) - - const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes - const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0 - const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1 - const uint ip = tid/8; // first or second half of block (0 or 1) - const uint il = tid%8; // each half has 8 parts, one per scale - const uint n = 4; // 4 scales at a time (and 4 sums) - const uint l0 = n*il; // offset into half-block, 0..28 - const uint is = 8*ip + l0/16; // 0, 1, 8, 9 - - const uint y_offset = 128*ip + l0; - const uint q_offset_l = 64*ip + l0; - const uint q_offset_h = 32*ip + l0; - - for (uint i = ix; i < nb; i += block_stride) { - - const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; - - const uint qlIndex = q_offset_l; - const uint q2Index = qlIndex + QK_K/8; - const uint qhIndex = q_offset_h; - const uint y = yy + i * QK_K + y_offset; - - float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (uint l = 0; l < n; ++l) { - const uint8_t currentQ1 = inA[baseIndex + qlIndex + l]; - const uint8_t currentQ2 = inA[baseIndex + q2Index + l]; - const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l]; - - sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32); - sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32); - sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32); - sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32); - } - - float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16); - sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is])); - } - - const float tot = subgroupAdd(sumf); - if (subgroupElect()) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp deleted file mode 100644 index 34d015e90b8..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +++ /dev/null @@ -1,73 +0,0 @@ -#version 450 - -#include "common.comp" - -#include "op_mul_mv_q_n_pre.comp" - -#define SIZE_OF_D 2 - -#define N_DST 4 // each SIMD group works on 4 rows -#define N_SIMDGROUP 2 // number of SIMD groups in a thread group -#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 - -#define NB_Q8_0 8 - -void main() { - // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64 - if (gl_SubgroupInvocationID > 31) - return; - - const int nr = N_DST; - const int nsg = N_SIMDGROUP; - const int nw = N_SIMDWIDTH; - - const int nb = pcs.ne00/QK8_0; - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint first_row = (r0 * nsg + gl_SubgroupID) * nr; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); - - const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA - const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB - - float yl[NB_Q8_0]; - float sumf[N_DST]={0.f, 0.f, 0.f, 0.f}; - - const uint ix = gl_SubgroupInvocationID.x/4; - const uint il = gl_SubgroupInvocationID.x%4; - - uint yb = y + ix * QK8_0 + NB_Q8_0*il; - - // each thread in a SIMD group deals with NB_Q8_0 quants at a time - for (uint ib = ix; ib < nb; ib += nw/4) { - for (int i = 0; i < NB_Q8_0; ++i) { - yl[i] = inB[yb + i]; - } - - for (int row = 0; row < nr; row++) { - const uint block_offset = (ib+row*nb) * sizeof_block_q8_0; - float sumq = 0.f; - for (int iq = 0; iq < NB_Q8_0; ++iq) { - const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]); - sumq += qs_iq * yl[iq]; - } - const float16_t d = u8BufToFloat16(inA, x + block_offset); - sumf[row] += sumq*d; - } - - yb += NB_Q8_0 * nw; - } - - for (int row = 0; row < nr; ++row) { - const float tot = subgroupAdd(sumf[row]); - if (subgroupElect() && first_row + row < pcs.ne01) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp deleted file mode 100644 index a6517cc1f19..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +++ /dev/null @@ -1,52 +0,0 @@ -void main() { - // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64 - if (gl_SubgroupInvocationID > 31) - return; - - const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - // pointers to src0 rows - uint ax[N_ROWS]; - for (int row = 0; row < N_ROWS; ++row) { - const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); - - ax[row] = offset0 + pcs.inAOff; - } - - const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; - - float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; - - const uint ix = gl_SubgroupInvocationID/2; - const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2); - - uint yb = y + ix * BLOCKS_IN_QUANT + il; - - //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n", - // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, - // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); - - for (uint ib = ix; ib < nb; ib += 16) { - for (int row = 0; row < N_ROWS; row++) { - sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il); - } - - yb += BLOCKS_IN_QUANT * 16; - } - - for (int row = 0; row < N_ROWS; ++row) { - const float tot = subgroupAdd(sumf[row]); - if (first_row + row < pcs.ne01 && subgroupElect()) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp deleted file mode 100644 index a9a2f22180f..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +++ /dev/null @@ -1,28 +0,0 @@ -layout(local_size_x_id = 0) in; -layout(local_size_y = 8) in; -layout(local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne10; - int ne12; - int ne0; - int ne1; - uint nb01; - uint nb02; - uint nb03; - uint nb11; - uint nb12; - uint nb13; - uint r2; - uint r3; -} pcs; diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp deleted file mode 100644 index ad0c3c01b9d..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +++ /dev/null @@ -1,84 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 256) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - uint ne00; - uint nb01; - float eps; -} pcs; - -shared float sum[gl_WorkGroupSize.x]; - -void main() { - const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ - // MEAN - // parallel sum - sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += in_[x+i00]; - } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - if (gl_LocalInvocationID.x == 0) { - sum[0] /= float(pcs.ne00); - } - barrier(); - memoryBarrierShared(); - const float mean = sum[0]; - - // recenter - const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - out_[y+i00] = in_[x+i00] - mean; - } - - // VARIANCE - // parallel sum - sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00]; - } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - if (gl_LocalInvocationID.x == 0) { - sum[0] /= float(pcs.ne00); - } - barrier(); - memoryBarrierShared(); - const float variance = sum[0]; - - const float scale = 1.0f/sqrt(variance + pcs.eps); - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - out_[y+i00] *= scale; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp deleted file mode 100644 index 52a601fe6da..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; - - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp deleted file mode 100644 index da658c1601e..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +++ /dev/null @@ -1,53 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 512) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - uint ne00; - uint nb01; - float eps; -} pcs; - -shared float sum[gl_WorkGroupSize.x]; - -void main() { - const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ - - // parallel sum - sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00]; - } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - if (gl_LocalInvocationID.x == 0) { - sum[0] /= float(pcs.ne00); - } - barrier(); - memoryBarrierShared(); - - const float scale = 1.0f/sqrt(sum[0] + pcs.eps); - - const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - out_[y+i00] = in_[x+i00] * scale; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp deleted file mode 100644 index 63659cbfe55..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - const float x0 = float(inA[src]); - const float x1 = float(inA[src+pcs.n_dims/2]); - - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp deleted file mode 100644 index 4df56204d72..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - const float x0 = inA[src]; - const float x1 = inA[src+pcs.n_dims/2]; - - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp deleted file mode 100644 index a3c0eda8bd3..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - const float x0 = float(inA[src]); - const float x1 = float(inA[src+1]); - - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta); - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp deleted file mode 100644 index b7963ae7253..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - const float x0 = inA[src]; - const float x1 = inA[src+1]; - - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+1] = x0*sin_theta + x1*cos_theta; - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp deleted file mode 100644 index bdae2673820..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +++ /dev/null @@ -1,19 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - float scale; -} pcs; - -void main() { - const uint i = gl_WorkGroupID.x; - out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp deleted file mode 100644 index ada69754b2c..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - float scale; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 8; - - for (uint x = 0; x < 8; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp deleted file mode 100644 index 0fb8e4b7405..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; - - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - const float y = in_[i + pcs.inOff]; - out_[i + pcs.outOff] = y / (1.0 + exp(-y)); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp deleted file mode 100644 index 4165295bf4b..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +++ /dev/null @@ -1,72 +0,0 @@ -// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4) - -#version 450 - -#include "common.comp" - -layout(local_size_x_id = 0) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - float scale; - float max_bias; - float m0; - float m1; - uint n_head_log2; - int mask; -} pcs; - -void main() { - if (gl_SubgroupInvocationID > 31) - return; - - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00; - const uint psrc0 = extra_off + pcs.inAOff; // Based from inA - const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB - const uint pdst = extra_off + pcs.outOff; // Based from out_ - - float slope = 1.0f; - - // ALiBi - if (pcs.max_bias > 0.0f) { - int64_t h = i02; - - float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1; - int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1; - - slope = pow(base, float(exp)); - } - - // parallel max - float localMax = uintBitsToFloat(0xFF800000); - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f)); - } - float max_ = subgroupMax(localMax); - - // parallel sum - float localSum = 0.0f; - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_); - localSum += exp_psrc0; - out_[pdst + i00] = exp_psrc0; - } - - const float sum = subgroupAdd(localSum); - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - out_[pdst + i00] /= sum; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp deleted file mode 100644 index 0fca640dcc2..00000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +++ /dev/null @@ -1,71 +0,0 @@ -#include "common.comp" - -#define GGML_ROPE_TYPE_NEOX 2 - -// TODO: use a local size of 32 or more (Metal uses 1024) -layout(local_size_x = 1) in; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint inCOff; - uint outOff; - int n_dims; - int mode; - int n_ctx_orig; - float freq_base; - float freq_scale; - bool has_freq_factors; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -float rope_yarn_ramp(const float low, const float high, const float i0) { - const float y = (i0 / 2 - low) / max(0.001f, high - low); - return 1.0f - min(1.0f, max(0.0f, y)); -} - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -void rope_yarn( - float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale, - out float cos_theta, out float sin_theta -) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); - } - cos_theta = cos(theta) * mscale; - sin_theta = sin(theta) * mscale; -} - -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base)); -} - -void rope_yarn_corr_dims( - int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2] -) { - // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base))); -} From f7502dca872866a310fe69d30b163fa87d256319 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Jul 2025 21:54:58 +0300 Subject: [PATCH 0166/1348] whisper : reset conv scheduler when CoreML is used (#3350) ggml-ci --- src/whisper.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/whisper.cpp b/src/whisper.cpp index 5c08478aefd..a3496d604a6 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -2432,6 +2432,8 @@ static bool whisper_encode_internal( return false; } } else { + ggml_backend_sched_reset(sched); + #if defined(WHISPER_USE_COREML) whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data); #elif defined(WHISPER_USE_OPENVINO) From 0becabc8d68d9ffa6ddfba5240e38cd7a2642046 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 2 Aug 2025 07:03:04 +0200 Subject: [PATCH 0167/1348] stream.wasm : add language selection support (#3354) * stream.wasm : add language selection support This commit adds support for selecting the language in the stream.wasm example. This is includes adding the model `base` which supports multilingual transcription, and allowing the user to select a language from a dropdown menu in the HTML interface. The motivation for this is that it allows users to transcribe audio in various languages. Refs: https://github.com/ggml-org/whisper.cpp/issues/3347 * squash! stream.wasm : add language selection support Remove strdup() for language in stream.wasm and update butten text for base (should not be "base.en" but just "base"). --- examples/stream.wasm/emscripten.cpp | 12 ++--- examples/stream.wasm/index-tmpl.html | 78 +++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 7 deletions(-) diff --git a/examples/stream.wasm/emscripten.cpp b/examples/stream.wasm/emscripten.cpp index 43e71bf23f0..5dff24ad3bd 100644 --- a/examples/stream.wasm/emscripten.cpp +++ b/examples/stream.wasm/emscripten.cpp @@ -31,10 +31,11 @@ void stream_set_status(const std::string & status) { g_status = status; } -void stream_main(size_t index) { +void stream_main(size_t index, const std::string & lang) { stream_set_status("loading data ..."); struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); + bool is_multilingual = whisper_is_multilingual(g_contexts[index]); wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); wparams.offset_ms = 0; @@ -52,7 +53,7 @@ void stream_main(size_t index) { // disable temperature fallback wparams.temperature_inc = -1.0f; - wparams.language = "en"; + wparams.language = is_multilingual ? lang.c_str() : "en"; printf("stream: using %d threads\n", wparams.n_threads); @@ -127,9 +128,8 @@ void stream_main(size_t index) { g_contexts[index] = nullptr; } } - EMSCRIPTEN_BINDINGS(stream) { - emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { + emscripten::function("init", emscripten::optional_override([](const std::string & path_model, const std::string & lang) { for (size_t i = 0; i < g_contexts.size(); ++i) { if (g_contexts[i] == nullptr) { g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); @@ -138,8 +138,8 @@ EMSCRIPTEN_BINDINGS(stream) { if (g_worker.joinable()) { g_worker.join(); } - g_worker = std::thread([i]() { - stream_main(i); + g_worker = std::thread([i, lang]() { + stream_main(i, lang); }); return i + 1; diff --git a/examples/stream.wasm/index-tmpl.html b/examples/stream.wasm/index-tmpl.html index c831b2f52b7..309dfe73a36 100644 --- a/examples/stream.wasm/index-tmpl.html +++ b/examples/stream.wasm/index-tmpl.html @@ -55,6 +55,7 @@ Whisper model: +

Quantized models:

@@ -66,6 +67,77 @@ --> + + + + +
+ Language: + +
+
@@ -176,6 +248,7 @@ let urls = { 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', + 'base' : 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin', 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin', 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin', @@ -184,6 +257,7 @@ let sizes = { 'tiny.en': 75, 'base.en': 142, + 'base': 142, 'tiny-en-q5_1': 31, 'base-en-q5_1': 57, @@ -197,6 +271,7 @@ document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none'; + document.getElementById('fetch-whisper-base').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none'; document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none'; @@ -212,6 +287,7 @@ var el; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-base'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block'; @@ -368,7 +444,7 @@ function onStart() { if (!instance) { - instance = Module.init('whisper.bin'); + instance = Module.init('whisper.bin', document.getElementById('language').value); if (instance) { printTextarea("js: whisper initialized, instance: " + instance); From 4245c77b654cd384ad9f53a4a302be716b3e5861 Mon Sep 17 00:00:00 2001 From: Adam Debono Date: Thu, 7 Aug 2025 12:37:45 +1000 Subject: [PATCH 0168/1348] ruby : Add ruby binding for max_len (#3365) * add ruby binding for max_len * add test, update param numbers --- bindings/ruby/ext/ruby_whisper_params.c | 80 +++++++++++++++++-------- bindings/ruby/sig/whisper.rbs | 7 +++ bindings/ruby/test/test_params.rb | 8 +++ 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c index 71337c818c3..882c68d042f 100644 --- a/bindings/ruby/ext/ruby_whisper_params.c +++ b/bindings/ruby/ext/ruby_whisper_params.c @@ -26,7 +26,7 @@ rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \ rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1); -#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 35 +#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 36 extern VALUE cParams; extern VALUE cVADParams; @@ -49,6 +49,7 @@ static ID id_print_timestamps; static ID id_suppress_blank; static ID id_suppress_nst; static ID id_token_timestamps; +static ID id_max_len; static ID id_split_on_word; static ID id_initial_prompt; static ID id_diarize; @@ -514,6 +515,33 @@ ruby_whisper_params_set_token_timestamps(VALUE self, VALUE value) { BOOL_PARAMS_SETTER(self, token_timestamps, value) } + +/* + * max segment length in characters. + * + * call-seq: + * max_len -> Integer + */ +static VALUE +ruby_whisper_params_get_max_len(VALUE self) +{ + ruby_whisper_params *rwp; + TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); + return INT2NUM(rwp->params.max_len); +} +/* + * call-seq: + * max_len = length -> length + */ +static VALUE +ruby_whisper_params_set_max_len(VALUE self, VALUE value) +{ + ruby_whisper_params *rwp; + TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); + rwp->params.max_len = NUM2INT(value); + return value; +} + /* * If true, split on word rather than on token (when used with max_len). * @@ -1137,6 +1165,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self) SET_PARAM_IF_SAME(suppress_blank) SET_PARAM_IF_SAME(suppress_nst) SET_PARAM_IF_SAME(token_timestamps) + SET_PARAM_IF_SAME(max_len) SET_PARAM_IF_SAME(split_on_word) SET_PARAM_IF_SAME(initial_prompt) SET_PARAM_IF_SAME(offset) @@ -1271,30 +1300,31 @@ init_ruby_whisper_params(VALUE *mWhisper) DEFINE_PARAM(suppress_blank, 8) DEFINE_PARAM(suppress_nst, 9) DEFINE_PARAM(token_timestamps, 10) - DEFINE_PARAM(split_on_word, 11) - DEFINE_PARAM(initial_prompt, 12) - DEFINE_PARAM(diarize, 13) - DEFINE_PARAM(offset, 14) - DEFINE_PARAM(duration, 15) - DEFINE_PARAM(max_text_tokens, 16) - DEFINE_PARAM(temperature, 17) - DEFINE_PARAM(max_initial_ts, 18) - DEFINE_PARAM(length_penalty, 19) - DEFINE_PARAM(temperature_inc, 20) - DEFINE_PARAM(entropy_thold, 21) - DEFINE_PARAM(logprob_thold, 22) - DEFINE_PARAM(no_speech_thold, 23) - DEFINE_PARAM(new_segment_callback, 24) - DEFINE_PARAM(new_segment_callback_user_data, 25) - DEFINE_PARAM(progress_callback, 26) - DEFINE_PARAM(progress_callback_user_data, 27) - DEFINE_PARAM(encoder_begin_callback, 28) - DEFINE_PARAM(encoder_begin_callback_user_data, 29) - DEFINE_PARAM(abort_callback, 30) - DEFINE_PARAM(abort_callback_user_data, 31) - DEFINE_PARAM(vad, 32) - DEFINE_PARAM(vad_model_path, 33) - DEFINE_PARAM(vad_params, 34) + DEFINE_PARAM(max_len, 11) + DEFINE_PARAM(split_on_word, 12) + DEFINE_PARAM(initial_prompt, 13) + DEFINE_PARAM(diarize, 14) + DEFINE_PARAM(offset, 15) + DEFINE_PARAM(duration, 16) + DEFINE_PARAM(max_text_tokens, 17) + DEFINE_PARAM(temperature, 18) + DEFINE_PARAM(max_initial_ts, 19) + DEFINE_PARAM(length_penalty, 20) + DEFINE_PARAM(temperature_inc, 21) + DEFINE_PARAM(entropy_thold, 22) + DEFINE_PARAM(logprob_thold, 23) + DEFINE_PARAM(no_speech_thold, 24) + DEFINE_PARAM(new_segment_callback, 25) + DEFINE_PARAM(new_segment_callback_user_data, 26) + DEFINE_PARAM(progress_callback, 27) + DEFINE_PARAM(progress_callback_user_data, 28) + DEFINE_PARAM(encoder_begin_callback, 29) + DEFINE_PARAM(encoder_begin_callback_user_data, 30) + DEFINE_PARAM(abort_callback, 31) + DEFINE_PARAM(abort_callback_user_data, 32) + DEFINE_PARAM(vad, 33) + DEFINE_PARAM(vad_model_path, 34) + DEFINE_PARAM(vad_params, 35) rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0); rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0); diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index 5966ce31592..0489432a249 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -135,6 +135,7 @@ module Whisper ?suppress_blank: boolish, ?suppress_nst: boolish, ?token_timestamps: boolish, + ?max_len: Integer, ?split_on_word: boolish, ?initial_prompt: string | nil, ?diarize: boolish, @@ -222,6 +223,12 @@ module Whisper # def token_timestamps: () -> (true | false) + def max_len=: (Integer) -> Integer + + # max segment length in characters. + # + def max_len: () -> Integer + def split_on_word=: (boolish) -> boolish # If true, split on word rather than on token (when used with max_len). diff --git a/bindings/ruby/test/test_params.rb b/bindings/ruby/test/test_params.rb index 9a9535799b7..d5c5d140e8c 100644 --- a/bindings/ruby/test/test_params.rb +++ b/bindings/ruby/test/test_params.rb @@ -13,6 +13,7 @@ class TestParams < TestBase :suppress_blank, :suppress_nst, :token_timestamps, + :max_len, :split_on_word, :initial_prompt, :diarize, @@ -139,6 +140,13 @@ def test_token_timestamps assert !@params.token_timestamps end + def test_max_len + @params.max_len = 42 + assert_equal @params.max_len, 42 + @params.max_len = 0 + assert_equal @params.max_len, 0 + end + def test_split_on_word @params.split_on_word = true assert @params.split_on_word From b02242d0adb5c6c4896d59ac86d9ec9fe0d0fe33 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Aug 2025 13:00:17 +0300 Subject: [PATCH 0169/1348] wasm : change ggml model host to HF (#3369) --- examples/bench.wasm/index-tmpl.html | 18 +++++++-------- examples/command.wasm/index-tmpl.html | 8 +++---- examples/stream.wasm/index-tmpl.html | 10 ++++----- examples/whisper.wasm/index-tmpl.html | 32 +++++++++++++-------------- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/examples/bench.wasm/index-tmpl.html b/examples/bench.wasm/index-tmpl.html index e9b49e07216..91589c35b3f 100644 --- a/examples/bench.wasm/index-tmpl.html +++ b/examples/bench.wasm/index-tmpl.html @@ -191,15 +191,15 @@ function loadWhisper(model) { let urls = { - 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', - 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', - 'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin', - - 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin', - 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin', - 'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin', - 'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin', - 'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin', + 'tiny.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin', + 'base.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin', + 'small.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin', + + 'tiny-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin', + 'base-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin', + 'small-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q5_1.bin', + 'medium-en-q5_0':'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q5_0.bin', + 'large-q5_0': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-q5_0.bin', }; let sizes = { diff --git a/examples/command.wasm/index-tmpl.html b/examples/command.wasm/index-tmpl.html index 752d851e9cf..2221e9340ba 100644 --- a/examples/command.wasm/index-tmpl.html +++ b/examples/command.wasm/index-tmpl.html @@ -174,11 +174,11 @@ function loadWhisper(model) { let urls = { - 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', - 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', + 'tiny.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin', + 'base.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin', - 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin', - 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin', + 'tiny-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin', + 'base-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin', }; let sizes = { diff --git a/examples/stream.wasm/index-tmpl.html b/examples/stream.wasm/index-tmpl.html index 309dfe73a36..941f45075c4 100644 --- a/examples/stream.wasm/index-tmpl.html +++ b/examples/stream.wasm/index-tmpl.html @@ -246,12 +246,12 @@ function loadWhisper(model) { let urls = { - 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', - 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', - 'base' : 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin', + 'tiny.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin', + 'base.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin', + 'base' : 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin', - 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin', - 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin', + 'tiny-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin', + 'base-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin', }; let sizes = { diff --git a/examples/whisper.wasm/index-tmpl.html b/examples/whisper.wasm/index-tmpl.html index 32fdf12f93e..d5f1be8929c 100644 --- a/examples/whisper.wasm/index-tmpl.html +++ b/examples/whisper.wasm/index-tmpl.html @@ -338,22 +338,22 @@ function loadWhisper(model) { let urls = { - 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', - 'tiny': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin', - 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', - 'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin', - 'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin', - 'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin', - - 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin', - 'tiny-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin', - 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin', - 'base-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin', - 'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin', - 'small-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin', - 'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin', - 'medium-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin', - 'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin', + 'tiny.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin', + 'tiny': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin', + 'base.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin', + 'base': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin', + 'small.en': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin', + 'small': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin', + + 'tiny-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin', + 'tiny-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q5_1.bin', + 'base-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin', + 'base-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q5_1.bin', + 'small-en-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q5_1.bin', + 'small-q5_1': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q5_1.bin', + 'medium-en-q5_0':'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q5_0.bin', + 'medium-q5_0': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin', + 'large-q5_0': 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-q5_0.bin', }; let sizes = { From 5527454cdb3e15d7e2b8a6e2afcb58cb61651fd2 Mon Sep 17 00:00:00 2001 From: Dw9 Date: Tue, 12 Aug 2025 18:58:52 +0800 Subject: [PATCH 0170/1348] whisper : fixed crash in GPU device selection on multi-GPU systems (#3372) --- src/whisper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index a3496d604a6..52de68c2b12 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1327,7 +1327,7 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { ggml_backend_dev_t dev_cur = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev_cur) == GGML_BACKEND_DEVICE_TYPE_GPU) { - if (cnt == 0 || cnt == params.gpu_device) { + if (cnt == params.gpu_device) { dev = dev_cur; } @@ -1396,7 +1396,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) { for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { - if (cnt == 0 || cnt == params.gpu_device) { + if (cnt == params.gpu_device) { auto * buft = ggml_backend_dev_buffer_type(dev); if (buft) { buft_list.emplace_back(dev, buft); From 16c2924cb2c4b5c9f79220aa7708eb5b346b029b Mon Sep 17 00:00:00 2001 From: ustas <82833595+ustas-eth@users.noreply.github.com> Date: Wed, 13 Aug 2025 14:30:45 -0300 Subject: [PATCH 0171/1348] ci : update main-cuda.Dockerfile (#3371) * Update main-cuda.Dockerfile Bump CUDA to 13.0.0 and exclude the `compute_50` arch from build because it was deprecated and now throws an error. * Add quotes in main-cuda.Dockerfile --- .devops/main-cuda.Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index b9f4873937b..c2bf0fbd1c6 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 # This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.3.1 +ARG CUDA_VERSION=13.0.0 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} # Target the CUDA runtime image @@ -20,12 +20,12 @@ RUN apt-get update && \ && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* # Ref: https://stackoverflow.com/a/53464012 -ENV CUDA_MAIN_VERSION=12.3 +ENV CUDA_MAIN_VERSION=13.0 ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH COPY .. . # Enable cuBLAS -RUN make base.en CMAKE_ARGS="-DGGML_CUDA=1" +RUN make base.en CMAKE_ARGS="-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES='75;80;86;90'" RUN find /app/build -name "*.o" -delete && \ find /app/build -name "*.a" -delete && \ @@ -34,7 +34,7 @@ RUN find /app/build -name "*.o" -delete && \ rm -rf /app/build/_deps FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime -ENV CUDA_MAIN_VERSION=12.3 +ENV CUDA_MAIN_VERSION=13.0 ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH WORKDIR /app From 040510a132f0a9b51d4692b57a6abfd8c9660696 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 15 Aug 2025 14:54:23 +0200 Subject: [PATCH 0172/1348] node : add win platform check for require path (#3363) This commit adds a check to the platform in use and adjust the path to the addon.node shared library. The motivation for this change is that on windows addon.node library is built into build\bin\Release and on linux into build/Release. Resolves: https://github.com/ggml-org/whisper.cpp/issues/3360 --- examples/addon.node/index.js | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/addon.node/index.js b/examples/addon.node/index.js index 9324d6fa548..2e4ed610b66 100644 --- a/examples/addon.node/index.js +++ b/examples/addon.node/index.js @@ -1,8 +1,10 @@ -const path = require("path"); -const { whisper } = require(path.join( - __dirname, - "../../build/Release/addon.node" -)); +const path = require('path'); +const os = require('os'); + +const isWindows = os.platform() === 'win32'; +const buildPath = isWindows ? "../../build/bin/Release/addon.node" : "../../build/Release/addon.node"; + +const { whisper } = require(path.join(__dirname, buildPath)); const { promisify } = require("util"); const whisperAsync = promisify(whisper); From 9446500b9d8e9792bfc10797c441d64e88647c20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Aug 2025 19:31:13 +0300 Subject: [PATCH 0173/1348] scripts : update sync scripts --- scripts/sync-ggml-am.sh | 50 +++-------------------------------------- scripts/sync-ggml.sh | 17 +------------- 2 files changed, 4 insertions(+), 63 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 924f67ee5b5..1f87e23122b 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -61,21 +61,7 @@ while read c; do cmake/ggml-config.cmake.in \ src/ggml-cpu/cmake/FindSIMD.cmake \ src/ggml*.h \ - src/ggml*.c \ - src/ggml*.cpp \ - src/gguf*.cpp \ - src/ggml-blas/* \ - src/ggml-cann/* \ - src/ggml-cpu/* \ - src/ggml-cuda/* \ - src/ggml-hip/* \ - src/ggml-kompute/* \ - src/ggml-metal/* \ - src/ggml-musa/* \ - src/ggml-opencl/* \ - src/ggml-rpc/* \ - src/ggml-sycl/* \ - src/ggml-vulkan/* \ + src/ggml* \ include/ggml*.h \ include/gguf*.h \ examples/common.h \ @@ -118,22 +104,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then # cmake/ggml-config.cmake.in -> ggml/cmake/ggml-config.cmake.in # src/ggml-cpu/cmake/FindSIMD.cmake -> ggml/src/ggml-cpu/cmake/FindSIMD.cmake # - # src/ggml*.c -> ggml/src/ggml*.c - # src/ggml*.cpp -> ggml/src/ggml*.cpp - # src/ggml*.h -> ggml/src/ggml*.h - # src/gguf*.cpp -> ggml/src/gguf*.cpp - # src/ggml-blas/* -> ggml/src/ggml-blas/* - # src/ggml-cann/* -> ggml/src/ggml-cann/* - # src/ggml-cpu/* -> ggml/src/ggml-cpu/* - # src/ggml-cuda/* -> ggml/src/ggml-cuda/* - # src/ggml-hip/* -> ggml/src/ggml-hip/* - # src/ggml-kompute/* -> ggml/src/ggml-kompute/* - # src/ggml-metal/* -> ggml/src/ggml-metal/* - # src/ggml-musa/* -> ggml/src/ggml-musa/* - # src/ggml-opencl/* > ggml/src/ggml-opencl/* - # src/ggml-rpc/* -> ggml/src/ggml-rpc/* - # src/ggml-sycl/* -> ggml/src/ggml-sycl/* - # src/ggml-vulkan/* -> ggml/src/ggml-vulkan/* + # src/ggml* -> ggml/src/ggml*.c # # include/ggml*.h -> ggml/include/ggml*.h # include/gguf*.h -> ggml/include/gguf*.h @@ -154,22 +125,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then -e 's/(^[[:space:]]| [ab]\/)cmake\/common.cmake/\1ggml\/cmake\/common.cmake/g' \ -e 's/(^[[:space:]]| [ab]\/)cmake\/ggml-config.cmake.in/\1ggml\/cmake\/ggml-config.cmake.in/g' \ -e 's/(^[[:space:]]| [ab]\/)src\/ggml-cpu\/cmake\/FindSIMD.cmake/\1ggml\/src\/ggml-cpu\/cmake\/FindSIMD.cmake/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \ - -e 's/([[:space:]]| [ab]\/)src\/gguf(.*)\.cpp/\1ggml\/src\/gguf\2.cpp/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-kompute\//\1ggml\/src\/ggml-kompute\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-musa\//\1ggml\/src\/ggml-musa\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-opencl\//\1ggml\/src\/ggml-opencl\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \ + -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)/\1ggml\/src\/ggml\2/g' \ -e 's/(^[[:space:]]| [ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \ -e 's/(^[[:space:]]| [ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \ -e 's/(^[[:space:]]| [ab]\/)examples\/common\.h/\1examples\/common.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 00162daa05b..4296ddf5f50 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -6,22 +6,7 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt cp -rpv ../ggml/cmake/* ./ggml/cmake/ cp -rpv ../ggml/src/ggml-cpu/cmake/* ./ggml/src/ggml-cpu/cmake/ -cp -rpv ../ggml/src/ggml*.c ./ggml/src/ -cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/ -cp -rpv ../ggml/src/ggml*.h ./ggml/src/ -cp -rpv ../ggml/src/gguf*.cpp ./ggml/src/ -cp -rpv ../ggml/src/ggml-blas/* ./ggml/src/ggml-blas/ -cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ -cp -rpv ../ggml/src/ggml-cpu/* ./ggml/src/ggml-cpu/ -cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ -cp -rpv ../ggml/src/ggml-hip/* ./ggml/src/ggml-hip/ -cp -rpv ../ggml/src/ggml-kompute/* ./ggml/src/ggml-kompute/ -cp -rpv ../ggml/src/ggml-metal/* ./ggml/src/ggml-metal/ -cp -rpv ../ggml/src/ggml-musa/* ./ggml/src/ggml-musa/ -cp -rpv ../ggml/src/ggml-opencl/* ./ggml/src/ggml-opencl/ -cp -rpv ../ggml/src/ggml-rpc/* ./ggml/src/ggml-rpc/ -cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ -cp -rpv ../ggml/src/ggml-vulkan/* ./ggml/src/ggml-vulkan/ +cp -rpv ../ggml/src/ggml* ./ggml/src/ cp -rpv ../ggml/include/ggml*.h ./ggml/include/ cp -rpv ../ggml/include/gguf*.h ./ggml/include/ From 01bdc522e0ea38b8089041ca72319dc756603670 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Wed, 30 Jul 2025 14:52:26 +0200 Subject: [PATCH 0174/1348] vulkan : fix 32-bit builds (ggml/1313) The pipeline member can be cast to VkPipeline. This is a VkPipeline_T* on 64 bit but a uint64_t on 32 bit. Cf. VK_DEFINE_NON_DISPATCHABLE_HANDLE documentation. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a99b1c73130..b1a1cff904b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1341,7 +1341,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin vk::DebugUtilsObjectNameInfoEXT duoni; duoni.objectType = vk::ObjectType::ePipeline; duoni.pObjectName = pipeline->name.c_str(); - duoni.objectHandle = reinterpret_cast(static_cast(pipeline->pipeline)); + duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast(pipeline->pipeline)); vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); } From 45784c05ae36a431b2336f42e131a4ecc33567c8 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Wed, 30 Jul 2025 14:53:16 +0200 Subject: [PATCH 0175/1348] cmake : Fix BLAS link interface (ggml/1316) --- ggml/cmake/ggml-config.cmake.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index fe34cda4e01..2322c6cd9d0 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -34,8 +34,8 @@ if (NOT GGML_SHARED_LIB) if (GGML_BLAS) find_dependency(BLAS) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES}) - list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS}) + list(APPEND GGML_BLAS_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES}) + list(APPEND GGML_BLAS_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS}) endif() if (GGML_CUDA) From 923619ffd566d92a235750eb927d581c9d4c3914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Mon, 28 Jul 2025 11:05:53 +0100 Subject: [PATCH 0176/1348] sycl: refactor quantization to q8_1 (llama/14815) * sycl: quantization to q8_1 refactor * Refactored src1 copy logic in op_mul_mat --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 256 ++++++------------------------- ggml/src/ggml-sycl/quantize.hpp | 133 ++++++++++++++++ 3 files changed, 184 insertions(+), 206 deletions(-) create mode 100644 ggml/src/ggml-sycl/quantize.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index f839a42bc90..410a67b0195 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -28,6 +28,7 @@ #include "mmvq.hpp" #include "norm.hpp" #include "outprod.hpp" +#include "quantize.hpp" #include "quants.hpp" #include "rope.hpp" #include "set_rows.hpp" diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index a023d6fb452..b08941c328b 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -44,6 +44,7 @@ #include "ggml-sycl/set_rows.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/getrows.hpp" +#include "ggml-sycl/quantize.hpp" #include "ggml.h" static bool g_sycl_loaded = false; @@ -1373,120 +1374,6 @@ typedef void (*ggml_sycl_op_mul_mat_t)( -template -static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, - const sycl::nd_item<3> &item_ct1) { - const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE; - - if (ix >= kx_padded) { - return; - } - - const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - - const int i_padded = iy*kx_padded + ix; - - block_q8_1 * y = (block_q8_1 *) vy; - - const int ib = i_padded / QK8_1; // block index - const int iqs = i_padded % QK8_1; // quant index - typedef sycl::vec TC; - typedef sycl::vec TQ; - TC zeros; - TQ qzeros; -#pragma unroll - for (int i = 0; i < QUANT_BLOCK_TILE; i++) - { - zeros[i] = 0.f; - qzeros[i] = 0; - } - const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros; - float sum = xi[0]; - float amax = sycl::fabs(xi[0]); -#pragma unroll - for (int i = 1; i < QUANT_BLOCK_TILE; i++) - { - sum += xi[i]; - amax = sycl::fmax(sycl::fabs(xi[i]), amax); - } - sum = warp_reduce_sum(sum, item_ct1); - amax = warp_reduce_max(amax, item_ct1); - - const float d = amax / 127; - TQ q = qzeros; - if (amax != 0.0f) - { -#pragma unroll - for (int i = 0; i < QUANT_BLOCK_TILE; i++) { - q[i] = sycl::round(xi[i] / d); - } - } - - *(TQ *)&y[ib].qs[iqs] = q; - - if (iqs > 0) { - return; - } - - reinterpret_cast(y[ib].ds.x()) = d; - reinterpret_cast(y[ib].ds.y()) = sum; -} - -template -static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor, - const int kx, const int kx_padded, const sycl::nd_item<1> & it) { - /* - Quantizes and reorders the resultant q8 tensor in a per row fashion - Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values - */ - - auto subgroup_id = it.get_group(0); - auto wi_id = it.get_local_id(0); - - const int num_blocks_per_row = kx / QK8_1; - auto row = subgroup_id / num_blocks_per_row; - auto col = subgroup_id % num_blocks_per_row; - - auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1); - auto col_offset = QK8_1 * col + wi_id * ElementsPerWI; - - auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset); - auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2)); - - sycl::vec wi_f32_vals; - sycl::vec quantized_values; - - auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id; - wi_f32_vals = *reinterpret_cast *>(x + float_ptr_offset); - - float sum = 0.0f; - float amax = 0.0f; - -#pragma unroll(ElementsPerWI) - for (int i = 0; i < ElementsPerWI; i++) { - sum += wi_f32_vals[i]; - amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i])); - quantized_values[i] = 0; - } - sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus()); - amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum()); - float d = amax == 0 ? 1 : amax / 127; - -#pragma unroll(ElementsPerWI) - for (int i = 0; i < ElementsPerWI; i++) { - quantized_values[i] = sycl::round(wi_f32_vals[i] / d); - } - - d = amax == 0 ? 0 : d; - - *reinterpret_cast *>(quant_ptr) = quantized_values; - if (wi_id == 0) { - *ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum)); - } -} - static void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, @@ -1770,32 +1657,6 @@ static void pool2d_nchw_kernel( o_ptr[cur_oh * ow + cur_ow] = res; } -static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded, - bool reorder_q8_tensor, queue_ptr stream) { - if (reorder_q8_tensor) { - auto local_range = std::size_t(WARP_SIZE); - auto num_quant_blocks = ky * (kx / QK8_1); - auto global_range = num_quant_blocks * local_range; - stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), - [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - quantize_and_reorder_q8_1(x, vy, kx, kx_padded, it); - }); - } else { - const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE; - const sycl::range<3> num_blocks(1, ky, block_num_x); - int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE; - static_assert(QK8_1 % WARP_SIZE == 0); - const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE); - { - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - quantize_q8_1(x, vy, kx, kx_padded, item_ct1); - }); - } - } -} static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, float *dst, const int ncols_x, @@ -2372,10 +2233,10 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { peer_access_enabled = enable_peer_access; } +template