diff --git a/.github/workflows/build-msys.yml b/.github/workflows/build-msys.yml index c2633c151a5..15c55cf12cc 100644 --- a/.github/workflows/build-msys.yml +++ b/.github/workflows/build-msys.yml @@ -27,8 +27,8 @@ jobs: fail-fast: false matrix: include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } + - { sys: UCRT64, env: ucrt-x86_64, compiler: gcc, build: Release } + - { sys: CLANG64, env: clang-x86_64, compiler: clang, build: Release } steps: - name: Clone @@ -48,9 +48,7 @@ jobs: update: true msystem: ${{matrix.sys}} install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain + mingw-w64-${{matrix.env}}-${{matrix.compiler}} mingw-w64-${{matrix.env}}-cmake mingw-w64-${{matrix.env}}-openblas diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6f1f2721e45..8195a55ff28 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -82,8 +82,8 @@ jobs: { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" }, { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, - { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, - { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, 
"free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, + { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }, diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index 06d97ae78ee..197173faed8 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -16,12 +16,12 @@ Pull requests (PRs): - New branch names are prefixed with "gg/" - Before opening a pull request, ask the user to confirm the description - When creating a pull request, look for the repository's PR template and follow it -- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]" +- For the AI usage disclosure section, write "YES. 
pi:llama.cpp/[MODEL]" - Ask the user to tell you what model was used and write it in place of [MODEL] - Always create the pull requests in draft mode Commits: -- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag +- On every commit that you make, include a "Assisted-by: pi:llama.cpp/[MODEL]" tag - Do not explicitly set the git author in commits - rely on the default git config - Always use `--no-gpg-sign` when committing - Never `git push` without explicit confirmation from the user diff --git a/build-xcframework.sh b/build-xcframework.sh index 5d289922a84..180c01a88e9 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -130,14 +130,7 @@ setup_framework_structure() { # Create module map (common for all platforms) cat > ${module_path}module.modulemap << EOF framework module llama { - header "llama.h" - header "ggml.h" - header "ggml-alloc.h" - header "ggml-backend.h" - header "ggml-metal.h" - header "ggml-cpu.h" - header "ggml-blas.h" - header "gguf.h" + umbrella "Headers" link "c++" link framework "Accelerate" diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 1a56c25857f..c42320c46b1 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -78,6 +78,8 @@ add_library(${TARGET} hf-cache.cpp hf-cache.h http.h + imatrix-loader.cpp + imatrix-loader.h json-partial.cpp json-partial.h json-schema-to-grammar.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 04ddf32fde6..5355408b2a4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -446,7 +446,13 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) opts.offline = params.offline; opts.skip_download = params.skip_download; opts.download_mtp = spec_type_draft_mtp; - opts.download_mmproj = !params.no_mmproj; + opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty(); + + // sub-models (draft, mmproj, vocoder) are explicitly specified by the user, + // so we should not auto-discover 
mtp/mmproj siblings for them + common_download_opts sub_opts = opts; + sub_opts.download_mtp = false; + sub_opts.download_mmproj = false; try { auto res = common_params_handle_model(params.model, opts); @@ -459,7 +465,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) // only download mmproj if the current example is using it for (const auto & ex : mmproj_examples) { if (curr_ex == ex) { - common_params_handle_model(params.mmproj, opts); + common_params_handle_model(params.mmproj, sub_opts); break; } } @@ -472,8 +478,8 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) params.speculative.draft.mparams.url.empty()) { params.speculative.draft.mparams.path = res.mtp.path; } - common_params_handle_model(params.speculative.draft.mparams, opts); - common_params_handle_model(params.vocoder.model, opts); + common_params_handle_model(params.speculative.draft.mparams, sub_opts); + common_params_handle_model(params.vocoder.model, sub_opts); return true; } catch (const common_skip_download_exception &) { return false; diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 12e747d1ca1..9bc5ac98be6 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -87,6 +87,8 @@ static std::string normalize_quotes_to_json(const std::string & input) { bool in_single_quoted = false; bool in_double_quoted = false; + auto is_word_char = [](char ch) { return std::isalnum(static_cast(ch)) || ch == '_'; }; + for (size_t i = 0; i < input.size(); ++i) { char c = input[i]; @@ -151,6 +153,29 @@ static std::string normalize_quotes_to_json(const std::string & input) { in_single_quoted = true; result += '"'; } + } else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') && + (i == 0 || !is_word_char(input[i - 1]))) { + // Python literals -> JSON; prefix match keeps streamed partials monotonic. 
+ static constexpr std::pair literals[] = { + { "True", "true" }, { "False", "false" }, { "None", "null" }, + }; + size_t n = 0; + while (i + n < input.size() && is_word_char(input[i + n])) { + ++n; + } + std::string_view token(input.data() + i, n); + bool matched = false; + for (const auto & [py, js] : literals) { + if (py.substr(0, n) == token) { + result += js.substr(0, n); + i += n - 1; + matched = true; + break; + } + } + if (!matched) { + result += c; + } } else { result += c; } @@ -353,12 +378,8 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) { } value_to_add += escape_json_string_inner(value_content); } else if (!value_content.empty()) { - // For potential containers, normalize Python-style single quotes to JSON double quotes - bool is_potential_container = value_content[0] == '[' || value_content[0] == '{'; - if (is_potential_container) { - value_content = normalize_container_value(value_content); - } - value_to_add += value_content; + // Pythonic scalars/containers -> JSON. + value_to_add += normalize_container_value(value_content); } args_target() += value_to_add; @@ -466,11 +487,34 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools( return force_tool_calls ? 
section : optional(section); } +// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5 +common_peg_parser common_chat_peg_builder::python_or_json_value() { + return rule("python-or-json-value", [this]() { + auto ws = space(); + auto value = python_or_json_value(); + + auto member = sequence({ python_string(), ws, literal(":"), ws, value }); + auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) }); + auto dict = rule("python-or-json-dict", [&]() { + return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws }); + }); + + auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) }); + auto array = rule("python-or-json-array", [&]() { + return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws }); + }); + + return choice({ dict, array, python_string(), python_number(), + python_bool(), python_null(), json_bool(), json_null() }); + }); +} + // Python-style tool calls: name(arg1="value1", arg2=123) // Used only by LFM2 for now, so we don't merge it into autoparser common_peg_parser common_chat_peg_builder::python_style_tool_calls( const ordered_json & tools, - bool parallel_tool_calls) { + bool parallel_tool_calls, + bool allow_json_literals) { if (!tools.is_array() || tools.empty()) { return eps(); } @@ -504,7 +548,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls( if (is_string_type) { arg_value_parser = string_value_parser; } else { - arg_value_parser = tool_arg_value(python_value()); + arg_value_parser = tool_arg_value(allow_json_literals ? 
python_or_json_value() : python_value()); } // Full argument: name="value" or name=value diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h index be92f17d909..a4643fbea86 100644 --- a/common/chat-peg-parser.h +++ b/common/chat-peg-parser.h @@ -132,9 +132,13 @@ class common_chat_peg_builder : public common_peg_parser_builder { // Helper for Python-style function call format: name(arg1="value1", arg2=123) // Used by LFM2 and similar templates common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools, - bool parallel_tool_calls); + bool parallel_tool_calls, + bool allow_json_literals); private: + // Python values plus JSON true/false/null. + common_peg_parser python_or_json_value(); + // Implementation helpers for standard_json_tools — one per JSON tool call layout mode common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools, const std::string & args_key, @@ -195,4 +199,3 @@ struct tagged_peg_parser { tagged_peg_parser build_tagged_peg_parser( const std::function & fn); - diff --git a/common/chat.cpp b/common/chat.cpp index ef151691c38..24e58ab0640 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1608,42 +1608,51 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp return data; } -// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt -// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls. -// - Reasoning: {reasoning} (optional) -// - Content: text before a tool call (optional) -// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")] -// Tool calls can appear multiple times (parallel tool calls supported) -static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, - const autoparser::generation_params & inputs) { +// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable +// (except dotted names and JSON literals true/false/null). 
+// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional reasoning. +// tool_list_tokens preserves LFM2 system tool-list markers. +static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, + const autoparser::generation_params & inputs, + bool tool_list_tokens) { common_chat_params data; - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.supports_thinking = true; - data.preserved_tokens = { - "<|tool_list_start|>", - "<|tool_list_end|>", - "<|tool_call_start|>", - "<|tool_call_end|>", - "", - "", - }; - - auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); - auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; - auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; - const std::string TOOL_CALL_START = "<|tool_call_start|>"; const std::string TOOL_CALL_END = "<|tool_call_end|>"; + const std::string TOOL_LIST_START = "<|tool_list_start|>"; + const std::string TOOL_LIST_END = "<|tool_list_end|>"; const std::string THINK_START = ""; const std::string THINK_END = ""; const std::string GEN_PROMPT = "<|im_start|>assistant\n"; + // Copy reasoning to the "thinking" field the template expects + auto adjusted_messages = json::array(); + for (auto msg : inputs.messages) { + if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) { + msg["thinking"] = msg.at("reasoning_content"); + } + adjusted_messages.push_back(msg); + } + + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.supports_thinking = true; + data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, 
THINK_START, THINK_END }; + if (tool_list_tokens) { + data.preserved_tokens.push_back(TOOL_LIST_START); + data.preserved_tokens.push_back(TOOL_LIST_END); + } + data.thinking_start_tag = THINK_START; data.thinking_end_tag = THINK_END; + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); + // Gate by reasoning format and whether the template supports + auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE && + tmpl.source().find(THINK_START) != std::string::npos; + auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; + if (inputs.has_continuation()) { const auto & msg = inputs.continue_msg; @@ -1660,7 +1669,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat auto end = p.end(); auto reasoning = p.eps(); - if (extract_reasoning && inputs.enable_thinking) { + if (extract_reasoning) { reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END); } @@ -1670,7 +1679,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat auto tool_calls = p.rule("tool-calls", p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) + - p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) + + p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) + p.literal(TOOL_CALL_END) ) ); @@ -1697,93 +1706,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START } }; } - return data; -} - -// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens. -// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>. -// - Reasoning: {reasoning} (optional) -// - Content: text before a tool call (optional) -// - Tool calls: Python-style, e.g. 
[function_name(arg1="value1", arg2="value2")] -// Tool calls can appear multiple times (parallel tool calls supported) -static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl, - const autoparser::generation_params & inputs) { - common_chat_params data; - - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.supports_thinking = true; - data.preserved_tokens = { - "<|tool_call_start|>", - "<|tool_call_end|>", - "", - "", - }; - - auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); - auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; - auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; - - const std::string THINK_START = ""; - const std::string THINK_END = ""; - const std::string GEN_PROMPT = "<|im_start|>assistant\n"; - - data.thinking_start_tag = THINK_START; - data.thinking_end_tag = THINK_END; - - if (inputs.has_continuation()) { - const auto & msg = inputs.continue_msg; - - data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; - if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { - data.generation_prompt += THINK_END + msg.render_content(); - } - - data.prompt += data.generation_prompt; - } - - auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto generation_prompt = p.literal(GEN_PROMPT); - auto end = p.end(); - - auto reasoning = p.eps(); - if (extract_reasoning && inputs.enable_thinking) { - reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END); - } - - if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { - return generation_prompt + reasoning + p.content(p.rest()) + end; - } - - auto tool_calls = p.rule("tool-calls", - p.trigger_rule("tool-call", - 
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) - ) - ); - - auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["})); - auto maybe_start = p.optional(p.literal("<|tool_call_start|>")); - return generation_prompt + reasoning + content + maybe_start + tool_calls + end; - }); - - data.parser = parser.save(); - - if (include_grammar) { - data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool.at("function"); - auto schema = function.at("parameters"); - builder.resolve_refs(schema); - }); - parser.build_grammar(builder, data.grammar_lazy); - }); - foreach_function(inputs.tools, [&](const json & tool) { - const std::string name = tool.at("function").at("name"); - data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" }); - }); - } return data; } @@ -2298,14 +2220,14 @@ std::optional common_chat_try_specialized_template( if (is_lfm2_template(src)) { LOG_DBG("Using specialized template: LFM2\n"); - return common_chat_params_init_lfm2(tmpl, params); + return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true); } // LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens if (src.find("List of tools: [") != std::string::npos && src.find("<|tool_list_start|>") == std::string::npos) { LOG_DBG("Using specialized template: LFM2.5\n"); - return common_chat_params_init_lfm2_5(tmpl, params); + return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false); } // GigaChatV3 format detection diff --git a/common/imatrix-loader.cpp b/common/imatrix-loader.cpp new file mode 100644 index 00000000000..efe9aecee3f --- /dev/null +++ b/common/imatrix-loader.cpp @@ -0,0 +1,165 @@ +#include "imatrix-loader.h" +#include "common.h" +#include "log.h" +#include "gguf.h" + +#include 
+#include +#include + +static bool common_imatrix_load_legacy(const std::string & fname, common_imatrix & imatrix) { + std::ifstream in(fname, std::ios::binary); + if (!in) { + LOG_ERR("%s: failed to open %s\n", __func__, fname.c_str()); + return false; + } + + int n_entries; + in.read((char *) &n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, fname.c_str()); + return false; + } + + for (int i = 0; i < n_entries; ++i) { + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + std::vector name_as_vec(len + 1); + in.read((char *) name_as_vec.data(), len); + if (in.fail()) { + LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname.c_str()); + return false; + } + name_as_vec[len] = 0; + std::string name{ name_as_vec.data() }; + + int32_t ncall = 0; + in.read((char *) &ncall, sizeof(ncall)); + int32_t nval = 0; + in.read((char *) &nval, sizeof(nval)); + if (in.fail() || nval < 1) { + LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); + return false; + } + + auto & e = imatrix.entries[std::move(name)]; + e.sums.resize(nval); + in.read((char *) e.sums.data(), nval * sizeof(float)); + if (in.fail()) { + LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); + return false; + } + + e.counts.resize(1); + e.counts[0] = ncall; + } + + // the trailing data (chunk count + dataset name) is optional + if (in.peek() != EOF) { + int32_t n_calls = 0; + in.read((char *) &n_calls, sizeof(n_calls)); + imatrix.chunk_count = n_calls; + + if (!in.fail()) { + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + if (!in.fail() && len > 0) { + std::vector dataset(len + 1, 0); + in.read(dataset.data(), len); + if (!in.fail()) { + imatrix.datasets.push_back(dataset.data()); + } + } + } + } + + imatrix.chunk_size = 0; + imatrix.is_legacy = true; + + return true; +} + +bool common_imatrix_load(const std::string & fname, common_imatrix & imatrix) { + struct 
ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), meta_gguf_params); + if (!ctx_gguf) { + return common_imatrix_load_legacy(fname, imatrix); + } + + const int32_t n_entries = gguf_get_n_tensors(ctx_gguf); + if (n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, fname.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); + const int64_t chunk_count_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT); + const int64_t chunk_size_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE); + + if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) { + const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key); + imatrix.datasets.reserve(imatrix.datasets.size() + n); + for (int64_t i = 0; i < n; ++i) { + imatrix.datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i)); + } + } + + imatrix.has_metadata = (datasets_key != -1 && chunk_count_key != -1 && chunk_size_key != -1); + imatrix.chunk_count = (chunk_count_key != -1) ? gguf_get_val_u32(ctx_gguf, chunk_count_key) : 0; + imatrix.chunk_size = (chunk_size_key != -1) ? 
gguf_get_val_u32(ctx_gguf, chunk_size_key) : 0; + + const std::string in_sum2_suffix{ ".in_sum2" }; + const std::string counts_suffix{ ".counts" }; + + std::map> sums_counts_for; + + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name = cur->name; + + if (name.empty()) { continue; } + + if (string_remove_suffix(name, in_sum2_suffix)) { + sums_counts_for[std::move(name)].first = cur; + } else if (string_remove_suffix(name, counts_suffix)) { + sums_counts_for[std::move(name)].second = cur; + } + } + + for (const auto & sc : sums_counts_for) { + const std::string & name = sc.first; + const struct ggml_tensor * in_sum2 = sc.second.first; + const struct ggml_tensor * counts = sc.second.second; + + if (!in_sum2 || !counts) { + LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + auto & e = imatrix.entries[name]; + + const int64_t nval = ggml_nelements(in_sum2); + const int64_t ncounts = ggml_nelements(counts); + + e.sums.resize(nval); + for (int64_t j = 0; j < nval; ++j) { + e.sums[j] = ((const float *) in_sum2->data)[j]; + } + + e.counts.resize(ncounts); + for (int64_t j = 0; j < ncounts; ++j) { + e.counts[j] = std::lround(((const float *) counts->data)[j]); + } + } + + gguf_free(ctx_gguf); + ggml_free(ctx); + return true; +} diff --git a/common/imatrix-loader.h b/common/imatrix-loader.h new file mode 100644 index 00000000000..ed00d724ac8 --- /dev/null +++ b/common/imatrix-loader.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include + +inline constexpr const char * LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; +inline constexpr const char * LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; +inline constexpr const char * LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; + +struct common_imatrix_entry { + std::vector sums; + std::vector counts; +}; + +struct common_imatrix { + std::map 
entries; + std::vector datasets; + int32_t chunk_count = 0; + int32_t chunk_size = 0; + bool is_legacy = false; + bool has_metadata = false; +}; + +bool common_imatrix_load(const std::string & fname, common_imatrix & imatrix); diff --git a/common/speculative.cpp b/common/speculative.cpp index 3f25c0eb57d..8880add5ea7 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -3,13 +3,14 @@ #include "common.h" #include "ggml.h" #include "llama.h" -#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" #include "ngram-mod.h" #include "sampling.h" +#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) + #include #include #include @@ -419,6 +420,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { int32_t n_embd = 0; + bool is_mem_shared = false; + // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. // The last h-row of one process() call needs the first token of the NEXT // call to pair with, so it's stashed here until that next call fires. 
@@ -445,7 +448,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set"); - n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft)); + GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) && + "MTP input row width must match the target h_nextn width"); LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); @@ -491,6 +496,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); + is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt; + pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); i_batch_beg.assign(n_seq, -1); @@ -527,9 +534,11 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { if (N <= 0) { return; } + auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); - if (pos_max < N - 1) { + + if (pos_max < N - 1 && !is_mem_shared) { LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). 
" @@ -572,48 +581,42 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { const size_t row_bytes = (size_t) n_embd * sizeof(float); - common_batch_clear(batch); + // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode + if (!is_mem_shared) { + common_batch_clear(batch); - for (int k = 0; k < n_tokens; ++k) { - common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); - } + for (int k = 0; k < n_tokens; ++k) { + common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); + } - // shift the tgt embeddings to the right by one position - // assumes that the tokens in the batch are sequential for each sequence - // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] - // ^--- this is a problem - // TODO:this is generally true, but would be nice to assert it - { - const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); - std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + // shift the tgt embeddings to the right by one position + // assumes that the tokens in the batch are sequential for each sequence + // i.e. 
we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] + // ^--- this is a problem + // TODO:this is generally true, but would be nice to assert it + { + const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); + std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + } - //{ - // // string with seq_ids in the batch - // std::stringstream ss; - // for (int i = 0; i < n_tokens; ++i) { - // ss << batch_in.seq_id[i][0] << ","; - // } - // LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str()); - //} - } + // fill the pending embeddings from a previous run + auto set_h = [&](int idx, const float * h_row) { + std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); + }; - // fill the pending embeddings from a previous run - auto set_h = [&](int idx, const float * h_row) { - std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); - }; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } - for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { - if (i_batch_beg[seq_id] < 0) { - continue; + set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); } - set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); - } - - const int32_t rc = llama_decode(ctx_dft, batch); - if (rc != 0) { - LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); - return false; + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); + return false; + } } for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { @@ -722,7 +725,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { continue; } - common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + if (is_mem_shared) { + // note: with shared memory (e.g. 
Gemma4 assistants) we use the same position for all draft tokens + // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37 + common_batch_add(batch, id, dp.n_past, { seq_id }, true); + } else { + common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + } std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); } diff --git a/conversion/__init__.py b/conversion/__init__.py index 2c79580f8a3..18162976f45 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -75,9 +75,11 @@ "Gemma3TextModel": "gemma", "Gemma3nForCausalLM": "gemma", "Gemma3nForConditionalGeneration": "gemma", + "Gemma4AssistantForCausalLM": "gemma", "Gemma4ForConditionalGeneration": "gemma", "Gemma4ForCausalLM": "gemma", "Gemma4UnifiedForConditionalGeneration": "gemma", + "Gemma4UnifiedAssistantForCausalLM": "gemma", "GemmaForCausalLM": "gemma", "Glm4ForCausalLM": "glm", "Glm4MoeForCausalLM": "glm", @@ -253,6 +255,7 @@ "Glm4vMoeForConditionalGeneration": "qwen3vl", "GlmOcrForConditionalGeneration": "qwen3vl", "GlmasrModel": "ultravox", + "Granite4VisionForConditionalGeneration": "granite", "GraniteSpeechForConditionalGeneration": "granite", "HunYuanVLForConditionalGeneration": "hunyuan", "Idefics3ForConditionalGeneration": "smolvlm", diff --git a/conversion/gemma.py b/conversion/gemma.py index 2025e782b7f..d8cf8be575c 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -785,6 +785,16 @@ def set_gguf_parameters(self): self.gguf_writer.add_suppress_tokens(suppress_tokens) +@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM") +class Gemma4AssistantModel(Gemma4Model): + model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"]) + self.gguf_writer.add_nextn_predict_layers(self.block_count) 
+ + @ModelBase.register("Gemma4ForConditionalGeneration") class Gemma4VisionAudioModel(MmprojModel): has_audio_encoder = True @@ -798,7 +808,8 @@ def __init__(self, *args, **kwargs): # remap audio hparams if self.hparams_audio: self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128) - self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 + if "hidden_size" in self.hparams_audio: + self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 else: self.has_audio_encoder = False @@ -811,10 +822,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) # audio params - assert self.hparams_audio is not None - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6)) + if self.has_audio_encoder: + assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6)) def is_audio_tensor(self, name: str) -> bool: return "audio_tower" in name or "embed_audio" in name @@ -872,7 +884,7 @@ def __init__(self, *args, **kwargs): assert self.hparams_audio is not None text_embd_dim = self.hparams_vision["mm_embed_dim"] self.hparams_vision["hidden_size"] = text_embd_dim - self.hparams_audio["hidden_size"] = text_embd_dim + self.hparams_audio["hidden_size"] = self.hparams_audio["audio_embed_dim"] # this is a transformer-less vision tower, the params below are redundant but set to avoid error self.hparams_vision["intermediate_size"] = 0 self.hparams_vision["num_layers"] = 0 @@ -897,7 +909,10 @@ def modify_tensors(self, 
data_torch, name, bid): # ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC). # Permute columns so column i aligns with CHW input position i. assert self.hparams_vision is not None - p = self.hparams_vision["model_patch_size"] + if "model_patch_size" in self.hparams_vision: + p = self.hparams_vision["model_patch_size"] + else: + p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"] i = torch.arange(p * p * 3) ch = i // (p * p) row = (i % (p * p)) // p @@ -908,7 +923,10 @@ def modify_tensors(self, data_torch, name, bid): elif "patch_ln1.weight" in name or "patch_ln1.bias" in name: # same permutation for patch_ln1 as patch_dense to align with CHW input order assert self.hparams_vision is not None - p = self.hparams_vision["model_patch_size"] + if "model_patch_size" in self.hparams_vision: + p = self.hparams_vision["model_patch_size"] + else: + p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"] i = torch.arange(p * p * 3) ch = i // (p * p) row = (i % (p * p)) // p diff --git a/conversion/granite.py b/conversion/granite.py index 647269ba740..53441fe5701 100644 --- a/conversion/granite.py +++ b/conversion/granite.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import Any, Callable, Iterable, TYPE_CHECKING import torch @@ -13,7 +14,7 @@ from .mamba import Mamba2Model -@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -46,11 +47,29 @@ def set_gguf_parameters(self): self.gguf_writer.add_logit_scale(logits_scale) logger.info("gguf: (granite) logits_scale = %s", logits_scale) + # If being used as the base for Granite4 Vision, add deepstack_layer_arr + if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"): + normalized_projector_map 
= Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams) + deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels + for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map): + # Skip the first projector which is handled as the base embedding + # stream like normal + if proj_idx == 0: + continue + deepstack_mapping_arr[llm_layer] = proj_idx + self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr) + @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: name, gen = item - if name.startswith("encoder."): - return None + # Skip multimodal tensors + if ( + name.startswith(("encoder.")) + or "image_" in name + or "layerwise_projectors" in name + or "spatial_projectors" in name + ): + return return super().filter_tensors(item) @@ -241,7 +260,8 @@ def set_gguf_parameters(self): assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" def set_vocab(self): - self.hparams["pad_vocab_size_multiple"] = 8 + # For models with no ssm layers, don't pad for mamba2 + self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1 Mamba2Model.set_vocab(self) @@ -326,3 +346,133 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.squeeze(1) yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Granite4VisionForConditionalGeneration") +class Granite4VisionMmprojModel(MmprojModel): + has_vision_encoder = True + has_audio_encoder = False + + @staticmethod + def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]: + """Normalize both deepstack and spatial projector maps to the form: + (vision_layer, llm_layer, , type_index) + + This is then used to populate the following mappings: + - vision_feature_layers (mmproj hparam): ordered list of all + vision_layer values where order 
corresponds with the order of the + stacked projector tensors + NOTE: Values may appear multiple times for spatial projectors + - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to + the index of the corresponding projector in the stacked tensors + - deepstack_layer_arr (llm hparam): per-text-layer array indicating + which input vision feature should be injected at that layer + (-1 if none) + + Output: (vision_layer, llm_layer, , type_index) + """ + deepstack_map = global_config.get("deepstack_layer_map", []) # [[vis_layer, llm_layer], ...] + spatial_layers = global_config.get("spatial_target_layers", []) # [llm_layer, ...] + n_text_layers = global_config["text_config"]["num_hidden_layers"] + n_vision_layers = global_config["vision_config"]["num_hidden_layers"] + normalized_projector_map = [] + if deepstack_map: + for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)): + if vision_layer < 0: + vision_layer = n_vision_layers + vision_layer + if llm_layer < 0: + llm_layer = n_text_layers + llm_layer + normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx)) + if spatial_layers: + spatial_vision_layer = global_config.get("spatial_vision_layer", -1) + if spatial_vision_layer < 0: + spatial_vision_layer = n_vision_layers + spatial_vision_layer + for spatial_idx, llm_layer in enumerate(spatial_layers): + normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx)) + return list(sorted(normalized_projector_map, key=(lambda entry: entry[1]))) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + normalized_projector_map = self.get_normalized_projector_map(self.global_config) + self._n_proj = len(normalized_projector_map) + + self._tensor_prefix_map = { + f"model.{proj_type}_projectors.{type_idx}": proj_idx + for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map) + } + self._vision_feature_layers = [vision_layer for 
vision_layer, _, _, _ in normalized_projector_map] + self._spatial_offsets = [ + type_idx if proj_type == "spatial" else -1 + for _, _, proj_type, type_idx in normalized_projector_map + ] + + def set_gguf_parameters(self): + assert self.hparams_vision is not None + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION) + + # SigLIP encoder hparams + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + + # Preprocessor + self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384)) + + # QFormer projector config + ds_rate = self.global_config["downsample_rate"] + ds_parts = ds_rate.split("/") + assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}" + query_side, window_side = [int(p) for p in ds_parts] + self.gguf_writer.add_vision_projector_query_side(query_side) + self.gguf_writer.add_vision_projector_window_side(window_side) + + # Set vision feature layers + self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers) + + # Set the spatial offests per projector + self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets) + + # Add flattened image grind pinpoints (resolution candidates internally) + if pinpoints := self.global_config.get("image_grid_pinpoints"): + # Flatten with h, w -> w, h inversion + pinpoints = [val for h, w in pinpoints for val in (w, h)] + self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + if ("vision_model.head" in name or name.startswith("lm_head")): + return None + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Detect projector tensors and bin them + projector_idx = None + for 
prefix, proj_idx in self._tensor_prefix_map.items(): + if name.startswith(prefix): + projector_idx = proj_idx + break + if projector_idx is not None: + # If this projector tensor has a block id within the projector, + # alias the bid to projector_idx + # + # TODO: currently, none of the Granite 4 Vision models have + # projectors with multiple QFormer layers, so the `layer.{}` index + # is always 0. This allows us to simply map to a single `bid` that + # matches the projector index. If this changes, we'll need a + # convention that merges the two IDs. + id_matches = list(re.finditer(r"\.([0-9]+)\.", name)) + all_ids = [int(m.group(1)) for m in id_matches] + assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names" + # If not layer id, just use the projector index + new_bid = projector_idx + if len(all_ids) == 1: + new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:] + else: # len(all_ids) == 2 + new_bid = projector_idx # + all_ids[1] + new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:] + yield from super().modify_tensors(data_torch, new_name, new_bid) + return + yield from super().modify_tensors(data_torch, name, bid) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 33bf4b703a2..c03871e93c0 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace: "--base-model-id", type=str, help="the model ID of the base model, if it is not available locally or in the adapter config. 
If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", ) + parser.add_argument( + "--trust-remote-code", default=False, action="store_true", + help="trust remote code in the model", + ) parser.add_argument( "lora_path", type=Path, help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", @@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]: +def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]: from huggingface_hub import try_to_load_from_cache # normally, adapter does not come with base model config, we need to load it from AutoConfig - config = AutoConfig.from_pretrained(hf_model_id) + config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code) cache_dir = try_to_load_from_cache(hf_model_id, "config.json") cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None @@ -372,13 +376,13 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None] # load base model if base_model_id is not None: logger.info(f"Loading base model from Hugging Face: {base_model_id}") - hparams, dir_base_model = load_hparams_from_hf(base_model_id) + hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code) elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") try: - hparams, dir_base_model = load_hparams_from_hf(model_id) + hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code) except OSError as e: logger.error(f"Failed to load base model config: {e}") logger.error("Please try downloading the base model and add its 
path to --base") @@ -393,7 +397,9 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None] with torch.inference_mode(): try: - model_class = get_model_class(hparams["architectures"][0]) + model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0] + logger.info("Using model architecture: %s", model_arch) + model_class = get_model_class(model_arch) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 648c6fcaba7..0a7119b4e1f 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -355,6 +355,78 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + float sumf = 0; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + float summs = 0.0f; + + for (int ib = 0; ib < nb; ++ib) { + const block_q4_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + + const v128_t raw = wasm_v128_load(x0->qs); + const v128_t v0s = wasm_v128_and(raw, wasm_i8x16_splat(0x0F)); + const v128_t v1s = wasm_u8x16_shr(raw, 4); + + const v128_t ys_lo = wasm_v128_load(y0->qs); + const v128_t ys_hi = wasm_v128_load(y0->qs + 16); + + const v128_t v0s_l = wasm_u16x8_extend_low_u8x16(v0s); + const v128_t v0s_h = wasm_u16x8_extend_high_u8x16(v0s); + const v128_t ylo_l = 
wasm_i16x8_extend_low_i8x16(ys_lo); + const v128_t ylo_h = wasm_i16x8_extend_high_i8x16(ys_lo); + const v128_t v1s_l = wasm_u16x8_extend_low_u8x16(v1s); + const v128_t v1s_h = wasm_u16x8_extend_high_u8x16(v1s); + const v128_t yhi_l = wasm_i16x8_extend_low_i8x16(ys_hi); + const v128_t yhi_h = wasm_i16x8_extend_high_i8x16(ys_hi); + + const v128_t acc = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(v0s_l, ylo_l), + wasm_i32x4_dot_i16x8(v0s_h, ylo_h)), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(v1s_l, yhi_l), + wasm_i32x4_dot_i16x8(v1s_h, yhi_h))); + + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul( + wasm_f32x4_convert_i32x4(acc), + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; + + *s = sumf; + +#else + UNUSED(nb); + UNUSED(x); + UNUSED(y); + UNUSED(sumf); + + ggml_vec_dot_q4_1_q8_1_generic( + n, s, bs, vx, bx, vy, by, nrc); +#endif +} + void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 0ecf7ae02ac..9e54b676b93 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -38,6 +38,7 @@ #include "kleidiai.h" #include "ggml-cpu.h" +#include "ggml-cpu-impl.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-threading.h" @@ -61,7 +62,8 @@ struct ggml_kleidiai_context { ggml_kleidiai_kernels * kernels_q8; int sme_thread_cap; // <= 0 means “SME disabled/unknown”; int thread_hint; // <= 0 means “no hint” -} static ctx = { CPU_FEATURE_NONE, nullptr, nullptr, 0, -1 }; + int chunk_multiplier; +} static ctx = { CPU_FEATURE_NONE, nullptr, nullptr, 0, -1, 4 }; 
static const char* cpu_feature_to_string(cpu_feature f) { if (f == CPU_FEATURE_NONE) { @@ -186,8 +188,9 @@ static void init_kleidiai_context(void) { if (!initialized) { initialized = true; - const char *env_sme = getenv("GGML_KLEIDIAI_SME"); - const char *env_threads = getenv("GGML_TOTAL_THREADS"); + const char *env_sme = getenv("GGML_KLEIDIAI_SME"); + const char *env_threads = getenv("GGML_TOTAL_THREADS"); + const char *env_chunk_mult = getenv("GGML_KLEIDIAI_CHUNK_MULTIPLIER"); const bool cpu_has_sme = ggml_cpu_has_sme(); size_t detected_smcus = 0; @@ -204,6 +207,14 @@ static void init_kleidiai_context(void) { } } + if (env_chunk_mult) { + bool ok = false; + int multiplier = parse_uint_env(env_chunk_mult, "GGML_KLEIDIAI_CHUNK_MULTIPLIER", &ok); + if (ok && multiplier > 0) { + ctx.chunk_multiplier = multiplier; + } + } + // SME policy: // - If CPU doesn't support SME: SME always off. // - Else: @@ -296,6 +307,50 @@ static inline size_t align_up(size_t value, size_t alignment) { return remainder == 0 ? value : value + (alignment - remainder); } +static inline size_t gcd_size(size_t a, size_t b) { + while (b != 0) { + const size_t t = a % b; + a = b; + b = t; + } + return a; +} + +static inline bool lcm_size(size_t a, size_t b, size_t & result) { + if (a == 0 || b == 0) { + result = 0; + return false; + } + const size_t g = gcd_size(a, b); + const size_t q = a / g; + if (q > SIZE_MAX / b) { + return false; + } + result = q * b; + return true; +} + +static inline size_t ceil_div_size(size_t a, size_t b) { + return b == 0 ? 
0 : (a + b - 1) / b; +} + +struct kleidiai_block_args { + size_t lhs_bl; + size_t rhs_bl; + size_t pack_bl; +}; + +static inline kleidiai_block_args kleidiai_get_block_args(ggml_type rhs_type) { + switch (rhs_type) { + case GGML_TYPE_Q4_0: + return { QK4_0, QK4_0, QK4_0 }; + case GGML_TYPE_Q8_0: + return { 0, 0, QK8_0 }; + default: + return { 0, 0, 0 }; + } +} + static inline bool kleidiai_pack_fallback_allowed() { if (ctx.sme_thread_cap <= 0) { return false; @@ -746,8 +801,10 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t n_step; size_t lhs_packed_size; size_t lhs_offset; - size_t n_offset; - size_t n_cols; + size_t lhs_bl; + size_t rhs_bl; + size_t pack_bl; + size_t lhs_packed_offset0; int assigned_threads; int thread_begin; int thread_end; @@ -772,6 +829,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { continue; } + const kleidiai_block_args block_args = kleidiai_get_block_args(kernels->rhs_type); + runtime[runtime_count] = { slot, kernels, @@ -784,7 +843,9 @@ class tensor_traits : public ggml::cpu::tensor_traits { kinfo->get_n_step(), 0, 0, - 0, + block_args.lhs_bl, + block_args.rhs_bl, + block_args.pack_bl, 0, 0, 0, @@ -795,45 +856,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { } if (runtime_count == 0) { - ggml_kleidiai_kernels * fallback = ggml_kleidiai_select_kernels(ctx.features, dst); - if (!fallback) { - return false; - } - kernel_info * kinfo = is_gemv ? &fallback->gemv : &fallback->gemm; - lhs_packing_info * linfo = is_gemv ? 
&fallback->gemv_lhs_info : &fallback->gemm_lhs_info; - rhs_packing_info * rinfo = &fallback->rhs_info; - if (!kinfo || !linfo || !linfo->packed_size_ex || !linfo->pack_func_ex || - !kinfo->get_rhs_packed_offset_ex || !kinfo->run_kernel_ex || !kinfo->get_dst_offset || - !rinfo || !rinfo->pack_func_ex || !rinfo->packed_size_ex) { - return false; - } - kernel_chain[0] = fallback; - runtime[0] = { - 0, - fallback, - kinfo, - linfo, - kinfo->get_mr(), - kinfo->get_nr(), - kinfo->get_kr(), - kinfo->get_sr(), - kinfo->get_n_step(), - 0, - 0, - 0, - 0, - 0, - 0, - 0, - nullptr - }; - size_t rhs_size_fallback = 0; - const uint8_t * rhs_base = weight_for_slot(0, rhs_size_fallback); - if (!rhs_base) { - rhs_base = static_cast(src0->data); - } - runtime[0].rhs_base = rhs_base; - runtime_count = 1; + GGML_LOG_WARN("kleidiai: no runtime kernel slot available for supported op %s\n", dst->name); + return false; } const int nth_total = params->nth > 0 ? params->nth : 1; @@ -846,6 +870,13 @@ class tensor_traits : public ggml::cpu::tensor_traits { break; } } + int non_sme_slot = -1; + for (int i = 0; i < runtime_count; ++i) { + if ((runtime[i].kernels->required_cpu & CPU_FEATURE_SME) != CPU_FEATURE_SME) { + non_sme_slot = i; + break; + } + } const int sme_cap_limit = ctx.sme_thread_cap; const bool use_hybrid = sme_cap_limit > 0 && @@ -864,12 +895,15 @@ class tensor_traits : public ggml::cpu::tensor_traits { if (!hybrid_enabled) { int chosen_slot = 0; if (too_small_for_hybrid && sme_slot != -1) { - chosen_slot = sme_slot; + chosen_slot = nth_total > sme_cap_limit && non_sme_slot != -1 ? non_sme_slot : sme_slot; } else if (runtime_count > 1 && ctx.sme_thread_cap > 0 && nth_total > ctx.sme_thread_cap) { chosen_slot = 1; } if (chosen_slot != 0 && chosen_slot < runtime_count) { runtime[0] = runtime[chosen_slot]; + runtime[0].assigned_threads = 0; + runtime[0].thread_begin = 0; + runtime[0].thread_end = 0; } runtime_count = runtime_count > 0 ? 
1 : 0; @@ -896,6 +930,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { int fallback_indices[GGML_KLEIDIAI_MAX_KERNEL_SLOTS]; int fallback_count = 0; + // The current hybrid chain is bounded to SME + one non-SME fallback slot. + GGML_ASSERT(GGML_KLEIDIAI_MAX_KERNEL_SLOTS == 2); for (int i = 0; i < runtime_count; ++i) { if (i == sme_slot) { continue; @@ -952,73 +988,67 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t cursor = 0; for (int i = 0; i < runtime_count; ++i) { - const ggml_type slot_rhs_type = runtime[i].kernels->rhs_type; - const size_t slot_pack_size_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? QK8_0 : 0; - runtime[i].lhs_packed_size = runtime[i].lhs_info->packed_size_ex(m, k, slot_pack_size_arg, runtime[i].mr, runtime[i].kr, runtime[i].sr); + runtime[i].lhs_packed_size = runtime[i].lhs_info->packed_size_ex(m, k, runtime[i].pack_bl, runtime[i].mr, runtime[i].kr, runtime[i].sr); cursor = align_up(cursor, GGML_KLEIDIAI_PACK_ALIGN); runtime[i].lhs_offset = cursor; + runtime[i].lhs_packed_offset0 = runtime[i].lhs_info->get_packed_offset_ex(0, k, runtime[i].lhs_bl, runtime[i].mr, runtime[i].kr, runtime[i].sr); cursor += runtime[i].lhs_packed_size; } GGML_ASSERT(cursor <= params->wsize); uint8_t * scratch = static_cast(params->wdata); - size_t assigned_cols = 0; - uint64_t weighted_total = 0; - if (runtime_count > 1 && sme_slot != -1) { - for (int i = 0; i < runtime_count; ++i) { - const uint64_t weight = (i == sme_slot) ? (sme_cap << 1) : 1; - weighted_total += (uint64_t)runtime[i].assigned_threads * weight; - } - } + size_t common_step = 1; for (int i = 0; i < runtime_count; ++i) { - runtime[i].n_offset = assigned_cols; if (runtime[i].assigned_threads == 0) { - runtime[i].n_cols = 0; continue; } - const size_t remaining_cols = n - assigned_cols; - if (remaining_cols == 0) { - runtime[i].n_cols = 0; - continue; - } - const size_t step = runtime[i].n_step ? 
runtime[i].n_step : 1; - size_t target = 0; - if (weighted_total > 0) { - const uint64_t weight = (i == sme_slot) ? (sme_cap << 1) : 1; - target = (size_t)(((uint64_t)n * runtime[i].assigned_threads * weight) / weighted_total); - } else { - target = (size_t)(((uint64_t)n * runtime[i].assigned_threads) / nth_total); - } - target = std::min(target, remaining_cols); - size_t aligned = round_down(target, step); - if (aligned == 0 && remaining_cols >= step) { - aligned = step; + size_t next_step = 0; + if (!lcm_size(common_step, runtime[i].n_step ? runtime[i].n_step : 1, next_step)) { + return false; } - runtime[i].n_cols = aligned; - assigned_cols += aligned; + common_step = next_step; } - - if (assigned_cols < n) { - for (int i = runtime_count - 1; i >= 0; --i) { - if (runtime[i].assigned_threads > 0) { - runtime[i].n_cols += n - assigned_cols; - break; - } - } + GGML_ASSERT(common_step > 0); + + const bool disable_chunking = ggml_is_numa(); + const size_t chunk_multiplier = std::max(1, ctx.chunk_multiplier); + const size_t chunk_divisor = (nth_total == 1 || disable_chunking) ? (size_t)nth_total : (size_t)nth_total * chunk_multiplier; + size_t chunk_cols = align_up(std::max(1, ceil_div_size(n, chunk_divisor)), common_step); + if (chunk_cols == 0) { + chunk_cols = common_step; } + // If common_step is larger than n, the loop below runs one valid tail chunk + // with cols == n. 
+ const size_t nchunk_size = std::max(1, ceil_div_size(n, chunk_cols)); + GGML_ASSERT(nchunk_size <= (size_t)INT_MAX); + const int nchunk = (int)nchunk_size; const size_t dst_stride = dst->nb[1]; + auto run_chunk = [&](runtime_slot & slot, size_t global_start, size_t cols, uint8_t * dst_batch_base) { + const size_t rhs_packed_offset = slot.kernel->get_rhs_packed_offset_ex(global_start, k, slot.rhs_bl); + const size_t dst_offset = slot.kernel->get_dst_offset(0, global_start, dst_stride); + + const uint8_t * lhs_ptr = scratch + slot.lhs_offset + slot.lhs_packed_offset0; + const uint8_t * rhs_ptr = slot.rhs_base + rhs_packed_offset; + float * dst_ptr = reinterpret_cast(dst_batch_base + dst_offset); + + slot.kernel->run_kernel_ex(m, cols, k, slot.rhs_bl, + lhs_ptr, + rhs_ptr, + dst_ptr, + dst_stride, + sizeof(float), + -FLT_MAX, + FLT_MAX); + }; + for (int64_t batch_idx = 0; batch_idx < ne12; ++batch_idx) { const uint8_t * lhs_batch_base = static_cast(src1->data) + batch_idx * src1->nb[2]; uint8_t * dst_batch_base = static_cast(dst->data) + batch_idx * dst->nb[2]; if (runtime[local_slot].assigned_threads > 0) { runtime_slot & slot = runtime[local_slot]; - const ggml_type slot_rhs_type = slot.kernels->rhs_type; - const size_t slot_lhs_exec_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0; const int64_t m_roundup_mr = kai_roundup((int64_t)m, (int64_t)slot.mr); int64_t max_threads = slot.mr ? (m_roundup_mr / (int64_t)slot.mr) : slot.assigned_threads; max_threads = std::max(1, max_threads); @@ -1031,8 +1061,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { const int64_t m_start = (int64_t)local_ith * num_m_per_thread0; const int64_t m_count = (local_ith == use_threads - 1) ? 
num_m_per_threadN_1 : num_m_per_thread0; - const size_t base_packed_off = slot.lhs_info->get_packed_offset_ex(m_start, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr); - const size_t next_block_off = slot.lhs_info->get_packed_offset_ex(m_start + slot.mr, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr); + const size_t base_packed_off = slot.lhs_info->get_packed_offset_ex(m_start, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr); + const size_t next_block_off = slot.lhs_info->get_packed_offset_ex(m_start + slot.mr, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr); const size_t row_stride_bytes = slot.mr ? (next_block_off - base_packed_off) / slot.mr : 0; int64_t remaining = m_count; @@ -1049,7 +1079,7 @@ class tensor_traits : public ggml::cpu::tensor_traits { const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes; void * dst_ptr = lhs_packed + dst_off; - slot.lhs_info->pack_func_ex(take, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr, 0, src_ptr, src1->nb[1], dst_ptr); + slot.lhs_info->pack_func_ex(take, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr, 0, src_ptr, src1->nb[1], dst_ptr); cur += take; remaining -= take; @@ -1057,49 +1087,29 @@ class tensor_traits : public ggml::cpu::tensor_traits { } } + if (ith_total == 0) { + ggml_threadpool_chunk_set(params->threadpool, nth_total); + } + + // Publishes both LHS packing and the initialized dynamic chunk queue. ggml_barrier(params->threadpool); runtime_slot & slot = runtime[local_slot]; - if (slot.n_cols > 0 && slot.assigned_threads > 0) { - int64_t active_threads = slot.assigned_threads; - const int64_t max_threads = slot.n_step ? 
(slot.n_cols / slot.n_step) : slot.assigned_threads; - if (max_threads > 0) { - active_threads = std::min(active_threads, std::max(1, max_threads)); + int current_chunk = ith_total; + while (current_chunk < nchunk) { + const size_t global_start = (size_t)current_chunk * chunk_cols; + if (global_start >= n) { + break; } - active_threads = std::max(1, active_threads); - - if (local_ith < active_threads) { - const size_t step = slot.n_step ? slot.n_step : 1; - const size_t chunk0 = round_down((size_t)(slot.n_cols / active_threads), step); - const size_t chunkN = slot.n_cols - (active_threads - 1) * chunk0; - const size_t local_start = (size_t)local_ith * chunk0; - const size_t cols = (local_ith == active_threads - 1) ? chunkN : chunk0; - - if (cols > 0) { - const ggml_type slot_rhs_type = slot.kernels->rhs_type; - const size_t slot_lhs_exec_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0; - const size_t slot_rhs_block_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? 
0 : 0; - const size_t global_start = slot.n_offset + local_start; - const size_t lhs_packed_offset = slot.lhs_info->get_packed_offset_ex(0, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr); - const size_t rhs_packed_offset = slot.kernel->get_rhs_packed_offset_ex(global_start, k, slot_rhs_block_arg); - const size_t dst_offset = slot.kernel->get_dst_offset(0, global_start, dst_stride); - - const uint8_t * lhs_ptr = scratch + slot.lhs_offset + lhs_packed_offset; - const uint8_t * rhs_ptr = slot.rhs_base + rhs_packed_offset; - float * dst_ptr = reinterpret_cast(dst_batch_base + dst_offset); - - slot.kernel->run_kernel_ex(m, cols, k, slot_rhs_block_arg, - lhs_ptr, - rhs_ptr, - dst_ptr, - dst_stride, - sizeof(float), - -FLT_MAX, - FLT_MAX); - } + + const size_t cols = std::min(chunk_cols, n - global_start); + if (cols > 0) { + // KleidiAI GEMM/GEMV kernels accept arbitrary final tail widths; + // only non-tail chunks are guaranteed to be n_step-aligned. + run_chunk(slot, global_start, cols, dst_batch_base); } + + current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); } if (batch_idx != ne12 - 1) { diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 4b0426590ac..bdfbfd2d387 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -682,12 +682,16 @@ static __global__ void mul_mat_vec_q( template __launch_bounds__(get_mmvq_mmid_max_batch_for_device()*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q_moe( - const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, - float * __restrict__ dst, + const void * vx_ptr, const void * vy_ptr, const int32_t * ids_ptr, + float * dst_ptr, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst, const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint32_t ncols_dst, 
const uint32_t ids_stride) { + const void * GGML_CUDA_RESTRICT vx = vx_ptr; + const void * GGML_CUDA_RESTRICT vy = vy_ptr; + const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr; + float * GGML_CUDA_RESTRICT dst = dst_ptr; constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; @@ -707,6 +711,7 @@ static __global__ void mul_mat_vec_q_moe( return; } + ggml_cuda_pdl_sync(); const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride]; const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y); @@ -726,6 +731,8 @@ static __global__ void mul_mat_vec_q_moe( } } + ggml_cuda_pdl_lc(); + // Warp-level reduction only - no shared memory needed #pragma unroll for (int i = 0; i < c_rows_per_block; ++i) { @@ -794,8 +801,9 @@ static void mul_mat_vec_q_moe_launch( const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block; const dim3 block_nums(nblocks_rows, nchannels_dst); const dim3 block_dims(warp_size, ncols_dst); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); - mul_mat_vec_q_moe<<>>( + ggml_cuda_kernel_launch(mul_mat_vec_q_moe, launch_params, vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x, stride_row_x, stride_col_y, stride_col_dst, stride_channel_x, stride_channel_y, stride_channel_dst, diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index c411e4aeaec..2a41215fd13 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -558,7 +558,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32; cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16; - cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, 
kernel_cpy_f32_f32, kernel_cpy_i32_i32; + cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32; cl_kernel kernel_mul_mat_f32_f32; cl_kernel kernel_mul_mat_f16_f16; cl_kernel kernel_mul_mat_f16_f32_1row; @@ -639,7 +639,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc; cl_kernel kernel_upscale; cl_kernel kernel_upscale_bilinear; - cl_kernel kernel_concat_f32; + cl_kernel kernel_concat_f32, kernel_concat_f32_pack; cl_kernel kernel_conv_2d_f16; cl_kernel kernel_conv_2d_f32; cl_kernel kernel_conv_2d_f16_f32; @@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err)); CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err)); CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err)); CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err)); GGML_LOG_CONT("."); } @@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err)); CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c nth *= 2; } - size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12}; + int nchunks = 1; + if 
(src0->type == GGML_TYPE_F32) { + const int chunk_target = nth * 4; + nchunks = (ne00 + chunk_target - 1) / chunk_target; + nchunks = MAX(1, MIN(nchunks, 64)); + } + + size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12}; size_t local_work_size[] = {(size_t)nth, 1, 1}; backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); @@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con int nth = MIN(64, ne0); - cl_kernel kernel = backend_ctx->kernel_concat_f32; + const bool concat_pack = (dim == 0 && ne0 < 32); + cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack + : backend_ctx->kernel_concat_f32; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); @@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim)); - size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3}; - size_t local_work_size[] = {(size_t)nth, 1, 1}; + if (concat_pack) { + // packed kernel needs the dst dims to unflatten its 1-D row index. 
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3)); + + const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel); + const int base = MIN(64, maxwg); + const int tpr = MIN(ne0, base); // threads per row + const int rpw = MAX(1, base / tpr); // rows per workgroup + const int lsz = tpr * rpw; + const int nrows = ne1*ne2*ne3; + const int nwg = (nrows + rpw - 1) / rpw; + size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1}; + size_t local_work_size[] = {(size_t)lsz, 1, 1}; + backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst); + } else { + size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } } static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { @@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } else if (backend_ctx->gpu_family == ADRENO) { nth0 = 64; nth1 = 2; - ndst = 4; + ndst = 16; } else { GGML_ASSERT(false && "TODO: Unknown GPU"); } @@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_cpy_f32_f16; break; case GGML_TYPE_F32: - kernel = backend_ctx->kernel_cpy_f32_f32; + kernel = ne00 < 32 ? 
backend_ctx->kernel_cpy_f32_f32_pack + : backend_ctx->kernel_cpy_f32_f32; break; default: GGML_ASSERT(false && "not implemented"); @@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); - const int nth = MIN(64, ne00); + if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) { + const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel); + const int base = MIN(64, maxwg); + const int tpr = MIN(ne00, base); // threads per row + const int rpw = MAX(1, base / tpr); // rows per workgroup + const int lsz = tpr * rpw; // <= base <= maxwg + const int nrows = ne01*ne02*ne03; + const int nwg = (nrows + rpw - 1) / rpw; - size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; - size_t local_work_size[] = {(size_t)nth, 1, 1}; + size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1}; + size_t local_work_size[] = {(size_t)lsz, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1); + } else { + const int nth = MIN(64, ne00); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); + size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); + } } static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { diff --git a/ggml/src/ggml-opencl/kernels/concat.cl b/ggml/src/ggml-opencl/kernels/concat.cl index 0c1b3d785ca..2fbd7851d3d 100644 --- a/ggml/src/ggml-opencl/kernels/concat.cl +++ b/ggml/src/ggml-opencl/kernels/concat.cl @@ -49,3 +49,70 @@ kernel void kernel_concat_f32( *y = *x; } } + +kernel void kernel_concat_f32_pack( + global const char * src0, + ulong offset0, + global const char 
* src1, + ulong offset1, + global char * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3, + int dim, + int ne1, + int ne2, + int ne3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int lsz = get_local_size(0); + int tpr = min(ne0, lsz); // threads per row + int rpw = lsz / tpr; // rows per workgroup + int lid = get_local_id(0); + int row = get_group_id(0)*rpw + lid / tpr; + int lane = lid - (lid / tpr) * tpr; + + int nrows = ne1*ne2*ne3; + if (row >= nrows) { + return; + } + + int i1 = row % ne1; + int t = row / ne1; + int i2 = t % ne2; + int i3 = t / ne2; + + int o[4] = {0, 0, 0, 0}; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); + + for (int i0 = lane; i0 < ne0; i0 += tpr) { + global const float * x; + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00); + } else { + x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10); + } + + global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + *y = *x; + } +} diff --git a/ggml/src/ggml-opencl/kernels/cpy.cl b/ggml/src/ggml-opencl/kernels/cpy.cl index 820aa538a34..adbd2e766d2 100644 --- a/ggml/src/ggml-opencl/kernels/cpy.cl +++ b/ggml/src/ggml-opencl/kernels/cpy.cl @@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_cpy_f32_f32_pack( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne0, + int ne1, + int ne2, + int ne3, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = (global float*)((global char*)src0 + offset0); + dst = 
(global float*)((global char*)dst + offsetd); + + int lsz = get_local_size(0); + int tpr = min(ne00, lsz); // threads per row + int rpw = lsz / tpr; // rows per workgroup + int lid = get_local_id(0); + int row = get_group_id(0)*rpw + lid / tpr; + int lane = lid - (lid / tpr) * tpr; + + int nrows = ne01*ne02*ne03; + if (row >= nrows) { + return; + } + + int i01 = row % ne01; + int t = row / ne01; + int i02 = t % ne02; + int i03 = t / ne02; + + // linear index of the first element of this row, unflattened over dst dims + long n = (long)row * ne00; + int i3 = (int)(n / ((long)ne2*ne1*ne0)); + long rm = n - (long)i3*ne2*ne1*ne0; + int i2 = (int)(rm / ((long)ne1*ne0)); + rm -= (long)i2*ne1*ne0; + int i1 = (int)(rm / ne0); + int i0 = (int)(rm - (long)i1*ne0); + + global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int i00 = lane; i00 < ne00; i00 += tpr) { + global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + kernel void kernel_cpy_i32_i32( global int * src0, ulong offset0, diff --git a/ggml/src/ggml-opencl/kernels/get_rows.cl b/ggml/src/ggml-opencl/kernels/get_rows.cl index c2962edc983..9ae4fff09fc 100644 --- a/ggml/src/ggml-opencl/kernels/get_rows.cl +++ b/ggml/src/ggml-opencl/kernels/get_rows.cl @@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32( src1 = (global int*)((global char*)src1 + offset1); dst = (global float*)((global char*)dst + offsetd); - int i10 = get_group_id(0); - int i11 = get_group_id(1); - int i12 = get_group_id(2); + int nchunks = get_num_groups(0) / ne10; + int g = get_group_id(0); + int i10 = g / nchunks; + int chunk = g - i10 * nchunks; + int i11 = get_group_id(1); + int i12 = get_group_id(2); int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0]; int i02 = i11; int i03 = i12; - for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) { - if (ind >= 
ne00) { - return; - } - ((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] = - ((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind]; + global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1); + global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03); + + int span = (ne00 + nchunks - 1) / nchunks; + int start = chunk * span; + int end = min(start + span, ne00); + + for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) { + dst_row[ind] = src_row[ind]; } } diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl index 86fe09c6dd6..57b90c05ae5 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl @@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat( global uchar * blk_qh, global char * blk_scales, global half * blk_d, - global float * yy, int ib, int ip, int is, - int l0 + int l0, + float4 y0, + float4 y1, + float4 y2, + float4 y3 ) { - int y_offset = 128*ip + l0; int q_offset_l = 64*ip + l0; int q_offset_h = 32*ip + l0; @@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat( global uchar * qh = blk_qh + ib*64 + q_offset_h; global char * sc = blk_scales + ib*16 + is; - global float * y = yy + ib * QK_K + y_offset; - float dall = blk_d[ib]; - float sumf = 0; - float4 sums = {0.f, 0.f, 0.f, 0.f}; - - sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f); - - sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & 
Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f); - - sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f); - - sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f); - - sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]); - - return sumf; + // Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads. + // q_offset_l/h are 4-aligned, so these are aligned vector loads. + uchar4 q1v = vload4(0, q1); + uchar4 q2v = vload4(0, q2); + uchar4 qhv = vload4(0, qh); + + int4 q1i = convert_int4(q1v); + int4 q2i = convert_int4(q2v); + int4 qhi = convert_int4(qhv); + + // Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the + // matching 2-bit plane of qh), same arithmetic as the scalar version, then dot() + // against the cached activation lanes. 
+ float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f; + float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f; + float4 w2 = convert_float4((q1i >> 4) | ((qhi & Q6_K_MASK3) )) - 32.f; + float4 w3 = convert_float4((q2i >> 4) | ((qhi & Q6_K_MASK4) >> 2)) - 32.f; + + return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] + + dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]); } #undef N_DST @@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat( #define N_SIMDGROUP 2 #define N_SIMDWIDTH 16 #elif defined (ADRENO_GPU) -#define N_DST 4 +#define N_DST 16 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 #endif @@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat( global half * blk_d = (global half *) src0_d + offset_src0_d; global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1; - int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0 - int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1 + int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15 + int ix = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1 int ip = tid/8; // first or second half of (super) block (0 or 1) int il = tid%8; // each half has 8 parts, one per scale int n = 4; // 4 scales at a time (and 4 sums) int l0 = n*il; // offset into half-block, 0..28 int is = 8*ip + l0/16; // 0, 1, 8, 9 - float4 sumf = 0; + float sumf[N_DST]; + for (int row = 0; row < N_DST; row++) { + sumf[row] = 0.f; + } for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) { - if (first_row + 0 < ne01) { - sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0); - } - if (first_row + 1 < ne01) { - sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0); - } - if (first_row + 2 < ne01) { - sumf.s2 += 
block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0); - } - if (first_row + 3 < ne01) { - sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0); + global float * y = yy + ib * QK_K + 128*ip + l0; + float4 y0 = vload4(0, y + 0); + float4 y1 = vload4(0, y + 32); + float4 y2 = vload4(0, y + 64); + float4 y3 = vload4(0, y + 96); + + for (int row = 0; row < N_DST; row++) { + if (first_row + row < ne01) { + sumf[row] += block_q_6_K_dot_y_flat( + blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb, + ib, ip, is, l0, y0, y1, y2, y3); + } } } - float4 tot = (float4)( - sub_group_reduce_add(sumf.s0), - sub_group_reduce_add(sumf.s1), - sub_group_reduce_add(sumf.s2), - sub_group_reduce_add(sumf.s3) - ); - if (get_sub_group_local_id() == 0) { - if (first_row + 0 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; - } - if (first_row + 1 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; - } - if (first_row + 2 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; - } - if (first_row + 3 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + for (int row = 0; row < N_DST; row++) { + float tot = sub_group_reduce_add(sumf[row]); + if (get_sub_group_local_id() == 0 && first_row + row < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 96138f57ebe..3f246e8672d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3971,7 +3971,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf. 
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases. - dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1; + // ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder; + // all reorderable types have a _switch_ncols kernel. + dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1; } static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */, diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index abd1e49a70e..cf2b59576aa 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -56,6 +56,65 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r } } +template +static void mul_mat_vec_q_reorder_ncols(const void * __restrict__ vx, const void * __restrict__ vy, + float * __restrict__ dst, const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + const sycl::nd_item<3> & nd_item) { + using block_type = ggml_sycl_reordered::block_q_t; + using block_traits = typename block_type::traits; + + const auto sg = nd_item.get_sub_group(); + const int sg_range = sg.get_group_linear_range(); + const int workgroup_id = nd_item.get_group_linear_id(); + const int sg_id = sg.get_group_linear_id(); + const int row = workgroup_id * sg_range + sg_id; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / block_traits::qk; + constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi); + constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq; + const int nblocks = nrows * (ncols / block_traits::qk); + + static_assert(blocks_per_subgroup > 0); + static_assert(block_elements_per_subgroup > 0); + + float partial_sum[ncols_dst] = {0.0f}; + for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += 
blocks_per_subgroup) { + const int ibx = row * blocks_per_row + i; + + const auto bx_offset = block_type::get_block_offset(ibx, nblocks); + const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx); + const int iby = i * block_type::block_to_q8_1_ratio(); + +#pragma unroll + for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) { + const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup); + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const char * vy_j = (const char *)vy + j * stride_col_y_bytes; + const int8_t * q8_1_quant_ptr = (const int8_t *)vy_j + iby * QK8_1; + const sycl::half2* q8_1_ds_ptr = (const sycl::half2 *)(vy_j + ncols + iby * sizeof(sycl::half2)); + + partial_sum[j] += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs); + } + } + } + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + float sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum[j], std::plus<>()); + + if (sg.leader()) { + dst[j * stride_col_dst + row] = sum; + } + } +} + template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) { @@ -100,6 +159,70 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ } } +template +static void mul_mat_vec_q_ncols( + const void * __restrict__ vx, + const void * __restrict__ vy, + float * __restrict__ dst, + const int ncols, + const int nrows, + const int stride_col_y, + const int stride_col_dst, + const sycl::nd_item<3> & item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; + + // partial sums: one per output column + float tmp[ncols_dst] = {0.0f}; 
+ + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); + i < blocks_per_row; + i += blocks_per_warp) { + + const int ibx = row * blocks_per_row + i; + const int iby = i * (qk / QK8_1); + + // read weight block once, dot against all columns + for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) { + const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr)); + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + tmp[j] += vec_dot_q_sycl(&x[ibx], &y[j * stride_col_y + iby], iqs); + } + } + } + + // reduce within subgroup +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp[j] += dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), tmp[j], mask); + } + } + + if (item_ct1.get_local_id(2) == 0) { +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + dst[j * stride_col_dst + row] = tmp[j]; + } + } +} + template static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, const void *__restrict__ vy, @@ -553,6 +676,45 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) 
[[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q4_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_0 reorder multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK4_0 == 0); @@ -571,6 +733,45 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * } } +template +static void mul_mat_vec_q4_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int 
nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q4_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q4_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q4_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q4_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q4_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q4_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q4_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q4_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, 
const int nrows, @@ -595,6 +796,45 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q4_1_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q4_1_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q4_1_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q4_1_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q4_1_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q4_1_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q4_1_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q4_1_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q4_1_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q4_1_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, 
nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_1 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_MXFP4 == 0); @@ -613,6 +853,45 @@ static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float } } +template +static void mul_mat_vec_mxfp4_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_MXFP4 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_mxfp4_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_mxfp4_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_mxfp4_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_mxfp4_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_mxfp4_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_mxfp4_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, 
stride_col_dst, stream); break; + case 6: mul_mat_vec_mxfp4_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_mxfp4_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_mxfp4_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for MXFP4 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_nvfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_NVFP4 == 0); @@ -631,6 +910,45 @@ static void mul_mat_vec_nvfp4_q8_1_sycl(const void * vx, const void * vy, float } } +template +static void mul_mat_vec_nvfp4_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_NVFP4 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_nvfp4_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_nvfp4_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_nvfp4_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); 
break; + case 3: mul_mat_vec_nvfp4_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_nvfp4_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_nvfp4_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_nvfp4_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_nvfp4_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_nvfp4_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for NVFP4 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -655,6 +973,45 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q5_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q5_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int 
stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q5_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q5_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q5_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q5_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q5_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q5_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q5_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q5_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -679,6 +1036,45 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q5_1_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, 
stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q5_1_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q5_1_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q5_1_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q5_1_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q5_1_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q5_1_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q5_1_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q5_1_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q5_1_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_1 multi-col MMVQ", ncols_dst); + } +} + static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK8_0 == 0); @@ -698,6 +1094,45 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + 
GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q8_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q8_0 reorder multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void 
*vy, float *dst, const int ncols, const int nrows, @@ -722,6 +1157,45 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q8_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q8_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q8_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q8_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q8_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q8_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q8_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q8_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: 
mul_mat_vec_q8_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q8_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -746,6 +1220,45 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q2_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q2_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q2_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q2_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q2_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q2_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q2_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + 
case 6: mul_mat_vec_q2_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q2_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q2_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q2_K multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -790,6 +1303,85 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q3_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q3_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: 
reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q3_K reorder multi-col MMVQ", ncols_dst); + } +} + +template +static void mul_mat_vec_q3_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q3_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + 
dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q3_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q3_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q3_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q3_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q3_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q3_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q3_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q3_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q3_K multi-col MMVQ", ncols_dst); + } +} + + static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -814,6 +1406,51 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q4_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, 
stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q4_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q4_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q4_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q4_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q4_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q4_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q4_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q4_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q4_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_K multi-col MMVQ", ncols_dst); + } +} + static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); @@ -834,6 +1471,44 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + 
GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q4_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q4_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_K reorder multi-col MMVQ", ncols_dst); + } +} static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void 
*vy, float *dst, const int ncols, @@ -859,6 +1534,51 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q5_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q5_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q5_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q5_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q5_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q5_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q5_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q5_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q5_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: 
mul_mat_vec_q5_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_K multi-col MMVQ", ncols_dst); + } +} + static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); @@ -879,6 +1599,45 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q5_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q5_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, 
stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_K reorder multi-col MMVQ", ncols_dst); + } +} + static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); @@ -897,6 +1656,46 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, }); }); } + +template +static void reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, 
nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q6_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q6_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q6_K reorder multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -921,6 +1720,51 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q6_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> 
block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q6_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q6_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q6_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q6_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q6_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q6_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q6_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q6_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q6_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q6_K multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, @@ -1117,6 +1961,51 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_iq4_xs_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + 
const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_iq4_xs_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_iq4_xs_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for IQ4_XS multi-col MMVQ", ncols_dst); + } +} + void 
ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, @@ -1143,42 +2032,135 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q4_0: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n"); - reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n"); + reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n"); mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, 
dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_1_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q4_1_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q5_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_1_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q5_1_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q8_0: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && 
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n"); - reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n"); + reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl\n"); mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q2_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q2_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, 
stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q3_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, - stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q3_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q3_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl\n"); mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1186,9 +2168,27 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q4_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - 
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q4_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q4_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n"); mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1196,9 +2196,27 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q5_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst 
= dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q5_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q5_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n"); mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1206,9 +2224,27 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q6_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q6_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling 
reorder_mul_mat_vec_q6_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q6_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n"); mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1238,13 +2274,43 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_iq4_xs_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_iq4_xs_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_MXFP4: - mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_mxfp4_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + 
mul_mat_vec_mxfp4_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_NVFP4: - mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_nvfp4_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_nvfp4_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; default: GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type)); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e7d04634b8a..fc9bc8fe376 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5084,6 +5084,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } ++idx; } + } else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) { + // Disabled on Intel Windows due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147 + int idx = 0; + for (uint32_t n : {64, 128, 256, 512}) { + const uint32_t block_size = std::min(device->subgroup_size, n); + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1); + ++idx; + } } const uint32_t cumsum_elem_per_thread = (device->vendor_id == 
VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4; @@ -5630,6 +5638,11 @@ static vk_device ggml_vk_get_device(size_t idx) { #endif device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); +#ifdef __APPLE__ + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_shuffle = false; + } +#endif device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered); @@ -6336,6 +6349,15 @@ static void ggml_vk_print_gpu_info(size_t idx) { } #endif +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + } +#endif + VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; if (coopmat2_decode_vector_support) { @@ -6367,6 +6389,19 @@ static void ggml_vk_print_gpu_info(size_t idx) { #endif && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture); +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + coopmat2_support = coopmat2_support && + coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + 
coopmat2_features.cooperativeMatrixBlockLoads; +#else + coopmat2_support = false; +#endif + coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; #if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) coopmat2_decode_vector_support = false; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp index 72059d4afc2..a2069964adb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp @@ -1,14 +1,16 @@ #version 450 #extension GL_EXT_control_flow_attributes : require +#ifndef FWHT_SHMEM #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_shuffle : enable +#endif -layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; - -layout(constant_id = 0) const uint WARP_SIZE = 32; +layout(constant_id = 0) const uint BLOCK_SIZE = 32; layout(constant_id = 1) const uint N = 128; +layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; + layout(push_constant) uniform parameter { uint n_rows; @@ -20,35 +22,72 @@ layout(push_constant) uniform parameter layout(binding = 0, std430) readonly buffer A { float data_a[]; }; layout(binding = 1, std430) writeonly buffer D { float data_d[]; }; -const uint EL_W = N / WARP_SIZE; +const uint EL_W = N / BLOCK_SIZE; + +#ifdef FWHT_SHMEM +shared float shmem[4 * N]; +#endif void main() { - const uint lane = gl_SubgroupInvocationID; - for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID; - row < n_rows; - row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { +#ifdef FWHT_SHMEM + const uint tid = gl_LocalInvocationID.x; + const uint shmem_base = gl_LocalInvocationID.y * N; + const uint row_id = gl_LocalInvocationID.y; +#else + const uint tid = gl_SubgroupInvocationID; + const uint row_id = gl_SubgroupID; +#endif + + for (uint base_row = gl_WorkGroupID.x * gl_WorkGroupSize.y; + base_row < n_rows; + 
base_row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { + const uint row = base_row + row_id; const uint row_offset = row * N; +#ifndef FWHT_SHMEM + if (row >= n_rows) { + continue; + } +#endif + float reg[EL_W]; [[unroll]] for (uint i = 0; i < EL_W; ++i) { - reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale; + reg[i] = row < n_rows ? data_a[src_offset + row_offset + i * BLOCK_SIZE + tid] * scale : 0.0; } +#ifdef FWHT_SHMEM + [[unroll]] + for (uint h = 1; h < BLOCK_SIZE; h <<= 1) { + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + shmem[shmem_base + i * BLOCK_SIZE + tid] = reg[i]; + } + barrier(); + [[unroll]] + for (uint j = 0; j < EL_W; ++j) { + const float val = reg[j]; + const float other = shmem[shmem_base + j * BLOCK_SIZE + (tid ^ h)]; + reg[j] = (tid & h) == 0 ? val + other : other - val; + } + barrier(); + } +#else [[unroll]] - for (uint h = 1; h < WARP_SIZE; h <<= 1) { + for (uint h = 1; h < BLOCK_SIZE; h <<= 1) { [[unroll]] for (uint j = 0; j < EL_W; ++j) { const float val = reg[j]; const float val2 = subgroupShuffleXor(val, h); - reg[j] = (lane & h) == 0 ? val + val2 : val2 - val; + reg[j] = (tid & h) == 0 ? 
val + val2 : val2 - val; } } +#endif [[unroll]] - for (uint h = WARP_SIZE; h < N; h <<= 1) { - const uint step = h / WARP_SIZE; + for (uint h = BLOCK_SIZE; h < N; h <<= 1) { + const uint step = h / BLOCK_SIZE; [[unroll]] for (uint j = 0; j < EL_W; j += 2 * step) { [[unroll]] @@ -61,9 +100,16 @@ void main() { } } - [[unroll]] - for (uint i = 0; i < EL_W; ++i) { - data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i]; +#ifdef FWHT_SHMEM + if (row < n_rows) { +#endif + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + data_d[dst_offset + row_offset + i * BLOCK_SIZE + tid] = reg[i]; + } +#ifdef FWHT_SHMEM } + barrier(); +#endif } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index de7dbec2c63..d65cd12b287 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -957,6 +957,7 @@ void process_shaders() { string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}})); string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("fwht_f32", "fwht.comp", {}); + string_to_spv("fwht_shmem_f32", "fwht.comp", {{"FWHT_SHMEM", "1"}}); string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}})); string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ce556ec9b65..bd6246137b0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -128,6 +128,7 @@ class LLM: MOE_LATENT_SIZE = "{arch}.moe_latent_size" NEXTN_PREDICT_LAYERS = 
"{arch}.nextn_predict_layers" NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers" + DEEPSTACK_MAPPING = "{arch}.deepstack_mapping" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" @@ -325,6 +326,8 @@ class ClipVision: WA_PATTERN_MODE = "clip.vision.wa_pattern_mode" # used by mimovl, per-layer -1/0/1 IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" WINDOW_SIZE = "clip.vision.window_size" + FEATURE_LAYERS = "clip.vision.feature_layer" # Granite4 Vision + IMAGE_GRID_PINPOINTS = "clip.vision.image_grid_pinpoints" # Granite4 Vision class Attention: HEAD_COUNT = "clip.vision.attention.head_count" @@ -333,6 +336,9 @@ class Attention: class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" + QUERY_SIDE = "clip.vision.projector.query_side" + WINDOW_SIDE = "clip.vision.projector.window_side" + SPATIAL_OFFSETS = "clip.vision.projector.spatial_offsets" class SAM: BLOCK_COUNT = "clip.vision.sam.block_count" @@ -434,6 +440,7 @@ class MODEL_ARCH(IntEnum): GEMMA3 = auto() GEMMA3N = auto() GEMMA4 = auto() + GEMMA4_ASSISTANT = auto() GEMMA_EMBEDDING = auto() STARCODER2 = auto() RWKV6 = auto() @@ -821,6 +828,31 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2 V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2 + # qformer projector (vision) - Granite4 Vision + V_QF_PROJ_QUERY = auto() + V_QF_PROJ_NORM = auto() + V_QF_PROJ_LINEAR = auto() + V_QF_SELF_ATTN_Q = auto() + V_QF_SELF_ATTN_K = auto() + V_QF_SELF_ATTN_V = auto() + V_QF_SELF_ATTN_O = auto() + V_QF_SELF_ATTN_NORM = auto() + V_QF_CROSS_ATTN_Q = auto() + V_QF_CROSS_ATTN_K = auto() + V_QF_CROSS_ATTN_V = auto() + V_QF_CROSS_ATTN_O = auto() + V_QF_CROSS_ATTN_NORM = auto() + V_QF_FFN_UP = auto() + V_QF_FFN_DOWN = auto() + V_QF_FFN_NORM = auto() + V_PROJ_NORM = auto() + # multi-projector (bid => projector id) - Granite4 vision + V_MULTI_PROJ_IMG_POS = auto() + V_MULTI_PROJ_QUERY = auto() + V_MULTI_PROJ_NORM = auto() 
+ V_MULTI_PROJ_LINEAR = auto() + V_MULTI_PROJ_POST_NORM = auto() + # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_EMBD_NORM = auto() @@ -866,6 +898,8 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp + NEXTN_PROJ_PRE = auto() + NEXTN_PROJ_POST = auto() NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() NEXTN_ENORM = auto() @@ -885,7 +919,7 @@ class MODEL_TENSOR(IntEnum): A_CTC_OUT = auto() A_CTC_OUT_MID = auto() A_ENC_ATTN_REL_POS_EMB = auto() - # qformer projector + # audio qformer projector A_QF_PROJ_QUERY = auto() A_QF_PROJ_NORM = auto() A_QF_PROJ_LINEAR = auto() @@ -955,6 +989,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.GEMMA4: "gemma4", + MODEL_ARCH.GEMMA4_ASSISTANT: "gemma4-assistant", MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", @@ -1337,10 +1372,33 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", - MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR + MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR, Granite4Vision MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2 MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2 + # Granite4 Vision + # qformer layers (bid => proj_id) + # NOTE: Names align with A_QF_* + MODEL_TENSOR.V_QF_SELF_ATTN_Q: "v.proj_blk.{bid}.self_attn_q", + MODEL_TENSOR.V_QF_SELF_ATTN_K: "v.proj_blk.{bid}.self_attn_k", + MODEL_TENSOR.V_QF_SELF_ATTN_V: "v.proj_blk.{bid}.self_attn_v", + MODEL_TENSOR.V_QF_SELF_ATTN_O: "v.proj_blk.{bid}.self_attn_out", + MODEL_TENSOR.V_QF_SELF_ATTN_NORM: "v.proj_blk.{bid}.self_attn_norm", + MODEL_TENSOR.V_QF_CROSS_ATTN_Q: "v.proj_blk.{bid}.cross_attn_q", + 
MODEL_TENSOR.V_QF_CROSS_ATTN_K: "v.proj_blk.{bid}.cross_attn_k", + MODEL_TENSOR.V_QF_CROSS_ATTN_V: "v.proj_blk.{bid}.cross_attn_v", + MODEL_TENSOR.V_QF_CROSS_ATTN_O: "v.proj_blk.{bid}.cross_attn_out", + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: "v.proj_blk.{bid}.cross_attn_norm", + MODEL_TENSOR.V_QF_FFN_UP: "v.proj_blk.{bid}.ffn_up", + MODEL_TENSOR.V_QF_FFN_DOWN: "v.proj_blk.{bid}.ffn_down", + MODEL_TENSOR.V_QF_FFN_NORM: "v.proj_blk.{bid}.ffn_norm", + # multi-projector (bid => projector ID) + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: "v.proj_blk.{bid}.img_pos", + MODEL_TENSOR.V_MULTI_PROJ_QUERY: "v.proj_blk.{bid}.query", + MODEL_TENSOR.V_MULTI_PROJ_NORM: "v.proj_blk.{bid}.norm", + MODEL_TENSOR.V_MULTI_PROJ_LINEAR: "v.proj_blk.{bid}.linear", + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: "v.proj_blk.{bid}.post_norm", + # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1417,6 +1475,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down", MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm", # NextN/MTP + MODEL_TENSOR.NEXTN_PROJ_PRE: "nextn.pre_projection", + MODEL_TENSOR.NEXTN_PROJ_POST: "nextn.post_projection", MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", @@ -1522,6 +1582,29 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_SAM_NET_3, MODEL_TENSOR.V_RESMPL_QUERY_768, MODEL_TENSOR.V_RESMPL_QUERY_1024, + MODEL_TENSOR.V_PROJ_NORM, + MODEL_TENSOR.V_QF_PROJ_QUERY, + MODEL_TENSOR.V_QF_PROJ_NORM, + MODEL_TENSOR.V_QF_PROJ_LINEAR, + MODEL_TENSOR.V_QF_SELF_ATTN_Q, + MODEL_TENSOR.V_QF_SELF_ATTN_K, + MODEL_TENSOR.V_QF_SELF_ATTN_V, + MODEL_TENSOR.V_QF_SELF_ATTN_O, + MODEL_TENSOR.V_QF_SELF_ATTN_NORM, + MODEL_TENSOR.V_QF_CROSS_ATTN_Q, + MODEL_TENSOR.V_QF_CROSS_ATTN_K, + MODEL_TENSOR.V_QF_CROSS_ATTN_V, + MODEL_TENSOR.V_QF_CROSS_ATTN_O, + 
MODEL_TENSOR.V_QF_CROSS_ATTN_NORM, + MODEL_TENSOR.V_QF_FFN_UP, + MODEL_TENSOR.V_QF_FFN_DOWN, + MODEL_TENSOR.V_QF_FFN_NORM, + MODEL_TENSOR.V_QF_PROJ_NORM, + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS, + MODEL_TENSOR.V_MULTI_PROJ_QUERY, + MODEL_TENSOR.V_MULTI_PROJ_LINEAR, + MODEL_TENSOR.V_MULTI_PROJ_NORM, + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -2500,6 +2583,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.PER_LAYER_PROJ_NORM, MODEL_TENSOR.PER_LAYER_POST_NORM, ], + MODEL_ARCH.GEMMA4_ASSISTANT: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.NEXTN_PROJ_PRE, + MODEL_TENSOR.NEXTN_PROJ_POST, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.LAYER_OUT_SCALE, + ], MODEL_ARCH.GEMMA_EMBEDDING: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT, @@ -4388,6 +4489,7 @@ class VisionProjectorType: MINICPMV4_6 = "minicpmv4_6" GRANITE_SPEECH = "granite_speech" # audio MIMOVL = "mimovl" + GRANITE4_VISION = "granite4_vision" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 875d0f73d96..182c9c54a53 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -959,8 +959,13 @@ def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) def add_num_deepstack_layers(self, count: int) -> None: + """Add scalar deepstack layer count (qwen3vl format)""" self.add_uint32(Keys.LLM.NUM_DEEPSTACK_LAYERS.format(arch=self.arch), count) + def add_deepstack_mapping(self, layers: Sequence[int]) -> None: + """Add per-layer deepstack projector indices (Granite4 Vision format)""" + 
self.add_array(Keys.LLM.DEEPSTACK_MAPPING.format(arch=self.arch), list(layers)) + def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) @@ -1184,6 +1189,15 @@ def add_vision_preproc_min_tiles(self, value: int) -> None: def add_vision_preproc_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value) + def add_vision_projector_query_side(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.QUERY_SIDE, value) + + def add_vision_projector_window_side(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.WINDOW_SIDE, value) + + def add_vision_spatial_offsets(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.Projector.SPATIAL_OFFSETS, layers) + def add_vision_image_mean(self, values: Sequence[float]) -> None: self.add_array(Keys.ClipVision.IMAGE_MEAN, values) @@ -1240,6 +1254,12 @@ def add_vision_wa_pattern_mode(self, modes: Sequence[int]) -> None: def add_vision_window_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value) + def add_vision_feature_layers(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.FEATURE_LAYERS, layers) + + def add_vision_image_grid_pinpoints(self, layers: Sequence[Sequence[int]]) -> None: + self.add_array(Keys.ClipVision.IMAGE_GRID_PINPOINTS, layers) + def add_vision_sam_layers_count(self, value: int) -> None: self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 82f26e7b303..a9537983de1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1408,6 +1408,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "model.vision_tower.vision_model.embeddings.patch_embedding", # Granite4Vision "vision_tower.vision_model.embeddings.patch_embedding", "model.vision_tower.embeddings.patch_embedding", # minicpmv4_6 
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 @@ -1439,6 +1440,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_EMBD_POS: ( + "model.vision_tower.vision_model.embeddings.position_embedding", # Granite4Vision "vision_tower.vision_model.embeddings.position_embedding", "model.vision_tower.embeddings.position_embedding", # minicpmv4_6 "model.vision_tower.embeddings.position_embeddings", # Intern-S1 @@ -1456,8 +1458,9 @@ class TensorNameMap: "model.vision_embedder.pos_embedding", # gemma4 unified ), + # TODO: I think these should all be moved to mapping_cfg? MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( - "model.image_newline", # Deepseek-OCR + "model.image_newline", # Deepseek-OCR, Granite4Vision "vit.perceive.image_newline", # HunyuanVL ), @@ -1477,6 +1480,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_Q: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.q_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 @@ -1502,6 +1506,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_K: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.k_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 @@ -1527,6 +1532,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_V: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.v_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 @@ -1545,6 +1551,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( + 
"model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", "model.vision_tower.encoder.layers.{bid}.layer_norm1", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL @@ -1567,6 +1574,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_O: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.out_proj", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL @@ -1595,6 +1603,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", "model.vision_tower.encoder.layers.{bid}.layer_norm2", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL @@ -1618,6 +1627,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_FFN_UP: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "model.vision_tower.encoder.layers.{bid}.mlp.fc1", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 @@ -1649,6 +1659,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "model.vision_tower.encoder.layers.{bid}.mlp.fc2", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 @@ -1706,6 +1717,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_POST_NORM: ( + "model.vision_tower.vision_model.post_layernorm", # Granite4Vision "vision_tower.vision_model.post_layernorm", "model.vision_tower.post_layernorm", # minicpmv4_6 "model.vision_model.post_layernorm", 
# SmolVLM @@ -1952,6 +1964,82 @@ class TensorNameMap: "model.vision_tower.std_scale", # gemma4 ), + # For these tensors, bid => projector ID + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: ( + "model.layerwise_projectors.{bid}.image_positions", # Granite4 Vision + "model.spatial_projectors.{bid}.image_positions", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_QUERY: ( + "model.layerwise_projectors.{bid}.query", # Granite4 Vision + "model.spatial_projectors.{bid}.query", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_LINEAR: ( + "model.layerwise_projectors.{bid}.out_linear", # Granite4 Vision + "model.spatial_projectors.{bid}.out_linear", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_NORM: ( + "model.layerwise_projectors.{bid}.norm", # Granite4 Vision + "model.spatial_projectors.{bid}.norm", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: ( + "model.layerwise_projectors.{bid}.qformer.layernorm", # Granite4 Vision + "model.spatial_projectors.{bid}.qformer.layernorm", # Granite4 Vision + ), + + # For these tensors, bid => proj-id + MODEL_TENSOR.V_QF_SELF_ATTN_Q: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_K: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_V: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_O: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision + 
"model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_Q: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_K: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_V: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_O: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_UP: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_DOWN: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 
Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( @@ -2279,6 +2367,14 @@ class TensorNameMap: ), # NextN/MTP tensors + MODEL_TENSOR.NEXTN_PROJ_PRE: ( + "pre_projection", + ), + + MODEL_TENSOR.NEXTN_PROJ_POST: ( + "post_projection", + ), + MODEL_TENSOR.NEXTN_EH_PROJ: ( "model.layers.{bid}.eh_proj", ), diff --git a/include/llama.h b/include/llama.h index a7e5679c0ce..6da9e995373 100644 --- a/include/llama.h +++ b/include/llama.h @@ -394,6 +394,10 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + + // a source/target/parent context + // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts + struct llama_context * ctx_other; }; struct llama_model_tensor_override { diff --git a/models/templates/LFM2.5-8B-A1B.jinja b/models/templates/LFM2.5-8B-A1B.jinja new file mode 100644 index 00000000000..8bca4a545e9 --- /dev/null +++ b/models/templates/LFM2.5-8B-A1B.jinja @@ -0,0 +1,115 @@ +{{- bos_token -}} +{%- set preserve_thinking = preserve_thinking | default(false) -%} + +{%- macro format_arg_value(arg_value) -%} + {%- if arg_value is string -%} + {{- "'" + arg_value + "'" -}} + {%- elif arg_value is mapping -%} + {{- arg_value | tojson -}} + {%- else -%} + {{- arg_value | string -}} + {%- endif -%} +{%- endmacro -%} + +{%- macro parse_content(content) -%} + {%- if content is string -%} + {{- content -}} + {%- else -%} + {%- set _ns = namespace(result="") -%} + {%- for item in content -%} + {%- if item["type"] == "image" -%} + {%- set _ns.result = _ns.result + "" -%} + {%- elif 
item["type"] == "text" -%} + {%- set _ns.result = _ns.result + item["text"] -%} + {%- else -%} + {%- set _ns.result = _ns.result + item | tojson -%} + {%- endif -%} + {%- endfor -%} + {{- _ns.result -}} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_calls(tool_calls) -%} + {%- set tool_calls_ns = namespace(tool_calls=[]) -%} + {%- for tool_call in tool_calls -%} + {%- set func_name = tool_call["function"]["name"] -%} + {%- set func_args = tool_call["function"]["arguments"] -%} + {%- set args_ns = namespace(arg_strings=[]) -%} + {%- for arg_name, arg_value in func_args.items() -%} + {%- set args_ns.arg_strings = args_ns.arg_strings + [arg_name + "=" + format_arg_value(arg_value)] -%} + {%- endfor -%} + {%- set tool_calls_ns.tool_calls = tool_calls_ns.tool_calls + [func_name + "(" + (args_ns.arg_strings | join(", ")) + ")"] -%} + {%- endfor -%} + {{- "<|tool_call_start|>[" + (tool_calls_ns.tool_calls | join(", ")) + "]<|tool_call_end|>" -}} +{%- endmacro -%} + +{%- set ns = namespace(system_prompt="", last_user_index=-1) -%} +{%- if messages[0]["role"] == "system" -%} + {%- if messages[0].get("content") -%} + {%- set ns.system_prompt = parse_content(messages[0]["content"]) -%} + {%- endif -%} + {%- set messages = messages[1:] -%} +{%- endif -%} +{%- if tools -%} + {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%} + {%- for tool in tools -%} + {%- if tool is not string -%} + {%- set tool = tool | tojson -%} + {%- endif -%} + {%- set ns.system_prompt = ns.system_prompt + tool -%} + {%- if not loop.last -%} + {%- set ns.system_prompt = ns.system_prompt + ", " -%} + {%- endif -%} + {%- endfor -%} + {%- set ns.system_prompt = ns.system_prompt + "]" -%} +{%- endif -%} +{%- if ns.system_prompt -%} + {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}} +{%- endif -%} +{%- for message in messages -%} + {%- if message["role"] == "user" -%} + {%- set ns.last_user_index = loop.index0 -%} + {%- 
endif -%} +{%- endfor -%} +{%- for message in messages -%} + {{- "<|im_start|>" + message.role + "\n" -}} + {%- if message.role == "assistant" -%} + {%- generation -%} + {%- if message.thinking is defined and (preserve_thinking or loop.index0 > ns.last_user_index) -%} + {{- "" + message.thinking + "" -}} + {%- endif -%} + {%- set _cfm_tag = "CONTINUE_FINAL_MESSAGE_TAG " -%} + {%- set _has_cfm = false -%} + {%- if message.content is defined -%} + {%- set content = parse_content(message.content) -%} + {%- if not (preserve_thinking or loop.index0 > ns.last_user_index) -%} + {%- if "" in content -%} + {%- set content = content.split("")[-1] | trim -%} + {%- endif -%} + {%- endif -%} + {%- if message.tool_calls is defined and content.endswith(_cfm_tag) -%} + {%- set _has_cfm = true -%} + {%- set _trunc_len = (content | length) - (_cfm_tag | length) -%} + {{- content[:_trunc_len] -}} + {%- else -%} + {{- content -}} + {%- endif -%} + {%- endif -%} + {%- if message.tool_calls is defined -%} + {{- render_tool_calls(message.tool_calls) -}} + {%- endif -%} + {%- if _has_cfm -%} + {{- _cfm_tag -}} + {%- endif -%} + {{- "<|im_end|>\n" -}} + {%- endgeneration -%} + {%- else %} + {%- if message.get("content") -%} + {{- parse_content(message["content"]) -}} + {%- endif -%} + {{- "<|im_end|>\n" -}} + {%- endif %} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- "<|im_start|>assistant\n" -}} +{%- endif -%} \ No newline at end of file diff --git a/scripts/ui-assets.cmake b/scripts/ui-assets.cmake index 4637c81278f..e69d776166c 100644 --- a/scripts/ui-assets.cmake +++ b/scripts/ui-assets.cmake @@ -151,8 +151,22 @@ function(npm_build out_var) return() endif() - if(NOT EXISTS "${UI_SOURCE_DIR}/node_modules") - message(STATUS "UI: running npm install (first time)") + # npm writes node_modules/.package-lock.json on every successful install, + # so a package-lock.json newer than this marker means node_modules is stale + set(NPM_MARKER 
"${UI_SOURCE_DIR}/node_modules/.package-lock.json") + set(need_install FALSE) + if(NOT EXISTS "${NPM_MARKER}") + set(need_install TRUE) + else() + file(TIMESTAMP "${UI_SOURCE_DIR}/package-lock.json" lock_ts) + file(TIMESTAMP "${NPM_MARKER}" marker_ts) + if(lock_ts STRGREATER marker_ts) + set(need_install TRUE) + endif() + endif() + + if(need_install) + message(STATUS "UI: running npm install") execute_process( COMMAND ${NPM_EXECUTABLE} install WORKING_DIRECTORY "${UI_SOURCE_DIR}" diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 4a1aaa955a8..3e0fe66afff 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), + /*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) { }; // make tensors - tensors.reserve(hparams.n_layer); + tensors.reserve(hparams.n_layer()); tensors.push_back(nullptr); // there's never a tensor for layer 0 - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { ggml_backend_buffer_type_t buft = model.select_buft(il); ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { @@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply( layer_start = il_start; layer_end = il_end; - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { assert(tensors[il] != nullptr); const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 576b986c6ef..2fc556053a6 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -57,6 +57,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA3, "gemma3" }, { 
LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_GEMMA4, "gemma4" }, + { LLM_ARCH_GEMMA4_ASSISTANT, "gemma4-assistant" }, { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, @@ -197,6 +198,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" }, { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" }, { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" }, + { LLM_KV_DEEPSTACK_MAPPING, "%s.deepstack_mapping" }, { LLM_KV_HIDDEN_ACT, "%s.hidden_activation" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, @@ -457,6 +459,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, + { LLM_TENSOR_NEXTN_PROJ_PRE, "nextn.pre_projection" }, + { LLM_TENSOR_NEXTN_PROJ_POST, "nextn.post_projection" }, { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, { LLM_TENSOR_DFLASH_FC, "dflash_fc" }, { LLM_TENSOR_DFLASH_HIDDEN_NORM, "dflash_hidden_norm" }, @@ -773,6 +777,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_PRE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_POST, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. 
diff --git a/src/llama-arch.h b/src/llama-arch.h index 663446b7e2b..0da781aea5d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -61,6 +61,7 @@ enum llm_arch { LLM_ARCH_GEMMA3, LLM_ARCH_GEMMA3N, LLM_ARCH_GEMMA4, + LLM_ARCH_GEMMA4_ASSISTANT, LLM_ARCH_GEMMA_EMBEDDING, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, @@ -201,6 +202,7 @@ enum llm_kv { LLM_KV_MOE_LATENT_SIZE, LLM_KV_NEXTN_PREDICT_LAYERS, LLM_KV_NUM_DEEPSTACK_LAYERS, + LLM_KV_DEEPSTACK_MAPPING, LLM_KV_HIDDEN_ACT, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, @@ -563,6 +565,8 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, + LLM_TENSOR_NEXTN_PROJ_PRE, + LLM_TENSOR_NEXTN_PROJ_POST, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 720b99f081c..8b1aba82d9d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -69,9 +69,10 @@ llama_context::llama_context( cparams.embeddings_nextn_masked = false; cparams.offload_kqv = params.offload_kqv; cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; cparams.warmup = false; + cparams.ctx_type = params.ctx_type; + cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? 
hparams.rope_freq_base_train : params.rope_freq_base; @@ -84,7 +85,17 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - cparams.ctx_type = params.ctx_type; + cparams.ctx_other = nullptr; + + // TODO: more generic + if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + if (params.ctx_other == nullptr) { + // TODO: change from runtime_error to llama_exception to avoid printing error message + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); + } + + cparams.ctx_other = params.ctx_other; + } // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later @@ -300,10 +311,11 @@ llama_context::llama_context( // init the memory module if (!hparams.vocab_only) { llama_memory_params params_mem = { - /*.type_k =*/ params.type_k, - /*.type_v =*/ params.type_v, - /*.swa_full =*/ params.swa_full, - /*.ctx_type= */ cparams.ctx_type, + /*.type_k =*/ params.type_k, + /*.type_v =*/ params.type_v, + /*.swa_full =*/ params.swa_full, + /*.ctx_type =*/ cparams.ctx_type, + /*.mem_other =*/ llama_get_memory(cparams.ctx_other), }; memory.reset(model.create_memory(params_mem, cparams)); @@ -341,7 +353,7 @@ llama_context::llama_context( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = model.n_devices() > 1 && - model.n_gpu_layers() > model.hparams.n_layer && + model.n_gpu_layers() > model.hparams.n_layer_all && model.split_mode() == LLAMA_SPLIT_MODE_LAYER && cparams.offload_kqv && !model.has_tensor_overrides(); @@ -930,7 +942,7 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) { throw std::runtime_error("no nextn embeddings"); } - const uint32_t n_embd = model.hparams.n_embd; + const uint32_t n_embd = model.hparams.n_embd_out(); if (!cparams.embeddings_nextn_masked) { // unmasked: 
nextn rows are stored densely, indexed by raw token position. @@ -1645,7 +1657,7 @@ int llama_context::encode(const llama_batch & batch_inp) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd = hparams.n_embd_out(); GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_nextn.size); ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn.data, 0, n_tokens*n_embd*sizeof(float)); } @@ -2116,7 +2128,7 @@ int llama_context::decode(const llama_batch & batch_inp) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd = hparams.n_embd_out(); float * embd_nextn_out = embd_nextn.data + offset*n_embd; GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_nextn.size); @@ -2209,7 +2221,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; @@ -2228,12 +2239,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; - embd_nextn.size = has_embd_nextn ? n_embd*n_outputs_max : 0; + embd_nextn.size = has_embd_nextn ? n_embd_out*n_outputs_max : 0; if (has_embd_nextn && !cparams.embeddings_nextn_masked) { // unmasked: nextn row exists for every token in the batch, not just // those flagged via batch.logits[i] -> size by token count instead. - embd_nextn.size = (size_t) n_embd * n_batch; + embd_nextn.size = (size_t) n_embd_out * n_batch; } // Allocate backend sampling output buffers if there are backend samplers configured. 
@@ -2559,7 +2570,7 @@ llm_graph_cb llama_context::graph_get_cb() const { // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; + const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all; if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); @@ -3584,6 +3595,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.ctx_other =*/ nullptr, }; return result; @@ -3625,7 +3637,7 @@ llama_context * llama_init_from_model( if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) { const uint32_t blck_size = ggml_blck_size(params.type_k); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { if (model->hparams.n_embd_head_k(il) % blck_size != 0) { LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n", __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il)); @@ -3636,7 +3648,7 @@ llama_context * llama_init_from_model( if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) { const uint32_t blck_size = ggml_blck_size(params.type_v); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { if (model->hparams.n_embd_head_v(il) % blck_size != 0) { LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n", __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il)); @@ -3658,12 +3670,11 @@ llama_context * llama_init_from_model( } if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && - 
model->hparams.nextn_predict_layers == 0) { + model->hparams.n_layer_nextn == 0) { LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); return nullptr; } - try { auto * ctx = new llama_context(*model, params); return ctx; @@ -3802,6 +3813,14 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } +llama_memory_t llama_get_memory(const struct llama_context * ctx) { + if (!ctx) { + return nullptr; + } + + return ctx->get_memory(); +} + float * llama_get_embeddings_nextn(llama_context * ctx) { ctx->synchronize(); @@ -3865,7 +3884,7 @@ struct ggml_cgraph * llama_graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs) { - auto * memory = ctx->get_memory(); + auto memory = ctx->get_memory(); llama_memory_context_ptr mctx; if (memory) { mctx = memory->init_full(); @@ -3905,10 +3924,6 @@ int32_t llama_set_adapter_cvec( // memory // -llama_memory_t llama_get_memory(const struct llama_context * ctx) { - return ctx->get_memory(); -} - void llama_memory_clear(llama_memory_t mem, bool data) { if (!mem) { return; @@ -4219,3 +4234,7 @@ void llama_opt_epoch( llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) { return ctx->memory_breakdown(); } + +llama_context * llama_get_ctx_other(struct llama_context * ctx) { + return ctx->get_cparams().ctx_other; +} diff --git a/src/llama-context.h b/src/llama-context.h index 92ba4081e89..a48e8fcca82 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -6,6 +6,7 @@ #include "llama-graph.h" #include "llama-adapter.h" #include "llama-impl.h" +#include "llama-memory.h" #include "ggml-cpp.h" #include "ggml-opt.h" @@ -285,7 +286,7 @@ struct llama_context { bool dflash_decoder_ctx = false; - std::unique_ptr memory; + llama_memory_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) buffer_view logits = {nullptr, 0}; diff --git a/src/llama-cparams.h 
b/src/llama-cparams.h index 1cba534edaf..7e324dbebf1 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -50,4 +50,6 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + llama_context * ctx_other; }; diff --git a/src/llama-ext.h b/src/llama-ext.h index 7ad6125fad3..bd74544129b 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -100,3 +100,5 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); + +LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ffb21dac728..11e0dc036f2 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -508,7 +508,7 @@ static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break; }; - LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); + LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__); LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__); @@ -676,18 +676,18 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { if (self_k_idxs && self_k_idxs->buffer) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { 
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); - - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); } @@ -716,18 +716,18 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { if (self_k_idxs && self_k_idxs->buffer) { res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + return res; } @@ -867,7 +867,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); + } + if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) { attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); } @@ -875,7 +877,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs_swa && 
inp_attn->self_k_idxs_swa->buffer) { attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch); attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); + } + if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) { attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); } @@ -921,18 +925,18 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; @@ -1116,7 +1120,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cparams (params.cparams), ubatch (params.ubatch), n_embd (hparams.n_embd), - n_layer (hparams.n_layer), + n_layer (hparams.n_layer()), + n_layer_nextn (hparams.n_layer_nextn), n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head (hparams.n_head()), @@ -1971,7 +1976,12 @@ 
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { res->t_inp_embd = cur; // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { + // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be + // multimodal inputs that should not be scaled. + if (ubatch.token && hparams.f_embedding_scale != 0.0f) { + if (!ggml_is_contiguous(cur)) { + cur = ggml_cont(ctx0, cur); + } cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); } diff --git a/src/llama-graph.h b/src/llama-graph.h index 21bb80a9564..fa77a47321d 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -820,6 +820,7 @@ struct llm_graph_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_layer_nextn; const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 087afec55c6..2bf57687382 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -7,31 +7,38 @@ void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0); } } else { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); } } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_swa_impl[il] = false; + } } -// TODO: implement -//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { -// if (dense_first) { -// for (uint32_t il = 0; il < n_layer; ++il) { -// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); -// } -// } else { -// for (uint32_t il = 0; il < n_layer; ++il) { -// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); -// } -// } -//} +void 
llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { + if (dense_first) { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); + } + } else { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + } + } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_recr_impl[il] = false; + } +} bool llama_hparams::is_swa_any() const { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (is_swa_impl[il]) { return true; } @@ -41,7 +48,7 @@ bool llama_hparams::is_swa_any() const { } uint32_t llama_hparams::n_head(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_arr[il]; } @@ -49,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const { } uint32_t llama_hparams::n_head_kv(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_kv_arr[il]; } @@ -57,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const { } uint32_t llama_hparams::n_ff(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_ff_arr[il]; } @@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { } uint32_t llama_hparams::n_rot(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_rot_swa : n_rot_full; } @@ -84,6 +91,10 @@ uint32_t llama_hparams::n_rot(uint32_t il) const { } uint32_t llama_hparams::n_embd_inp() const { + if (n_embd_inp_impl > 0) { + return n_embd_inp_impl; + } + uint32_t n_embd_inp = n_embd; if (n_deepstack_layers > 0) { @@ -98,7 +109,7 @@ uint32_t llama_hparams::n_embd_out() const { } uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? 
n_embd_head_k_swa : n_embd_head_k_full; } @@ -106,7 +117,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { } uint32_t llama_hparams::n_embd_head_v(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full; } @@ -127,7 +138,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { bool llama_hparams::is_n_embd_k_gqa_variable() const { const uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_k_gqa(il)) { return true; } @@ -138,7 +149,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const { bool llama_hparams::is_n_embd_v_gqa_variable() const { const uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_v_gqa(il)) { return true; } @@ -149,7 +160,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const { uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_k_gqa(il)); } @@ -158,7 +169,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t llama_hparams::n_embd_v_gqa_max() const { uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_v_gqa(il)); } @@ -207,11 +218,11 @@ uint32_t llama_hparams::n_embd_s() const { } bool llama_hparams::is_recr(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_recr_impl[il]; } - GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } uint32_t llama_hparams::n_pos_per_embd() const { @@ -219,11 +230,11 @@ uint32_t llama_hparams::n_pos_per_embd() const { } bool 
llama_hparams::is_swa(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa_impl[il]; } - GGML_ABORT("fatal error"); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } bool llama_hparams::is_mla() const { @@ -242,12 +253,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const { } bool llama_hparams::has_kv(uint32_t il) const { - if (kv_only_nextn) { - // MTP head: only the trailing nextn_predict_layers blocks own a KV cache; - // the leading trunk blocks are not executed in this graph. - return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers); - } - if (n_layer_kv_from_start >= 0) { if (il < (uint32_t) n_layer_kv_from_start) { return true; @@ -260,16 +265,8 @@ bool llama_hparams::has_kv(uint32_t il) const { return true; } -uint32_t llama_hparams::n_layer_kv() const { - uint32_t res = 0; - - for (uint32_t il = 0; il < n_layer; ++il) { - if (has_kv(il)) { - res++; - } - } - - return res; +uint32_t llama_hparams::n_layer() const { + return n_layer_all - n_layer_nextn; } bool llama_hparams::use_mrope() const { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6b9fd546868..53cc2d1938d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -48,12 +48,15 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_layer; - int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + uint32_t n_layer_all; + uint32_t n_layer_nextn = 0; uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; + // TODO: this needs to be reworked + int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + // different head size for full_attention and SWA layers uint32_t n_embd_head_k_full; // dimension of keys (d_k). 
d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head @@ -96,9 +99,6 @@ struct llama_hparams { uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t moe_every_n_layers = 0; uint32_t moe_latent_size = 0; - uint32_t nextn_predict_layers = 0; - - bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches) float f_norm_eps; float f_norm_rms_eps; @@ -185,6 +185,9 @@ struct llama_hparams { // for Classifiers uint32_t n_cls_out = 1; + // input embedding dimension (0 = use n_embd) + uint32_t n_embd_inp_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -219,6 +222,12 @@ struct llama_hparams { uint32_t indexer_top_k = 0; // qwen3vl deepstack + // When parsed from GGUF, this implies the first N layers consume the first + // N deepstack embeddings. Use deepstack_mapping_arr if you need a more + // complex mapping. If using deepstack_mapping_arr, also make sure to set + // n_deepstack_layers to the number of unique deepstack layers so that + // n_embd_imp is accurate (see granite.cpp). 
+ // TODO: can be expressed via the `new n_embd_inp_impl` and remove this param uint32_t n_deepstack_layers = 0; // DFlash draft model @@ -227,6 +236,11 @@ struct llama_hparams { uint32_t dflash_mask_token_id = 0; uint32_t dflash_n_target_features = 0; + // deepstack layer array (Granite4 Vision) + // -1 => no deepstack + // >=0 => input embedding index for deepstack injection + std::array deepstack_mapping_arr; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; @@ -278,8 +292,7 @@ struct llama_hparams { bool is_swa(uint32_t il) const; - // TODO: implement - //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); + void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); // whether or not the given layer is recurrent (for hybrid models) bool is_recr(uint32_t il) const; @@ -335,8 +348,8 @@ struct llama_hparams { bool has_kv(uint32_t il) const; - // number of layers for which has_kv() returns true - uint32_t n_layer_kv() const; + // number of effective layers (excludes nextn layers) + uint32_t n_layer() const; // note that this function uses different SWA parameters from those in the hparams // note: inlined on purpose for performance reasons diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp index e44004b5586..916ab653756 100644 --- a/src/llama-kv-cache-dsa.cpp +++ b/src/llama-kv-cache-dsa.cpp @@ -32,7 +32,7 @@ llama_kv_cache_dsa::llama_kv_cache_dsa( kv_mla = std::make_unique( model, model.hparams, type_k, type_v, v_trans, offload, unified, kv_size, n_seq_max, n_pad, - n_swa, swa_type, filter, reuse); + n_swa, swa_type, nullptr, filter, reuse, nullptr); // we use llama_kv_cache for caching indexer keys // by hand-tweaking some hparams we fool it to create @@ -49,7 +49,7 @@ llama_kv_cache_dsa::llama_kv_cache_dsa( kv_lid = std::make_unique( model, hparams_lid, type_k, type_v, v_trans, offload, unified, kv_size, n_seq_max, n_pad, - n_swa, swa_type, filter, reuse); + n_swa, swa_type, nullptr, filter, reuse, 
nullptr); } void llama_kv_cache_dsa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 9b9f1790363..aa1b1b72ebe 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -23,8 +23,10 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) { + const layer_reuse_cb & reuse, + const layer_share_cb & share) : hparams(model.hparams), unified(unified) { // chain filters const layer_filter_cb filter_base = [&](int32_t il) { @@ -59,17 +61,27 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); + llama_memory_t mem_other_base = nullptr; + if (mem_other) { + mem_other_base = static_cast(mem_other)->get_base(); + } + + llama_memory_t mem_other_swa = nullptr; + if (mem_other) { + mem_other_swa = static_cast(mem_other)->get_swa(); + } + kv_base = std::make_unique( model, hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse); + 0, LLAMA_SWA_TYPE_NONE, mem_other_base, filter_base, reuse, share); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type, filter_swa, reuse); + hparams.n_swa, hparams.swa_type, mem_other_swa, filter_swa, reuse, share); } void llama_kv_cache_iswa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h index 70ab22f0d60..dfafc1ef510 100644 --- a/src/llama-kv-cache-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -25,8 +25,10 @@ class llama_kv_cache_iswa : public llama_memory_i { uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_other, const 
layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache_iswa() = default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index e83f7884fbd..8d53bf0ef44 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -90,14 +90,16 @@ llama_kv_cache::llama_kv_cache( uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : + const layer_reuse_cb & reuse, + const layer_share_cb & share) : model(model), hparams(hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { GGML_ASSERT(kv_size % n_pad == 0); - const uint32_t n_layer_kv = hparams.n_layer_kv(); + const uint32_t n_layer = hparams.n_layer_all; // define a comparator for the buft -> ctx map to ensure that the order is well-defined: struct ggml_backend_buft_comparator { @@ -112,7 +114,7 @@ llama_kv_cache::llama_kv_cache( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -160,7 +162,9 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + other = static_cast(mem_other); + + for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); continue; @@ -171,6 +175,24 @@ llama_kv_cache::llama_kv_cache( continue; } + if (share && other) { + const int32_t il_share = share(il); + + if (il_share >= 0) { + const auto & layer_share = other->layers[other->map_layer_ids[il_share]]; + + LLAMA_LOG_WARN("%s: layer %3d: sharing with layer %d. 
k = %p, v = %p\n", __func__, il, il_share, + layer_share.k->data, layer_share.v->data); + + map_layer_ids[il] = layers.size(); + + layers.push_back(layer_share); + layers.back().il = il; + + continue; + } + } + if (n_embd_head_k_all == 0) { n_embd_head_k_all = (int32_t) hparams.n_embd_head_k(il); } else if (n_embd_head_k_all > 0 && n_embd_head_k_all != (int32_t) hparams.n_embd_head_k(il)) { @@ -230,7 +252,7 @@ llama_kv_cache::llama_kv_cache( if (reuse) { LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer; il++) { const int32_t il_reuse = reuse(il); if (il_reuse < 0) { @@ -282,28 +304,37 @@ llama_kv_cache::llama_kv_cache( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); - const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false; - if (attn_rot_disable) { - LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__); - } + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + n_embd_head_k_all = other->n_embd_head_k_all; + n_embd_head_v_all = other->n_embd_head_v_all; - attn_rot_k = - !attn_rot_disable && - n_embd_head_k_all > 0 && - ggml_is_quantized(type_k) && - hparams.n_embd_head_k() % 64 == 0; + attn_rot_k = other->attn_rot_k; + attn_rot_v = other->attn_rot_v; + } else { + const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); + const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? 
atoi(LLAMA_ATTN_ROT_DISABLE) : false; + if (attn_rot_disable) { + LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__); + } - // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer - if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) { - attn_rot_k = true; - } + attn_rot_k = + !attn_rot_disable && + n_embd_head_k_all > 0 && + ggml_is_quantized(type_k) && + hparams.n_embd_head_k() % 64 == 0; + + // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer + if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) { + attn_rot_k = true; + } - attn_rot_v = - !attn_rot_disable && - n_embd_head_v_all > 0 && - ggml_is_quantized(type_v) && - hparams.n_embd_head_v() % 64 == 0; + attn_rot_v = + !attn_rot_disable && + n_embd_head_v_all > 0 && + ggml_is_quantized(type_v) && + hparams.n_embd_head_v() % 64 == 0; + } LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_k, n_embd_head_k_all); LLAMA_LOG_INFO("%s: attn_rot_v = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_v, n_embd_head_v_all); @@ -347,6 +378,11 @@ void llama_kv_cache::clear(bool data) { } bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return true; + } + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { @@ -410,6 +446,11 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { } void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); @@ -497,6 +538,11 @@ void 
llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll } void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -519,6 +565,11 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { } void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1"); @@ -564,6 +615,11 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll } void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1"); @@ -598,6 +654,11 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in } llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_min(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -606,6 +667,11 @@ llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { } llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_max(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -746,6 +812,11 @@ 
llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vectorget_sched(); @@ -1021,6 +1092,12 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, } void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + v_cells = other->v_cells; + return; + } + // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -1831,6 +1908,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + GGML_ASSERT(!other); + auto * ctx = res->get_ctx(); auto * gf = res->get_gf(); @@ -1876,6 +1956,11 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co } void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); io.write(&n_stream, sizeof(n_stream)); @@ -1941,6 +2026,11 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla } void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 649269af6dd..f5ace6ae350 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -98,7 +98,7 @@ class llama_kv_cache : public llama_memory_i { // likely through `struct llama_memory_params` llama_kv_cache( const llama_model & model, - const llama_hparams & hparams, + const 
llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -109,8 +109,10 @@ class llama_kv_cache : public llama_memory_i { uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache() = default; @@ -264,6 +266,9 @@ class llama_kv_cache : public llama_memory_i { // note: this is not part of the KV state and it's only used to speed-up the find_slot() method std::vector v_heads; + // TODO: temporary until we refactor to be able to share the same cells between 2 kv caches [TAG_KV_CACHE_SHARE_CELLS] + llama_kv_cache * other; + std::vector v_cells; // maps from a sequence id to a stream id diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a242079b406..c7d4bcd413e 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -43,9 +43,11 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_seq_max, n_ubatch, n_pad, + nullptr, filter_attn == nullptr ? [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 66ec3fd6d55..f2d49cbce54 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -44,9 +44,11 @@ llama_memory_hybrid::llama_memory_hybrid( n_pad, n_swa, swa_type, + nullptr, filter_attn == nullptr ? 
[&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index ec5dc5835dd..6a4892fb471 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent( uint32_t n_seq_max, uint32_t n_rs_seq, const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) { - const int32_t n_layer = hparams.n_layer; + const int32_t n_layer = hparams.n_layer(); head = 0; size = mem_size; @@ -863,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std:: void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t s_trans = 0; - const uint32_t n_layer = hparams.n_layer; + const uint32_t n_layer = hparams.n_layer(); io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); @@ -1047,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell io.read(&s_trans, sizeof(s_trans)); io.read(&n_layer, sizeof(n_layer)); - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + if (n_layer != hparams.n_layer()) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer()); return false; } if (cell_count > size) { diff --git a/src/llama-memory.h b/src/llama-memory.h index 4ad1612e45b..db825396645 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -23,6 +23,8 @@ struct llama_memory_params { bool swa_full; llama_context_type ctx_type; + + llama_memory_t mem_other; }; enum llama_memory_status { @@ -76,6 +78,8 @@ struct llama_memory_i { // return negative value to indicate that the layer il should not reuse memory using layer_reuse_cb = std::function; + using layer_share_cb = 
std::function; + virtual ~llama_memory_i() = default; // split the input batch into a set of ubatches and verify that they can fit into the cache diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 6fcf9185a91..16868e650d0 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -394,6 +394,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { @@ -1052,10 +1053,10 @@ struct ggml_tensor * llama_model_loader::create_tensor( if (it == ctx_map.end()) { // one ggml context per buffer type int max_n_tensors = n_tensors; - max_n_tensors += 1; // duplicated output tensor - max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors + max_n_tensors += 1; // duplicated output tensor + max_n_tensors += hparams.n_layer()*2; // duplicated rope freq tensors if (files.empty()) { - max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses + max_n_tensors += hparams.n_layer()*256; // this should be well above what any model actually uses } const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 26fda1abfae..67d4a9df0f0 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -77,7 +77,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) { template void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) { GGML_ASSERT(model != nullptr || !per_layer); - const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size(); + const size_t n_values = per_layer ? 
size_t(model->hparams.n_layer()) : value.size(); GGML_ASSERT(n_values <= value.size()); if (n_values == 0) { @@ -206,7 +206,7 @@ void llama_model_saver::add_kv_from_model() { if (hparams.n_embd_out_impl > 0) { add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl); } - add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); + add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); @@ -227,8 +227,9 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); - add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers); + add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn); add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); + add_kv(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index df270e80719..e0760b84ceb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -139,6 +139,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_gemma3n(params); case LLM_ARCH_GEMMA4: return new llama_model_gemma4(params); + case LLM_ARCH_GEMMA4_ASSISTANT: + return new llama_model_gemma4_assistant(params); case LLM_ARCH_GEMMA_EMBEDDING: return new llama_model_gemma_embedding(params); case LLM_ARCH_STARCODER2: @@ -400,7 +402,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str rotation = get_il_eff(il) % ud->n_devices; } else { il = 0; - rotation = hparams.n_layer % ud->n_devices; 
+ rotation = hparams.n_layer() % ud->n_devices; } const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str()); if (tensor_axis_0 == nullptr) { @@ -555,10 +557,12 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str }; auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector> & segments) -> std::vector { + // for better performance it may make sense to round up blck_size to a higher power of 2 so that more efficient kernels can be used if (hparams.is_recr(il)) { // linear attention - const int64_t head_dim = hparams.ssm_d_state; - const int64_t granularity_qkv = std::lcm(blck_size, head_dim); + const int64_t head_dim = hparams.ssm_d_state; + const int64_t blck_size_perf = std::lcm(blck_size, 128); + const int64_t granularity_qkv = std::lcm(blck_size_perf, head_dim); if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) { return std::vector(segments.size(), granularity_qkv); @@ -580,17 +584,24 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str // regular attention const uint32_t n_gqa = hparams.n_gqa(il); const uint32_t n_embd_q = n_gqa * hparams.n_embd_head_k(il); + + // to handle head sizes like 80, only increase granularity while it doesn't cause underutilization + int64_t blck_size_perf = blck_size; + while (blck_size_perf < 128 && blck_size_perf*ud->n_devices < n_embd_q) { + blck_size_perf *= 2; + } + if (std::regex_match(tensor_name, pattern_attn_sinks)) { GGML_ASSERT(segments.size() == 1); - return {std::lcm(n_embd_q, blck_size)/n_embd_q * n_gqa}; + return {std::lcm(n_embd_q, blck_size_perf)/n_embd_q * n_gqa}; } - const int64_t granularity_q = std::lcm(n_embd_q, blck_size); + const int64_t granularity_q = std::lcm(n_embd_q, blck_size_perf); if 
(std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_q_bias)) { GGML_ASSERT(segments.size() == 1); // some models have Q gate tensors, for those cases the granularity needs to be doubled: if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) { - return {std::lcm(2*n_embd_q, blck_size)}; + return {std::lcm(2*n_embd_q, blck_size_perf)}; } return {granularity_q}; } @@ -615,8 +626,9 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str // FFN if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) || std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) { + const int64_t blck_size_perf = std::lcm(blck_size, 128); GGML_ASSERT(segments.size() == 1); - return {blck_size}; + return {blck_size_perf}; } // everything else @@ -629,7 +641,6 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str tensor_config tc = get_tensor_config(); split_state.axis = tc.axis; if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) { - const int64_t ne_full = tensor->ne[split_state.axis]; const int64_t blck_size = ggml_blck_size(tc.tensor_axis_0->type); const float * tensor_split = ud->model->tensor_split(); std::vector tensor_split_scan; @@ -646,7 +657,6 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str const int64_t ne_s = segments[is].first; const uint32_t nr_s = segments[is].second; const int64_t g_s = granularity[is]; - GGML_ASSERT(ne_full % g_s == 0); int64_t low = 0; size_t j = 0; for (; j < ud->n_devices - 1; j++) { @@ -1036,7 +1046,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); 
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); @@ -1091,13 +1101,16 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); + + // Populate deepstack_mapping_arr - initialized to -1 (no deepstack) + std::fill(hparams.deepstack_mapping_arr.begin(), hparams.deepstack_mapping_arr.end(), -1); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false); bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -1196,7 +1209,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { const auto & use_mlock = params.use_mlock; const auto & tensor_split = params.tensor_split; - const int n_layer = hparams.n_layer; + const int n_layer_all = hparams.n_layer_all; const int n_gpu_layers = this->n_gpu_layers(); const bool use_mmap_buffer = true; @@ -1253,10 +1266,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { splits[i] /= 
split_sum; } - const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); + const int i_gpu_start = std::max(n_layer_all + 1 - n_gpu_layers, 0); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer_all + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); + const bool is_swa = il < n_layer_all && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -1272,13 +1285,13 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list }; // assign the repeating layers to the devices according to the splits - pimpl->dev_layer.resize(n_layer); - for (int il = 0; il < n_layer; ++il) { + pimpl->dev_layer.resize(n_layer_all); + for (int il = 0; il < n_layer_all; ++il) { pimpl->dev_layer[il] = get_layer_buft_list(il); } // assign the output layer - pimpl->dev_output = get_layer_buft_list(n_layer); + pimpl->dev_output = get_layer_buft_list(n_layer_all); const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED; @@ -1294,14 +1307,14 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { throw std::runtime_error("model has expert layers but no expert layers are used"); } - layers.resize(n_layer); + layers.resize(n_layer_all); // call the per-model loading function load_arch_tensors(ml); // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. 
NVFP4 scale2) // this avoids having to add scale loading to every architecture - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { auto & layer = layers[i]; // attention weight scales (per-tensor, shape {1}) @@ -1559,7 +1572,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + const int n_gpu = std::min(n_gpu_layers, n_layer_all); int n_repeating = n_gpu; if (n_repeating > 0) { @@ -1568,8 +1581,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_backend_supported_layers = n_layer_all + 1; + const int max_offloadable_layers = n_layer_all + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); } @@ -1638,7 +1651,8 @@ const float * llama_model::tensor_split() const { } uint32_t llama_model::n_gpu_layers() const { - return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1; + // note: plus 1 for the "output" layer + return params.n_gpu_layers >= 0 ? 
params.n_gpu_layers : hparams.n_layer_all + 1; } llama_split_mode llama_model::split_mode() const { @@ -1671,10 +1685,10 @@ uint64_t llama_model::n_elements() const { void llama_model::print_info() const { const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train); - auto print_f = [](const std::function & f, uint32_t n) { + auto print_f = [](const std::function & f, uint32_t n) { bool is_var = false; - std::vector v; + std::vector v; for (uint32_t i = 0; i < n; ++i) { v.push_back(f(i)); if (v[i] != v[0]) { @@ -1707,19 +1721,21 @@ void llama_model::print_info() const { if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_embd_out = %u\n", __func__, hparams.n_embd_out()); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); + LLAMA_LOG_INFO("%s: n_layer_all = %u\n", __func__, hparams.n_layer_all); + LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", 
__func__, hparams.n_embd_head_k_full); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); - LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -1727,7 +1743,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); - LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); @@ 
-1748,6 +1764,14 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + if (arch == LLM_ARCH_GRANITE && + std::any_of(hparams.deepstack_mapping_arr.begin(), + hparams.deepstack_mapping_arr.end(), + [](const auto & entry) { return entry >= 0; })) { + LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, + print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, + hparams.n_layer_all).c_str()); + } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]); @@ -1854,7 +1878,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + LLAMA_LOG_INFO("%s: n_layer_nextn = %d\n", __func__, hparams.n_layer_nextn); } if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { @@ -2036,22 +2060,21 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; if (arch == LLM_ARCH_FALCON_H1) { - filter_attn = [&](int32_t) { return true; }; - filter_recr = [&](int32_t) { return true; }; + filter_attn = [&](uint32_t) { return true; }; + filter_recr = [&](uint32_t) { return true; }; } else if (arch 
== LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { - filter_attn = [&](int32_t il) { + filter_attn = [&](uint32_t il) { return !hparams.is_recr(il) && hparams.n_ff(il) == 0; }; - filter_recr = [&](int32_t il) { + filter_recr = [&](uint32_t il) { return hparams.is_recr(il) && hparams.n_ff(il) == 0; }; } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter_attn = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && !hparams.is_recr(il); + filter_attn = [&](uint32_t il) { + return il < hparams.n_layer() && !hparams.is_recr(il); }; - filter_recr = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && hparams.is_recr(il); + filter_recr = [&](uint32_t il) { + return il < hparams.n_layer() && hparams.is_recr(il); }; } @@ -2096,13 +2119,16 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_recr */ std::move(filter_recr)); } } else { - llama_memory_i::layer_reuse_cb reuse = nullptr; llama_kv_cache::layer_filter_cb filter = nullptr; + llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_share_cb share = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { - reuse = [&](int32_t il) { - if (il >= (int32_t) hparams.n_layer_kv_from_start) { - return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); + reuse = [&](uint32_t il) { + GGML_ASSERT(hparams.n_layer_kv_from_start >= 2); + + if (il >= (uint32_t)hparams.n_layer_kv_from_start) { + return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 
2 : 1); } return -1; @@ -2110,27 +2136,67 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } if (mtp_on_hybrid_qwen35) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; + } + + if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) { + if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) { + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; + } else { + filter = [&](uint32_t il) { return il < hparams.n_layer(); }; + } } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); - res = new llama_kv_cache_iswa( - *this, - params.type_k, - params.type_v, - !cparams.flash_attn, - cparams.offload_kqv, - params.swa_full, - cparams.kv_unified, - cparams.n_ctx_seq, - cparams.n_seq_max, - cparams.n_ubatch, - 1, - filter, - reuse); + if (arch == LLM_ARCH_GEMMA4_ASSISTANT) { + llama_memory_t mem_other = llama_get_memory(cparams.ctx_other); + + share = [&](int32_t il) { + const llama_model * model_other = llama_get_model(cparams.ctx_other); + + if (hparams.is_swa(il)) { + return llama_model_n_layer(model_other) - 2; + } + + return llama_model_n_layer(model_other) - 1; + }; + + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + mem_other, + filter, + reuse, + share); + } else { + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + nullptr, + filter, + reuse, + share); + } } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -2147,7 +2213,9 @@ llama_memory_i * llama_model::create_memory(const 
llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, + nullptr, filter, + nullptr, nullptr); } } @@ -2235,7 +2303,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) { } int32_t llama_model_n_layer(const llama_model * model) { - return model->hparams.n_layer; + return model->hparams.n_layer(); } int32_t llama_model_n_head(const llama_model * model) { @@ -2399,6 +2467,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: case LLM_ARCH_GEMMA4: + case LLM_ARCH_GEMMA4_ASSISTANT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: diff --git a/src/llama-model.h b/src/llama-model.h index da769b41586..12df43a1faf 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -552,6 +552,10 @@ struct llama_model { struct ggml_tensor * output_s = nullptr; struct ggml_tensor * output_in_s = nullptr; + // NextN/MTP model-level projections + struct ggml_tensor * nextn_proj_pre = nullptr; + struct ggml_tensor * nextn_proj_post = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; @@ -704,7 +708,9 @@ const char * llm_type_name(llm_type type); // convenience macro for loading local variables for load_tensors() in llama_model_base // note: cast to int64_t since we will use these for the tensor dimensions #define LLAMA_LOAD_LOCALS \ - const int n_layer = hparams.n_layer; GGML_UNUSED(n_layer); \ + const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \ + const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \ + const int n_layer_nextn = hparams.n_layer_nextn; GGML_UNUSED(n_layer_nextn); \ const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \ const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \ const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \ diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e531f9a3da5..f7ed686af61 100644 --- a/src/llama-quant.cpp +++ 
b/src/llama-quant.cpp @@ -854,7 +854,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vectorhparams.n_embd = desc->n_embd; model->hparams.n_embd_head_k_full = desc->n_embd_head_k; model->hparams.n_embd_head_v_full = desc->n_embd_head_v; - model->hparams.n_layer = desc->n_layer; + model->hparams.n_layer_all = desc->n_layer; model->hparams.n_expert = desc->n_expert; for (uint32_t i = 0; i < desc->n_layer; i++) { diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index a7c77ee5d28..063b214256e 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 56: type = LLM_TYPE_6B; break; case 32: type = LLM_TYPE_26B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index bec7136521c..6dfb8905fbe 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -2,12 +2,13 @@ void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer); - switch (hparams.n_layer) { + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer()); + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = 
LLM_TYPE_UNKNOWN; } diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index d086c4717ff..9536e7c5d42 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); // Arcee uses the same structure as Llama - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index 27deadffeb7..09ee0f752f0 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 128) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 35: type = LLM_TYPE_10B_128x3_66B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 9bd04127b25..b38b2064785 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4d26081cd5d..585f3614174 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -2,7 +2,7 @@ void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff 
--git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index fe1ae10864b..7faf73c835b 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_16B; break; case 88: type = LLM_TYPE_290B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index 2f0d44a6259..5000e9c6db8 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 20: type = LLM_TYPE_16B_A1B; break; - case 21: type = LLM_TYPE_16B_A1B; break; case 32: type = LLM_TYPE_100B_A6B; break; - case 33: type = LLM_TYPE_100B_A6B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -39,9 +35,9 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for 
bailingmoe2"); GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2"); - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); @@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/bert.cpp b/src/models/bert.cpp index 3c28f419ccf..53ce29f23ca 100644 --- a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -1,9 +1,9 @@ #include "models.h" void 
llama_model_bert::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 3: type = LLM_TYPE_17M; break; // bge-micro case 6: diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp index 7e8125deec4..c8330274580 100644 --- a/src/models/bitnet.cpp +++ b/src/models/bitnet.cpp @@ -3,7 +3,7 @@ void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index 30b0f3d07d0..609d2ddf998 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -3,7 +3,7 @@ void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 30: switch (hparams.n_embd) { diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 4bceaefd63b..4f45acecf84 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) { hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_34B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index 6766fa71c15..7ae5b938fde 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -2,7 +2,8 @@ void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) { 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: { if (hparams.n_head(0) == 16) { type = LLM_TYPE_1_5B; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index 274dd3342a7..de53bb98184 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -2,7 +2,8 @@ void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 42: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 2e231bb3f93..750f57a394e 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -2,7 +2,8 @@ void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index a514cf88fc6..61a5945a194 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { uint32_t swa_period = 4; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; @@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = 
LLM_TYPE_UNKNOWN; } diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index adf7fcaa20f..94a46188bb8 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -3,7 +3,8 @@ void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index af71c775365..4f5ac4d06a4 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -1,14 +1,14 @@ #include "models.h" void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) { -ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); -switch (hparams.n_layer) { - case 40: type = LLM_TYPE_16x12B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 40: type = LLM_TYPE_16x12B; break; + default: type = LLM_TYPE_UNKNOWN; + } } - } void llama_model_dbrx::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 567e3535276..cdfcf29e02f 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -2,7 +2,8 @@ void llama_model_deci::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; case 162: type = LLM_TYPE_405B; break; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 1fe54adc13e..a9e8bc51403 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -5,7 
+5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false); // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); + const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); @@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { // for compatibility with existing DeepSeek V2 and V2.5 GGUFs // that have no expert_gating_func model parameter set - if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) { + if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) { // GLM 4.7 Lite hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } else { @@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { hparams.f_attn_temp_offset = 0.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_16B; break; case 47: type = LLM_TYPE_30B_A3B; break; case 60: type = LLM_TYPE_236B; break; @@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); - int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < effective_n_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p Qcur, Kcur, Vcur, nullptr, nullptr, 
nullptr, kq_scale, il); } } - if (il == effective_n_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp index f9e4c98785c..65d31c31b93 100644 --- a/src/models/deepseek2ocr.cpp +++ b/src/models/deepseek2ocr.cpp @@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/deepseek32.cpp b/src/models/deepseek32.cpp index c92ab60d166..9a20e2ce907 100644 --- a/src/models/deepseek32.cpp +++ b/src/models/deepseek32.cpp @@ -31,7 +31,7 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); // Expert gating function - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] @@ -40,13 +40,10 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch 
(hparams.n_layer()) { case 62: type = LLM_TYPE_685B_A37B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -82,9 +79,9 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; @@ -142,7 +139,7 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -205,8 +202,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); - int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < effective_n_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -427,7 +423,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il); } } - if (il == effective_n_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, 
inp_out_ids); } diff --git a/src/models/dflash.cpp b/src/models/dflash.cpp index bee180e1c15..5d14884cfbc 100644 --- a/src/models/dflash.cpp +++ b/src/models/dflash.cpp @@ -238,13 +238,13 @@ void llama_model_dflash::load_arch_hparams(llama_model_loader & ml) { // to receive the BOOL/INT array. Filled with 0 by default so unset slots are dense. std::array pattern{}; if (ml.get_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern, false)) { - const uint32_t n = std::min(pattern.size(), hparams.n_layer); + const uint32_t n = std::min(pattern.size(), hparams.n_layer()); for (uint32_t il = 0; il < n; ++il) { hparams.is_swa_impl[il] = pattern[il] != 0 ? 1u : 0u; } } else { // No per-layer pattern: assume all layers are SWA. - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_swa_impl[il] = 1u; } } @@ -272,7 +272,7 @@ void llama_model_dflash::load_arch_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); // Layers for draft generation - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { auto & layer = layers[il]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), {n_embd}, 0); diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 435d27281c6..07d6ab1b7cd 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_142B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 12ac6f1ce88..abe737c335a 100644 --- 
a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -2,8 +2,9 @@ void llama_model_dream::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // Dream models are primarily 7B with 28 layers - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_7B; break; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index 9b39c605e35..895cf690bd2 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_0_3B; break; case 28: type = LLM_TYPE_21B_A3B; break; case 54: type = LLM_TYPE_300B_A47B; break; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp index ddf13c3028f..0948d7de656 100644 --- a/src/models/eurobert.cpp +++ b/src/models/eurobert.cpp @@ -3,7 +3,7 @@ void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 12) { + if (hparams.n_layer() == 12) { type = LLM_TYPE_SMALL; // 0.2B } } diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 76d91982fc5..5aed9379400 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + 
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_30B_A3B; break; - case 48: - case 49: type = LLM_TYPE_235B_A22B; break; + case 48: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end - if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers)) { + if (i < (int) hparams.n_layer_dense_lead || (i >= n_layer)) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); @@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags); 
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags); @@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers @@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index c7e9960d718..676fb37b5a6 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -3,7 +3,7 @@ void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index b5030eb0545..863268abcef 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -1,7 +1,7 @@ #include "models.h" void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { - if (hparams.n_layer == 64) { // 32B + if (hparams.n_layer() == 64) { // 32B hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; uint32_t swa_period = 4; @@ -15,11 +15,11 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - switch (hparams.n_layer) { + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); + + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_1_2B; break; case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; @@ -40,8 +40,8 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { - const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers; + for (int i = 0; i < n_layer_all; ++i) { + const bool is_nextn = i >= n_layer; int flags = 0; if (is_nextn) { // NextN/MTP layers are preserved in GGUF but are not executed yet. @@ -109,11 +109,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra } ggml_tensor * inp_out_ids = build_inp_out_ids(); - // MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe). 
- const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers); - GGML_ASSERT(n_layer_main > 0); - - for (int il = 0; il < n_layer_main; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers or non-SWA models @@ -149,7 +145,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } - if (il == n_layer_main - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index c130ccdd49e..d6ef2d51986 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -13,7 +13,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) { std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_0_5B; break; case 24: diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index ad546ef2db5..b2ad90b3272 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -3,7 +3,7 @@ void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 60: type = LLM_TYPE_40B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp index 4e07f5f2bda..80ed3b1a460 100644 --- a/src/models/gemma-embedding.cpp +++ b/src/models/gemma-embedding.cpp @@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) { GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd"); 
GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_0_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 1519682fdf6..651cd7e64de 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -3,7 +3,7 @@ void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_2B; break; case 28: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index ae3f9ffb530..2fbfb15a94a 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_2B; break; case 42: type = LLM_TYPE_9B; break; case 46: type = LLM_TYPE_27B; break; diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index 63a2b380e71..690194529e3 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_270M; break; case 26: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_8B; break; // Rnj-1 diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 6ec3a006081..83eb8250aa9 
100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(swa_period); - hparams.n_layer_kv_from_start = 20; - hparams.f_attention_scale = 1.0f; + hparams.n_layer_kv_from_start = 20; + hparams.f_attention_scale = 1.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_E2B; break; case 35: type = LLM_TYPE_E4B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp new file mode 100644 index 00000000000..5b7a25a5aba --- /dev/null +++ b/src/models/gemma4-assistant.cpp @@ -0,0 +1,200 @@ +#include "models.h" + +void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { + hparams.n_embd_inp_impl = hparams.n_embd_out(); + + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + + uint32_t n_kv_shared_layers = 0; + ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); + + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn == hparams.n_layer_all && "n_layer_nextn must be == n_layer_impl"); + + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); +} + +void 
llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + if (n_embd_head_k != n_embd_head_v) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k == n_embd_head_v"); + } + if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa"); + } + if (hparams.n_embd_out() == n_embd) { + throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size"); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + + const int64_t n_embd_backbone = hparams.n_embd_inp(); + nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); + + int rope_freqs_flag = 0; + + for (int i = 0; i < n_layer_nextn; ++i) { + auto & layer = layers[i]; + + const int64_t n_head = hparams.n_head(i); + const int64_t n_embd_head = hparams.n_embd_head_k(i); + const int64_t n_ff = hparams.n_ff(i); + + if (i == 0) { + nextn_proj_pre = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_PRE, "weight", i), { 2*n_embd_backbone, n_embd }, 0); + } + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0); + + if (!hparams.is_swa(i)) { + 
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag); + rope_freqs_flag = TENSOR_DUPLICATED; + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0); + } +} + +std::unique_ptr llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + const int64_t n_embd_backbone = hparams.n_embd_inp(); + + ggml_tensor * inp_tokens; + ggml_tensor * inp_h; + { + auto inp = std::make_unique(n_embd_backbone); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + inp_tokens = inp->tokens; + res->t_inp_tokens = inp->tokens; + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens); + cb(inp->embd, "inp_h", -1); + ggml_set_input(inp->embd); + inp_h = inp->embd; + res->t_inp_embd = inp->embd; + + res->add_input(std::move(inp)); + } + + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + ggml_tensor * x = ggml_get_rows(ctx0, model_other->tok_embd, inp_tokens); + x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); + cb(x, "inp_embd_target", -1); + + ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); + cb(xh, "inp_xh", -1); + + ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_proj_pre, xh); + cb(cur, "pre_proj", -1); + + auto * inp_attn = 
build_attn_inp_kv_iswa(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer_nextn; ++il) { + const bool is_swa = hparams.is_swa(il); + + const int64_t n_embd_head = hparams.n_embd_head_k(il); + const int64_t n_head = hparams.n_head(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + const int n_rot_l = hparams.n_rot(il); + + ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur_norm, "attn_norm", il); + + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, + freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, + Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); + + if (il == n_layer_nextn - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL); + cb(attn_out, "attn_out", il); + + cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + 
LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = ggml_mul(ctx0, cur, model.layers[il].out_scale); + cb(cur, "out_scaled", il); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + ggml_tensor * logits = build_lora_mm(model.output, cur); + cb(logits, "result_output", -1); + res->t_logits = logits; + + ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_proj_post, cur); + cb(h_next, "h_nextn", -1); + res->t_h_nextn = h_next; + + ggml_build_forward_expand(gf, logits); + ggml_build_forward_expand(gf, h_next); +} diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index ab9dd3bbbc1..585854bfe57 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -2,12 +2,12 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); uint32_t n_kv_shared_layers = 0; ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); - hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers; + hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers; hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling) ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); @@ -19,7 +19,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch 
(hparams.n_layer()) { case 30: type = LLM_TYPE_26B_A4B; break; case 35: type = LLM_TYPE_E2B; break; case 42: type = LLM_TYPE_E4B; break; @@ -155,12 +155,14 @@ class llm_graph_input_logits_bias : public llm_graph_input_i { } virtual ~llm_graph_input_logits_bias() = default; - void set_input(const llama_ubatch *) override { + void set_input(const llama_ubatch * /*ubatch*/) override { const int64_t n_vocab = arr.size(); ggml_backend_tensor_set(logits_bias, arr.data(), 0, n_vocab*ggml_element_size(logits_bias)); } - // bool can_reuse(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & /*params*/) override { + return true; + } ggml_tensor * logits_bias = nullptr; // F32 [n_vocab] @@ -301,7 +303,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para } // TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing - if (il == n_layer - 1 && inp_out_ids) { + // keep all rows when extracting unmasked nextn embeddings (MTP target needs the hidden state for every token) + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -416,7 +419,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_per_layer, n_tokens] // TODO @ngxson : improve this - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids); } @@ -459,6 +462,17 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para model.output_norm, nullptr, LLM_NORM_RMS, -1); + // Expose the post-output-norm hidden state (the LM-head input feature) so that + // MTP draft contexts can read it via llama_get_embeddings_nextn_ith() as the + // recurrent h 
input. This matches the reference (transformers/vLLM/SGLang), + // which feeds the drafter the target's post-final-norm hidden state. + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + cb(cur, "result_norm", -1); res->t_embd = cur; diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp index af2b55ef563..11d91312def 100644 --- a/src/models/glm-dsa.cpp +++ b/src/models/glm-dsa.cpp @@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 79: type = LLM_TYPE_744B_A40B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; @@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, 
"weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); } - // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 27654b8cba3..d60e47ddf0c 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { - case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer) + switch (hparams.n_layer()) { + case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open - case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer) + case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 default: type = LLM_TYPE_UNKNOWN; } } @@ -54,9 +51,9 @@ void 
llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { // Load ALL tensors including NextN layer to satisfy total tensor count // but only PROCESS up to last layer (skipping final NextN layer) in forward pass - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git 
a/src/models/glm4.cpp b/src/models/glm4.cpp index 7c242fed298..b4326c5f210 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); // NextN/MTP parameters (GLM-OCR) - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 17: type = LLM_TYPE_1B; break; // GLM-OCR case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; @@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags); // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * 
n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index e2dcc8b1521..45afbccc121 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -2,7 +2,8 @@ void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_SMALL; break; case 24: type = LLM_TYPE_MEDIUM; break; case 36: type = LLM_TYPE_LARGE; break; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 443e35addf2..ed5e8c50da2 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -3,7 +3,8 @@ void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { 
case 6: switch (hparams.n_ff()) { case 512: type = LLM_TYPE_14M; break; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 8740d9fc7d9..eb23095aece 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -19,7 +19,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) { hparams.rope_finetuned = rope_finetuned; // A layer is recurrent IFF the n_head_kv value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } diff --git a/src/models/granite-moe.cpp b/src/models/granite-moe.cpp index 0d89bc1f340..115263c418f 100644 --- a/src/models/granite-moe.cpp +++ b/src/models/granite-moe.cpp @@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/granite.cpp b/src/models/granite.cpp index cda4aa231fa..4a75c5ff3cc 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -1,5 +1,7 @@ #include "models.h" +#include + void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -7,12 +9,33 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false); ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false); + // Granite4 Vision uses array deepstack_mapping + ml.get_arr(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr, false); + + // Count the unique deepstack input indices + 
std::unordered_set unique_deepstack_idxs; + for (const auto val : hparams.deepstack_mapping_arr) { + if (val >= 0) { + unique_deepstack_idxs.insert(val); + } + } + hparams.n_deepstack_layers = unique_deepstack_idxs.size(); + + // Ensure all values are valid (avoid overflow attacks) + for (const auto val : unique_deepstack_idxs) { + if (val > hparams.n_deepstack_layers) { + std::stringstream ss; + ss << "Invalid deepstack index: " << val << " > " << hparams.n_deepstack_layers; + throw std::runtime_error(ss.str()); + } + } + // Granite uses rope_finetuned as a switch for rope, so default to true bool rope_finetuned = true; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes @@ -112,6 +135,20 @@ llama_model_granite::graph::graph( ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + + // Granite Vision 4.1 deepstack: inject the projector stream that + // targets decoder layer `il` before the decoder runs. 
+ // NOTE: skip the first deepstack layer since that's inpL + const auto & deepstack_emb_idx = hparams.deepstack_mapping_arr[il]; + if (il > 0 && deepstack_emb_idx >= 0) { + ggml_tensor * ds = ggml_view_2d(ctx0, + res->t_inp_embd, n_embd, n_tokens, + res->t_inp_embd->nb[1], + deepstack_emb_idx * n_embd * sizeof(float)); + inpL = ggml_add(ctx0, inpL, ds); + cb(inpL, "deepstack_in", il); + } + ggml_tensor * inpSA = inpL; // norm diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 7c46ec1c0f2..42f38af6724 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -26,7 +26,7 @@ void llama_model_grok::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_314B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index 1cab75adc7f..643a448e59a 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index deb3c9671f3..4d55f5e7f31 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_A13B; break; default: type = 
LLM_TYPE_UNKNOWN; } diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f9ee37a24b6..f6cfdfb9458 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -2,7 +2,8 @@ void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index 2ba162605f1..415103ce23a 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_3B; break; case 40: type = LLM_TYPE_13B; break; /* TODO: add variants */ diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index 8966131441c..8610fcc9f82 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -3,7 +3,7 @@ void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; case 68: type = LLM_TYPE_70B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index a62b121b3ee..dba160b014f 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { // TODO: Jamba layers are a bit heterogeneous, so naming this is hard. case 12: // 900M 8x???M case 32: // 51B 16x?B diff --git a/src/models/jina-bert-v2.cpp b/src/models/jina-bert-v2.cpp index 4f8866ece4d..86ff1c84d1a 100644 --- a/src/models/jina-bert-v2.cpp +++ b/src/models/jina-bert-v2.cpp @@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); hparams.f_max_alibi_bias = 8.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jina-bert-v3.cpp b/src/models/jina-bert-v3.cpp index e0527529f56..1c974a6f16c 100644 --- a/src/models/jina-bert-v3.cpp +++ b/src/models/jina-bert-v3.cpp @@ -3,7 +3,7 @@ void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_558M; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index c13f71b5bcb..367f6990d1f 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -14,7 +14,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } @@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, 
false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 3898b56bb12..97da8a6abb8 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -5,10 +5,13 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - hparams.n_layer_dense_lead = hparams.n_layer; + + hparams.n_layer_dense_lead = hparams.n_layer(); + switch (hparams.n_ff()) { case 4608: type = LLM_TYPE_350M; break; case 6912: type = LLM_TYPE_700M; break; @@ -16,9 +19,10 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { case 10752: type = LLM_TYPE_2_6B; break; default: type = LLM_TYPE_UNKNOWN; } + if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_swa_impl[il] = !hparams.is_recr_impl[il]; } } diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp index 81ced2eaba2..490f5c223eb 100644 --- a/src/models/lfm2moe.cpp +++ b/src/models/lfm2moe.cpp @@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_recr_impl[il] = 
hparams.n_head_kv(il) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_8B_A1B; break; case 40: type = LLM_TYPE_24B_A2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index 9722dde9f17..2ae89386447 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -2,11 +2,12 @@ void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention hparams.causal_attn = false; - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/llada.cpp b/src/models/llada.cpp index 58b2c466e17..87d4259f9a7 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -2,14 +2,16 @@ void llama_model_llada::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 642b3b1608a..7def112827a 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 8) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8x7B; break; case 56: type = LLM_TYPE_8x22B; break; default: type = LLM_TYPE_UNKNOWN; } } else { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 
16: type = LLM_TYPE_1B; break; // Llama 3.2 1B case 22: type = LLM_TYPE_1B; break; case 26: type = LLM_TYPE_3B; break; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 8f39b3f59a5..7194c72a585 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -8,7 +8,7 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) { const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); if (found_swa && hparams.n_swa == 0) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope + hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope } else { hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED; hparams.n_swa = 8192; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 84cfe399027..ae56a26a1f6 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -2,7 +2,8 @@ void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index 887a1fa509a..0d94e98281c 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp index 3277ca53ec4..c5951cf0f7f 100644 --- a/src/models/mamba2.cpp +++ b/src/models/mamba2.cpp @@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch 
(hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mellum.cpp b/src/models/mellum.cpp index 1e1e97e9fa0..28823018bc0 100644 --- a/src/models/mellum.cpp +++ b/src/models/mellum.cpp @@ -13,7 +13,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) { if (res) { hparams.set_swa_pattern(swa_period); } else { - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); } hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; @@ -24,7 +24,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_12B_A2_5B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index 1bcdf696f2e..88989160570 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -9,18 +9,17 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); float value_scale = 0.0f; if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) { hparams.f_attn_value_scale = value_scale; } - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, 
hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_310B_A15B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -35,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const uint32_t n_nextn = hparams.nextn_predict_layers; - - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { auto & layer = layers[i]; uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); uint32_t n_head = hparams.n_head(i); // NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support - const bool is_nextn = (n_nextn > 0) && (static_cast(i) >= n_layer - n_nextn); + const bool is_nextn = i >= n_layer; const int skip = is_nextn ? 
TENSOR_SKIP : 0; create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip); @@ -93,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param const float v_scale = hparams.f_attn_value_scale; - // The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; uint32_t n_head_l = hparams.n_head(il); @@ -174,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param } } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp index 966d3af615c..fc3e5b171d5 100644 --- a/src/models/minicpm.cpp +++ b/src/models/minicpm.cpp @@ -3,7 +3,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // Backward-compatible defaults for older MiniCPM GGUFs hparams.f_embedding_scale = 12.0f; - hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer)); + hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer())); hparams.f_logit_scale = hparams.n_embd ? 
(256.0f / float(hparams.n_embd)) : 1.0f; ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // MiniCPM uses rope by default, unlike Granite which uses it as a switch hparams.rope_finetuned = true; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_1B; break; case 40: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index 1ffc54fa7c6..e011b1ff0a8 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 22e291d73a3..b25435e4d97 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_230B_A10B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 1ac5a95ccdc..9a8e3f9a50b 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) { } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/models.h b/src/models/models.h index 
03b770ab38c..4afcd918889 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -840,6 +840,19 @@ struct llama_model_gemma4 : public llama_model_base { }; +struct llama_model_gemma4_assistant : public llama_model_base { + llama_model_gemma4_assistant(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_gemma_embedding : public llama_model_base { llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index 5ab51867cc0..f3e9407e012 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -22,7 +22,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) { hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU); } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_47M; break; // granite-embedding-small case 22: diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index 0229d20ed36..d094fd9f80b 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_30B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index d2c811d2497..a456269347b 
100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -9,7 +9,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { // A layer is recurrent IFF the n_head_kv value is set to 0 and // the n_ff value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); } @@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B case 56: type = LLM_TYPE_9B; break; case 88: type = LLM_TYPE_120B_A12B; break; diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index 5d4a3b5c69e..6e2bd9a33ca 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -2,7 +2,8 @@ void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp index f00d6eddfc9..4a08d7abd40 100644 --- a/src/models/neo-bert.cpp +++ b/src/models/neo-bert.cpp @@ -3,7 +3,7 @@ void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 28) { + if (hparams.n_layer() == 28) { type = LLM_TYPE_250M; } } diff --git a/src/models/nomic-bert-moe.cpp b/src/models/nomic-bert-moe.cpp index a17abe2c269..da4b62919bb 100644 --- a/src/models/nomic-bert-moe.cpp +++ b/src/models/nomic-bert-moe.cpp @@ -4,7 +4,7 @@ void 
llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/nomic-bert.cpp b/src/models/nomic-bert.cpp index 5a8a5584457..e7fc72286a6 100644 --- a/src/models/nomic-bert.cpp +++ b/src/models/nomic-bert.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index cfcf17bcb03..9f7a2ba60ef 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 22: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 7cc262f5504..cb52cdef720 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = 
LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 7976ae44a51..1e2baeb207f 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -2,7 +2,8 @@ void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 15b6c8c1205..3ab15d61f08 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_20B; break; case 36: type = LLM_TYPE_120B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index 9f76350fd4d..13120bd3236 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -3,12 +3,12 @@ void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { - case 16: type = LLM_TYPE_270M; break; - case 20: type = LLM_TYPE_450M; break; - case 28: type = LLM_TYPE_1B; break; - case 36: type = LLM_TYPE_3B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 16: type = LLM_TYPE_270M; break; + case 20: type = LLM_TYPE_450M; break; + case 28: type = LLM_TYPE_1B; break; + case 36: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; } } diff --git a/src/models/orion.cpp b/src/models/orion.cpp index bcb4bbba4b1..863a2822269 100644 --- 
a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -3,7 +3,7 @@ void llama_model_orion::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_14B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 7593f879b24..90f05c088c1 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -2,7 +2,8 @@ void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1 case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1 default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index 8f3ed5f7b7d..81b1ad12cc0 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -3,7 +3,7 @@ void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index f8a4a4d5aa5..716ff814cc1 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -3,7 +3,7 @@ void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/phimoe.cpp b/src/models/phimoe.cpp index 4575d6139cf..c332553bc7d 100644 --- a/src/models/phimoe.cpp +++ b/src/models/phimoe.cpp @@ -3,7 +3,7 @@ void 
llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_16x3_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index c7ed1211c31..246144519e4 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -3,7 +3,7 @@ void llama_model_plamo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 2ffa0898f71..b93cf48bc5c 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -11,11 +11,11 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: if (hparams.n_embd == 2048) { diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 29f3e803d68..16d0b1dcef7 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -13,7 +13,7 @@ void llama_model_plamo3::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plm.cpp b/src/models/plm.cpp index ce050919e6a..8ca325f5e2c 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -3,7 +3,8 @@ void llama_model_plm::load_arch_hparams(llama_model_loader & 
ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index 00467dbad7d..1f5dff3843c 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -3,7 +3,7 @@ void llama_model_qwen::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index a5147460bae..e9c2ea80a6b 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -2,7 +2,8 @@ void llama_model_qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; case 28: type = hparams.n_embd == 1536 ? 
LLM_TYPE_1_5B : LLM_TYPE_7B; break; case 32: type = LLM_TYPE_7B; break; diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7cb03859deb..e831ed11aad 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_A2_7B; break; case 28: type = LLM_TYPE_57B_A14B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 41b97fed956..1d0d2fab362 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -2,7 +2,8 @@ void llama_model_qwen3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 348650b3796..4b642cff467 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -13,22 +13,20 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. - if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break; case 32: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_9B; break; case 64: type = LLM_TYPE_27B; break; @@ -39,9 +37,7 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -122,10 +118,10 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -159,8 +155,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. 
- const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -177,7 +172,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -490,15 +485,15 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35 MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35 MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); // hparams.n_layer includes both main model layers and MTP layers. The MTP // layer is stored immediately after the main layers in model.layers[]. 
- const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 7d906191cbb..eb5e9a406a1 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -16,22 +16,20 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. 
- if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B_A3B; break; case 48: type = LLM_TYPE_122B_A10B; break; case 60: type = LLM_TYPE_397B_A17B; break; @@ -42,9 +40,7 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -145,10 +141,10 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -182,8 +178,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. - const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -200,7 +195,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -555,13 +550,13 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE 
MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index a4f8e1379c9..317e668bec7 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -1,10 +1,10 @@ #include "models.h" void llama_model_qwen3moe::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 9e09ae6f232..97200a44072 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -14,15 +14,15 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // Mark recurrent layers (linear attention layers) - if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - 
hparams.is_recr_impl[i] = ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_80B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 5defd893944..724d6140d19 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -4,7 +4,8 @@ void llama_model_qwen3vl::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false); ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; case 64: type = LLM_TYPE_32B; break; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index 5b77df57122..7c41592f772 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen3vlmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index bf3949a9092..a46c358fa68 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -2,7 +2,8 @@ void llama_model_refact::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index ca8e009615e..fc276ce591b 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -2,12 +2,13 @@ void llama_model_rnd1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index ba2a9dfa0db..0b5013dc758 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 566b8cdcb54..6c7db514435 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index 7574b252621..67c51f5b59c 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -10,7 +10,7 @@ void 
llama_model_rwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 806cba574be..57de881a091 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -2,7 +2,8 @@ void llama_model_seed_oss::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_36B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 4231cccc666..a8e3d957f1f 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -15,14 +15,14 @@ void llama_model_smallthinker::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; + hparams.n_no_rope_layer_step = hparams.n_layer(); } ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; case 52: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 90e7d473eaf..c67d967b204 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -4,7 +4,7 @@ void llama_model_smollm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); hparams.n_no_rope_layer_step = 4; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 4da7f7aefcf..bf6087b8796 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -3,7 +3,7 @@ void llama_model_stablelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_12B; break; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index e131af058bc..f73a88fd4e9 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 36: type = LLM_TYPE_3B; break; case 42: type = LLM_TYPE_7B; break; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 9c207c02885..b81b469374a 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_3B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_15B; break; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index cf9942b200f..e2218c58704 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -23,16 +23,16 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, 
hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false); // NextN/MTP (Step3p5): extra decoder block appended beyond the main stack. - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 45: type = LLM_TYPE_196B_A11B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -41,15 +41,12 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP // tensors live in a separate file (e.g. user split target/draft). Mark // MTP tensors NOT_REQUIRED so the trunk loads cleanly. - const std::string mtp_probe = "blk." 
+ std::to_string(n_main) + ".nextn.eh_proj.weight"; - const bool trunk_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight(mtp_probe.c_str()) == nullptr); + const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight"; + const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; const int mtp_flags = trunk_only ? TENSOR_NOT_REQUIRED : 0; @@ -176,7 +173,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } // Only the first MTP block (i == n_main) is required at runtime — the @@ -184,8 +181,8 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with // all MTP layers still works) but tolerated when absent via the pruning // path. See scripts/prune_step35_extra_mtp.py for the pruner. - for (int i = (int) n_main; i < n_layer; ++i) { - load_block_mtp(i, /*is_first_mtp=*/ i == (int) n_main); + for (int i = n_layer; i < n_layer_all; ++i) { + load_block_mtp(i, /*is_first_mtp=*/ i == n_layer); } } @@ -206,8 +203,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. 
- const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; const uint32_t n_head_l = hparams.n_head(il); @@ -294,7 +290,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "attn_proj", il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -374,7 +370,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para // LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE) llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0"); // Single-block MTP only: always run the first trained MTP block (Qwen // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to @@ -382,7 +378,7 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just // block 0) also work — see load_arch_tensors below and // scripts/prune_step35_extra_mtp.py. 
- const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 73e32741406..b0e3f062572 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -9,10 +9,10 @@ void llama_model_t5::load_arch_hparams(llama_model_loader & ml) { hparams.dec_start_token_id = dec_start_token_id; } - hparams.dec_n_layer = hparams.n_layer; + hparams.dec_n_layer = hparams.n_layer(); ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 6: type = LLM_TYPE_60M; break; // t5-small case 8: type = LLM_TYPE_80M; break; // flan-t5-small case 12: diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp index 1258eeb19b6..393e8f65bf4 100644 --- a/src/models/talkie.cpp +++ b/src/models/talkie.cpp @@ -4,7 +4,7 @@ void llama_model_talkie::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index d6d1c7a2e5d..3135001293a 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -2,7 +2,8 @@ void llama_model_xverse::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; case 80: type = LLM_TYPE_65B; break; diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 30ea2c07213..c1be9eb5a99 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -684,6 +684,20 @@ static common_chat_tool 
config_tool{ })", }; +static common_chat_tool calendar_create_event_tool{ + /* .name = */ "Calendar.create_event", + /* .description = */ "Create a calendar event", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "title": { "type": "string" }, + "participants": { "type": "array", "items": { "type": "string" } }, + "metadata": { "type": "object" } + }, + "required": ["title", "participants", "metadata"] + })", +}; + static common_chat_tool imaginary_number_tool{ /* .name = */ "imaginary_number", /* .description = */ "Imaginary number converter", @@ -1811,6 +1825,104 @@ static void test_convert_responses_to_chatcmpl() { } } +// Shared LFM2 parser cases - all variants use one output format and parser +static void test_lfm2_parser(const std::string & template_path, bool detailed_debug) { + auto tst = peg_tester(template_path, detailed_debug); + + // Basic content only + tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); + + // Single tool call without reasoning + tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") + .tools({ special_function_tool }) + .expect(message_assist_call) + .run(); + + // Tool call with string argument + tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>") + .tools({ get_time_tool }) + .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) + .run(); + + // Python literals become JSON + tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>") + .tools({ toggle_tool }) + .expect(message_with_tool_calls("toggle", R"({"enabled": true})")) + .run(); + + tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>") + .tools({ nullable_tool }) + .expect(message_with_tool_calls("set_nullable", R"({"value": null})")) + .run(); + + // Nested Python literal + tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>") + .tools({ config_tool }) + 
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})")) + .run(); + + // JSON literals are accepted too + tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>") + .tools({ config_tool }) + .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})")) + .run(); + + // Dotted function name with structured args + tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], " + "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>") + .tools({ calendar_create_event_tool }) + .expect(message_with_tool_calls( + "Calendar.create_event", + R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})")) + .run(); + + // Markdown links stay content + tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).") + .tools({ get_time_tool }) + .expect(simple_assist_msg("Use this format: [link text](url). 
Example: [Wikipedia](https://www.wikipedia.org).")) + .run(); + + // Python tool with multiline code in string + tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>") + .tools({ python_tool }) + .expect_tool_calls({ + { "python", R"#({"code": "def hello():\\n print('hey')"})#", "" } + }) + .run(); + + // Content before tool call (no reasoning) + tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>") + .tools({ get_time_tool }) + .expect(message_with_reasoning_content_and_multiple_tool_calls( + "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } } + )) + .run(); + + // Multiple tool calls (parallel) + tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>") + .parallel_tool_calls(true) + .tools({ special_function_tool, special_function_tool_with_optional_param }) + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", {} }, + { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, + }) + .run(); + + // Partial tool call (streaming) + tst.test("<|tool_call_start|>[special_function(arg1=") + .tools({ special_function_tool }) + .is_partial(true) + .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) + .run(); + + // Tool call with empty arguments + tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>") + .tools({ empty_args_tool }) + .expect(simple_assist_msg("", "", "empty_args", "{}")) + .run(); + +} + static void test_template_output_peg_parsers(bool detailed_debug) { LOG_DBG("%s\n", __func__); @@ -4024,49 +4136,30 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); } - // LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|> - { - auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug); - - // Basic content only - tst.test("Hello, world!\nWhat's 
up?").expect(message_assist).run(); + for (const char * tmpl : { + "models/templates/LFM2-8B-A1B.jinja", + "models/templates/LFM2.5-Instruct.jinja", + "models/templates/LFM2.5-8B-A1B.jinja", + }) { + test_lfm2_parser(tmpl, detailed_debug); + } - // Single tool call without reasoning - tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .tools({ special_function_tool }) - .expect(message_assist_call) - .run(); + // Thinking cases only apply to LFM2.5-8B-A1B, the one LFM2 template that emits + { + auto tst = peg_tester("models/templates/LFM2.5-8B-A1B.jinja", detailed_debug); - // Tool call with string argument - tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>") - .tools({ get_time_tool }) - .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) - .run(); + // Reasoning is parsed independent of enable_thinking - // Tool call with reasoning (enable_thinking=true) + // Tool call with reasoning tst.test("I'm\nthinking<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) .expect(message_assist_call_thoughts) .run(); - // Multiple tool calls (parallel) - tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>") - .parallel_tool_calls(true) - .tools({ - special_function_tool, special_function_tool_with_optional_param - }) - .expect_tool_calls({ - { "special_function", R"({"arg1": 1})", {} }, - { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, - }) - .run(); - // Tool call with reasoning and content tst.test("I need to call a function" "Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>") - .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ get_time_tool }) .expect(message_with_reasoning_content_and_multiple_tool_calls( @@ -4074,32 +4167,9 @@ static void 
test_template_output_peg_parsers(bool detailed_debug) { )) .run(); - // Python tool with multiline code in string - tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>") - .tools({ python_tool }) - .expect_tool_calls({ - { "python", R"#({"code": "def hello():\\n print('hey')"})#", "" } - }) - .run(); - - // Partial tool call (streaming) - tst.test("<|tool_call_start|>[special_function(arg1=") - .tools({ special_function_tool }) - .is_partial(true) - .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) - .run(); - - // Tool call with empty arguments - tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>") - .tools({ empty_args_tool }) - .expect(simple_assist_msg("", "", "empty_args", "{}")) - .run(); - - // fake tool call marker in reasoning - tst.test( - "Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm" - "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .enable_thinking(true) + // Fake tool call marker inside reasoning is not parsed as a call + tst.test("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm" + "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) .expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm") @@ -4108,89 +4178,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); - // Continuation tests - tst.test("world!\nWhat's up?") - .reasoning_format(COMMON_REASONING_FORMAT_AUTO) - .enable_thinking(true) - .messages({ message_user, message_assist_prefill_content }) - .add_generation_prompt(false) - .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) - .expect_reasoning("I'm thinking") - .expect_content("Hello, world!\nWhat's up?") - .run(); - - tst.test(" thinkingHello, world!\nWhat's up?") + // enable_thinking=false 
still captures emitted reasoning + tst.test("I'm\nthinkingHello, world!\nWhat's up?") + .enable_thinking(false) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) - .enable_thinking(true) - .messages({ message_user, message_assist_prefill_reasoning }) - .add_generation_prompt(false) - .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) - .expect_reasoning("I'm thinking") - .expect_content("Hello, world!\nWhat's up?") - .run(); - } - - // LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens - { - auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug); - - // Basic content only - tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); - - // Single tool call without reasoning - tst.test("[special_function(arg1=1)]") - .tools({ special_function_tool }) - .expect(message_assist_call) - .run(); - - // Tool call with string argument - tst.test("[get_time(city=\"XYZCITY\")]") - .tools({ get_time_tool }) - .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) + .expect(message_assist_thoughts) .run(); - // Tool call with reasoning (enable_thinking=true) - tst.test("I'm\nthinking[special_function(arg1=1)]") - .enable_thinking(true) + tst.test("I'm\nthinking<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") + .enable_thinking(false) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) .expect(message_assist_call_thoughts) .run(); - // Multiple tool calls (parallel) - tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]") - .parallel_tool_calls(true) - .tools({ - special_function_tool, special_function_tool_with_optional_param - }) - .expect_tool_calls({ - { "special_function", R"({"arg1": 1})", {} }, - { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, - }) - .run(); - - // Tool call with content before tool call - tst.test("Let me check the time.[get_time(city=\"Paris\")]") - .tools({ 
get_time_tool }) - .expect(message_with_reasoning_content_and_multiple_tool_calls( - "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } } - )) - .run(); - - // Partial tool call (streaming) - tst.test("[special_function(arg1=") - .tools({ special_function_tool }) - .is_partial(true) - .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) - .run(); - - // Tool call with empty arguments - tst.test("[empty_args()]") - .tools({ empty_args_tool }) - .expect(simple_assist_msg("", "", "empty_args", "{}")) - .run(); - - // Continuation tests + // Continuation: prefill content tst.test("world!\nWhat's up?") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .enable_thinking(true) @@ -4201,6 +4203,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_content("Hello, world!\nWhat's up?") .run(); + // Continuation: prefill reasoning tst.test(" thinkingHello, world!\nWhat's up?") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .enable_thinking(true) @@ -5426,18 +5429,25 @@ static void test_template_generation_prompt() { check(tmpls, continuation_reasoning(), "<|im_assistant|>assistant<|im_middle|>I'm"); } - { - auto tmpls = read_templates("models/templates/LFM2-8B-A1B.jinja"); + for (const char * tmpl : { + "models/templates/LFM2-8B-A1B.jinja", + "models/templates/LFM2.5-Instruct.jinja", + "models/templates/LFM2.5-8B-A1B.jinja", + }) { + auto tmpls = read_templates(tmpl); check(tmpls, basic(), "<|im_start|>assistant\n"); check(tmpls, continuation_content(), "<|im_start|>assistant\nI'm thinkingHello, "); check(tmpls, continuation_reasoning(), "<|im_start|>assistant\nI'm"); } { - auto tmpls = read_templates("models/templates/LFM2.5-Instruct.jinja"); - check(tmpls, basic(), "<|im_start|>assistant\n"); - check(tmpls, continuation_content(), "<|im_start|>assistant\nI'm thinkingHello, "); - check(tmpls, continuation_reasoning(), "<|im_start|>assistant\nI'm"); + // 8B-A1B renders prior-turn reasoning via the "thinking" field 
+ auto tmpls = read_templates("models/templates/LFM2.5-8B-A1B.jinja"); + common_chat_templates_inputs inputs; + inputs.messages = { message_user, message_assist_call_thoughts, tool_msg }; + inputs.add_generation_prompt = true; + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_contains(params.prompt, "I'm\nthinking"); } { diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 90004e37906..22e6dcc2e10 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -392,7 +392,7 @@ static bool arch_supported(const llm_arch arch) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { return false; // FIXME CUDA backend crashes. } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { return false; // FIXME @ngxson } if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) { @@ -450,7 +450,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } for (bool moe : {false, true}) { @@ -553,7 +553,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index af40adbb4ce..e830f262de2 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -397,6 +397,8 @@ int llama_cli(int argc, char ** argv) { return 1; } + ctx_cli.defaults.sampling = params.sampling; + console::spinner::stop(); console::log("\n"); diff --git 
a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 6d2dcb56b2f..6747558fc54 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -33,12 +33,8 @@ #endif static llama_context ** g_ctx; -static llama_model ** g_model; static common_sampler ** g_smpl; static common_params * g_params; -static std::vector * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; @@ -136,7 +132,6 @@ int llama_completion(int argc, char ** argv) { llama_context * ctx = nullptr; common_sampler * smpl = nullptr; - g_model = &model; g_ctx = &ctx; g_smpl = &smpl; @@ -549,9 +544,9 @@ int llama_completion(int argc, char ** argv) { int n_consumed = 0; int n_session_consumed = 0; - std::vector input_tokens; g_input_tokens = &input_tokens; - std::vector output_tokens; g_output_tokens = &output_tokens; - std::ostringstream output_ss; g_output_ss = &output_ss; + std::vector input_tokens; + std::vector output_tokens; + std::ostringstream output_ss; std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode // the first thing we will do is to output the prompt, so set color accordingly @@ -989,7 +984,7 @@ int llama_completion(int argc, char ** argv) { LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG_INF("saved final session to %s, n_tokens = %ld\n", path_session.data(), session_tokens.size()); + LOG_INF("saved final session to %s, n_tokens = %zu\n", path_session.data(), session_tokens.size()); } diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 3f7f3a11dfa..3431a4eca84 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -1,5 +1,6 @@ #include 
"arg.h" #include "common.h" +#include "imatrix-loader.h" #include "log.h" #include "llama.h" #include "gguf.h" @@ -34,10 +35,6 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; -static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; -static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; - struct Stats { std::vector values; std::vector counts; @@ -65,7 +62,6 @@ class IMatrixCollector { bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix_legacy(int32_t ncall = -1) const; void save_imatrix(int32_t n_chunk = -1) const; - bool load_imatrix_legacy(const char * fname); bool load_imatrix(const char * file_name); const std::unordered_map & get_mstats() const { return m_stats; } private: @@ -624,204 +620,63 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const { ggml_free(ctx); } -bool IMatrixCollector::load_imatrix_legacy(const char * fname) { - std::ifstream in(fname, std::ios::binary); - if (!in) { - LOG_ERR("%s: failed to open %s\n", __func__, fname); - return false; - } - int n_entries; - in.read((char *) &n_entries, sizeof(n_entries)); - if (in.fail() || n_entries < 1) { - LOG_ERR("%s: no data in file %s\n", __func__, fname); +bool IMatrixCollector::load_imatrix(const char * file_name) { + common_imatrix loaded; + if (!common_imatrix_load(file_name, loaded)) { return false; } - // Guess the chunk size because it's not stored in the file - const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; - - for (int i = 0; i < n_entries; ++i) { - int32_t len = 0; - in.read((char *) &len, sizeof(len)); - std::vector name_as_vec(len + 1); - in.read((char *) name_as_vec.data(), len); - if (in.fail()) { - LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname); - return false; - } - name_as_vec[len] = 0; - std::string name{ name_as_vec.data() }; - auto & e = 
m_stats[std::move(name)]; - int32_t ncall = 0; - in.read((char *) &ncall, sizeof(ncall)); - int32_t nval = 0; - in.read((char *) &nval, sizeof(nval)); - if (in.fail() || nval < 1) { - LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); - m_stats = {}; - return false; - } - if (e.values.empty()) { - e.values.resize(nval, 0.0f); - e.counts.resize(1, 0); - } - - std::vector tmp(nval); - in.read((char *) tmp.data(), nval * sizeof(float)); - if (in.fail()) { - LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); - m_stats = {}; - return false; - } + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + const bool is_legacy = loaded.is_legacy; - // Recreate the state as expected by save_imatrix(), and correct for weighted sum. - for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i] * chunk_size; - } - // The legacy format doesn't distinguish the counts for different experts - for (size_t j = 0; j < e.counts.size(); ++j) { - e.counts[j] += ncall * chunk_size; - } - } + for (auto & [name, entry] : loaded.entries) { + auto & e = m_stats[name]; - { - // TODO: extract into its own method; this is also used by the GGUF-based format - // Calculate the last chunk count - int64_t max_count = 0; - for (const auto & stats : m_stats) { - for (int64_t count : stats.second.counts) { - if (count > max_count) { - max_count = count; - } + if (is_legacy) { + // Legacy format: sums contain (raw_sum/raw_count)*ncall, counts contain {ncall} + // Reconstruct raw form by multiplying by chunk_size + if (e.values.empty()) { + e.values.resize(entry.sums.size(), 0.0f); + e.counts.resize(1, 0); } - } - m_last_chunk = max_count / (chunk_size); - } - - { - // Read the number of calls the matrix was computed with - int32_t n_calls; - in.read((char *) &n_calls, sizeof(n_calls)); - // ignore it because it's not important - } - - // Read the dataset path to include it when writing to GGUF - if (!in.fail()){ - int32_t len = 0; - in.read((char *) &len, 
sizeof(len)); - if (!in.fail()) { - std::vector dataset; - dataset.resize(len + 1, 0); - in.read(dataset.data(), len); - if (!in.fail()) { - m_datasets.push_back(dataset.data()); + for (size_t j = 0; j < entry.sums.size(); ++j) { + e.values[j] += entry.sums[j] * chunk_size; + } + for (size_t j = 0; j < e.counts.size(); ++j) { + e.counts[j] += entry.counts[0] * chunk_size; } - } - } - - return true; -} - -// Using GGUF as the file format, for greater extensibility -bool IMatrixCollector::load_imatrix(const char * file_name) { - struct ggml_context * ctx = nullptr; - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ false, // the data is needed - /* .ctx = */ &ctx, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params); - if (!ctx_gguf) { - return this->load_imatrix_legacy(file_name); - } - const int32_t n_entries = gguf_get_n_tensors(ctx_gguf); - if (n_entries < 1) { - LOG_ERR("%s: no data in file %s\n", __func__, file_name); - gguf_free(ctx_gguf); - ggml_free(ctx); - return false; - } - - const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); - if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) { - const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key); - m_datasets.reserve(m_datasets.size() + n); - for (int64_t i = 0; i < n; ++i) { - m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i)); - } - } - - const std::string in_sum2_suffix{ ".in_sum2" }; - const std::string counts_suffix{ ".counts" }; - - // Could re-use m_stats instead, but this allows - // checking for completeness of *each* loaded imatrix file - // and also makes it easier to re-use a similar implementation in quantize.cpp - // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; - - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string name = cur->name; - - if (name.empty()) { continue; } - - if (string_remove_suffix(name, in_sum2_suffix)) { - // in_sum2 - sums_counts_for[std::move(name)].first = cur; - } else if (string_remove_suffix(name, counts_suffix)) { - // counts - sums_counts_for[std::move(name)].second = cur; } else { - // ignore other tensors - } - } - - for (const auto & sc : sums_counts_for) { - const std::string & name = sc.first; - const struct ggml_tensor * in_sum2 = sc.second.first; - const struct ggml_tensor * counts = sc.second.second; - - if (!in_sum2 || !counts) { - LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str()); - gguf_free(ctx_gguf); - ggml_free(ctx); - return false; - } - - auto & e = m_stats[name]; - - int64_t nval = ggml_nelements(in_sum2); - if (e.values.empty()) { - e.values.resize(nval, 0.0f); - } else if ((size_t) nval != e.values.size()) { - LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size()); - gguf_free(ctx_gguf); - ggml_free(ctx); - return false; - } + // GGUF format: raw sums and counts, accumulate directly + const int64_t nval = entry.sums.size(); + const int64_t ncounts = entry.counts.size(); + + if (e.values.empty()) { + e.values.resize(nval, 0.0f); + } else if ((size_t) nval != e.values.size()) { + LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size()); + return false; + } - int64_t ncounts = ggml_nelements(counts); - if (e.counts.empty()) { - e.counts.resize(ncounts, 0); - } else if (e.counts.size() == 1 && ncounts > 1) { - // broadcast, when loading an old imatrix - e.counts.resize(ncounts, e.counts[0]); - } else if ((size_t) ncounts != e.counts.size()) { - LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, 
e.counts.size()); - gguf_free(ctx_gguf); - ggml_free(ctx); - return false; - } + if (e.counts.empty()) { + e.counts.resize(ncounts, 0); + } else if (e.counts.size() == 1 && ncounts > 1) { + e.counts.resize(ncounts, e.counts[0]); + } else if ((size_t) ncounts != e.counts.size()) { + LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size()); + return false; + } - // Recreate the state as expected by save_imatrix() - for (int64_t j = 0; j < nval; j++) { - e.values[j] += ((const float *) in_sum2->data)[j]; - } - for (int64_t j = 0; j < ncounts; j++) { - e.counts[j] += std::lround(((const float *) counts->data)[j]); + for (int64_t j = 0; j < nval; ++j) { + e.values[j] += entry.sums[j]; + } + for (int64_t j = 0; j < ncounts; ++j) { + e.counts[j] += entry.counts[j]; + } } } - // TODO: extract into its own method; this is also used by the legacy format + m_datasets.insert(m_datasets.end(), loaded.datasets.begin(), loaded.datasets.end()); + // Calculate the last chunk count int64_t max_count = 0; for (const auto & stats : m_stats) { @@ -831,10 +686,8 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { } } } - m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel); + m_last_chunk = max_count / chunk_size; - gguf_free(ctx_gguf); - ggml_free(ctx); return true; } @@ -1218,6 +1071,9 @@ int main(int argc, char ** argv) { return 1; } + // set_params before show_statistics so load_imatrix has valid n_ctx/n_parallel + g_collector.set_params(params); + if (params.show_statistics) { if (!show_statistics(params)) { return 1; diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 93f005652b7..20c53178634 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(mtmd models/gemma4uv.cpp models/glm4v.cpp models/granite-speech.cpp + models/granite4-vision.cpp models/hunyuanvl.cpp models/internvl.cpp models/kimivl.cpp diff --git 
a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 1d9f6a136a9..7d10586217b 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -37,6 +37,9 @@ struct clip_graph { float kq_scale; // TODO: maybe move this to hparams const clip_flash_attn_type flash_attn_type; + // TODO [QWEN_VIDEO]: improve this in the future + int n_batch = 1; + ggml_context_ptr ctx0_ptr; ggml_context * ctx0; ggml_cgraph * gf; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c055cfb7541..b104f373618 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -4,6 +4,7 @@ #include "gguf.h" #include "clip.h" +#include #include #include #include @@ -35,20 +36,22 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" // vision-specific -#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" -#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" -#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" -#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" -#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_FEATURE_LAYER "clip.vision.feature_layer" -#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" -#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" -#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" +#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" +#define KEY_PREPROC_MAX_TILES 
"clip.vision.preproc_max_tiles" +#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side" +#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side" +#define KEY_PROJ_SPATIAL_OFFSETS "clip.vision.projector.spatial_offsets" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -72,7 +75,6 @@ #define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate" #define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count" - // // tensor name constants // @@ -210,22 +212,28 @@ #define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s" #define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb" // qformer projector -#define TN_QF_PROJ_QUERY "a.proj_query" -#define TN_QF_PROJ_NORM "a.proj_norm.%s" -#define TN_QF_PROJ_LINEAR "a.proj_linear.%s" -#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s" -#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s" -#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s" -#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s" -#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s" -#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s" -#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s" -#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s" -#define TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s" -#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s" -#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s" -#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s" -#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s" 
+#define TN_QF_PROJ_QUERY "%s.proj_query" +#define TN_QF_PROJ_NORM "%s.proj_norm.%s" +#define TN_QF_PROJ_LINEAR "%s.proj_linear.%s" +#define TN_QF_SELF_ATTN_Q "%s.proj_blk.%d.self_attn_q.%s" +#define TN_QF_SELF_ATTN_K "%s.proj_blk.%d.self_attn_k.%s" +#define TN_QF_SELF_ATTN_V "%s.proj_blk.%d.self_attn_v.%s" +#define TN_QF_SELF_ATTN_O "%s.proj_blk.%d.self_attn_out.%s" +#define TN_QF_SELF_ATTN_N "%s.proj_blk.%d.self_attn_norm.%s" +#define TN_QF_CROSS_ATTN_Q "%s.proj_blk.%d.cross_attn_q.%s" +#define TN_QF_CROSS_ATTN_K "%s.proj_blk.%d.cross_attn_k.%s" +#define TN_QF_CROSS_ATTN_V "%s.proj_blk.%d.cross_attn_v.%s" +#define TN_QF_CROSS_ATTN_O "%s.proj_blk.%d.cross_attn_out.%s" +#define TN_QF_CROSS_ATTN_N "%s.proj_blk.%d.cross_attn_norm.%s" +#define TN_QF_FFN_UP "%s.proj_blk.%d.ffn_up.%s" +#define TN_QF_FFN_DOWN "%s.proj_blk.%d.ffn_down.%s" +#define TN_QF_FFN_NORM "%s.proj_blk.%d.ffn_norm.%s" +// multi-projector qformer (bid => projector ID) +#define TN_MULTI_PROJ_IMG_POS "v.proj_blk.%d.img_pos" +#define TN_MULTI_PROJ_QUERY "%s.proj_blk.%d.query" +#define TN_MULTI_PROJ_LINEAR "%s.proj_blk.%d.linear.%s" +#define TN_MULTI_PROJ_NORM "%s.proj_blk.%d.norm.%s" +#define TN_MULTI_PROJ_POST_NORM "%s.proj_blk.%d.post_norm.%s" // gemma4 audio conformer #define TN_A_MM_INP_PROJ "mm.a.input_projection.%s" @@ -354,6 +362,7 @@ enum projector_type { PROJECTOR_TYPE_MINICPMV4_6, PROJECTOR_TYPE_GRANITE_SPEECH, PROJECTOR_TYPE_MIMOVL, + PROJECTOR_TYPE_GRANITE4_VISION, PROJECTOR_TYPE_UNKNOWN, }; @@ -407,6 +416,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, { PROJECTOR_TYPE_MIMOVL, "mimovl"}, + { PROJECTOR_TYPE_GRANITE4_VISION, "granite4_vision"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -420,24 +430,158 @@ static projector_type clip_projector_type_from_string(const std::string & str) { // RGB uint8 image struct clip_image_u8 { - int nx; - int ny; + 
clip_image_size get_size() const { + return { nx, ny }; + } + + void set_size(clip_image_size size, bool is_placeholder) { + nx = size.width; + ny = size.height; + if (is_placeholder) { + buf.clear(); + } else { + buf.resize((size_t) nx * (size_t) ny * 3); + } + } + + void cpy_buf(const std::vector & new_buf) { + buf = new_buf; + } + + const std::vector & get_ro_buf() const { + if (is_placeholder()) { + throw std::runtime_error("this clip_image_u8 is a placeholder"); + } + return buf; + } + + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation + + bool is_placeholder() const { + return buf.empty(); + } + std::array get_pixel(int x, int y) const { + if (is_placeholder()) { + // return a dummy value, so that legacy code can still process image without errors + return { 0, 0, 0 }; + } + int idx = (y * nx + x) * 3; + return { buf[idx], buf[idx + 1], buf[idx + 2] }; + } + + void set_pixel(int x, int y, const std::array & rgb) { + if (is_placeholder()) { + return; // no-op + } + int idx = (y * nx + x) * 3; + buf[idx] = rgb[0]; + buf[idx + 1] = rgb[1]; + buf[idx + 2] = rgb[2]; + } + + size_t n_elements() const { + return n_pixels() * 3; + } + + private: std::vector buf; + int nx = 0; + int ny = 0; + + size_t n_pixels() const { + return (size_t) nx * (size_t) ny; + } }; // For images, buf.size() == nx*ny*3 // Memory layout: RGBRGBRGB... +// For seq, buf.size() == nx*ny*3*nt +// Memory layout: RGBRGB...RGBRGB... 
(nt times) // For audio, only one channel is used, buf.size() == nx*ny // nx will be n_frames and ny will be n_mel struct clip_image_f32 { - int nx; - int ny; + // marks the global view in e.g., DeepSeek-OCR Models + bool add_viewsep = false; + // whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision) + bool add_newline = false; + + clip_image_size get_size() const { + return { nx_, ny_ }; + } + + int nx() const { return nx_; } + int ny() const { return ny_; } + + void set_size(clip_image_size size, bool is_placeholder, bool is_audio) { + nx_ = size.width; + ny_ = size.height; + if (is_placeholder) { + buf.clear(); + } else { + if (is_audio) { + buf.resize((size_t) nx_ * (size_t) ny_); + } else { + buf.resize((size_t) nx_ * (size_t) ny_ * 3); + } + } + } + + void cpy_buf(const std::vector & new_buf) { + buf = new_buf; + } + + void from_u8(const clip_image_u8 & img) { + auto size = img.get_size(); + nx_ = size.width; + ny_ = size.height; + if (img.is_placeholder()) { + buf.clear(); + return; // no-op + } + buf.resize(img.n_elements()); + const auto & u8_buf = img.get_ro_buf(); + for (size_t i = 0; i < img.n_elements(); ++i) { + buf[i] = (float) u8_buf[i] / 255.0f; + } + } + + size_t n_elements() const { + return n_pixels() * 3; + } + void normalize(const float mean[3], const float std[3]) { + if (is_placeholder()) { + return; // no-op + } + for (size_t i = 0; i < n_pixels(); ++i) { + buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0]; + buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1]; + buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2]; + } + } + + const std::vector & get_ro_buf() const { + if (is_placeholder()) { + throw std::runtime_error("this clip_image_f32 is a placeholder"); + } + return buf; + } + + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern + + bool is_placeholder() const { + return buf.empty(); + } + + private: std::vector buf; + int nx_ = 0; + int ny_ = 0; - // marks the 
global view in e.g., DeepSeek-OCR Models - bool add_viewsep = false; + size_t n_pixels() const { + return (size_t) nx_ * (size_t) ny_; + } }; // @@ -485,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, .. va_end(args); } +#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__) diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 238f805a9aa..48796b6306f 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -4,6 +4,7 @@ #include "clip.h" #include "clip-impl.h" +#include #include #include #include @@ -90,7 +91,7 @@ struct clip_hparams { float eps = 1e-6; float rope_theta = 0.0; - std::unordered_set vision_feature_layer; + std::vector vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) @@ -101,6 +102,11 @@ struct clip_hparams { int32_t sam_n_head = 0; int32_t sam_n_embd = 0; + // Granite4 Vision + std::vector proj_spatial_offsets; + int32_t downsample_query_side = 0; + int32_t downsample_window_side = 0; + // audio int32_t n_mel_bins = 0; // whisper preprocessor int32_t proj_stack_factor = 0; // ultravox @@ -158,6 +164,10 @@ struct clip_hparams { return false; } + + bool is_vision_feature_layer(int32_t layer) const { + return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end(); + } }; struct clip_layer { @@ -325,6 +335,20 @@ struct yasa2_stage { std::vector blocks; }; +// QFormer projector block for models with 1 (or more) QFormer projectors +// Granite Speech, Granite4 Vision +struct qf_block { + ggml_tensor * qf_proj_query = nullptr; + ggml_tensor * qf_proj_norm_w =
nullptr; + ggml_tensor * qf_proj_norm_b = nullptr; + ggml_tensor * qf_proj_linear_w = nullptr; + ggml_tensor * qf_proj_linear_b = nullptr; + ggml_tensor * qf_proj_post_norm_w = nullptr; + ggml_tensor * qf_proj_post_norm_b = nullptr; + ggml_tensor * qf_proj_img_pos = nullptr; // Vision only + std::vector qf_proj_layers; +}; + struct clip_model { clip_modality modality = CLIP_MODALITY_VISION; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -589,13 +613,8 @@ struct clip_model { ggml_tensor * ctc_out_b = nullptr; ggml_tensor * ctc_out_mid_w = nullptr; ggml_tensor * ctc_out_mid_b = nullptr; - // qformer projector - ggml_tensor * qf_proj_query = nullptr; - ggml_tensor * qf_proj_norm_w = nullptr; - ggml_tensor * qf_proj_norm_b = nullptr; - ggml_tensor * qf_proj_linear_w = nullptr; - ggml_tensor * qf_proj_linear_b = nullptr; - std::vector qf_proj_layers; + // qformer projector(s) + std::vector qf_proj_blocks; bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 80136ed8667..bd33f430625 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s } // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + const auto ppm_size = img.get_size(); + file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n"; // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { + const auto & ppm_buf = img.get_ro_buf(); + for (size_t i = 0; i < ppm_buf.size(); i += 3) { // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); + file.write(reinterpret_cast(&ppm_buf[i]), 3); } file.close(); @@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& return; } - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info 
header + pixel data + const auto bmp_size = img.get_size(); + int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; + int widthInBytes = bmp_size.width * bytesPerPixel; int paddingAmount = (4 - (widthInBytes % 4)) % 4; int stride = widthInBytes + paddingAmount; @@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& }; // Total file size - fileSize = 54 + (stride * img.ny); + fileSize = 54 + (stride * bmp_size.height); fileHeader[2] = (unsigned char)(fileSize); fileHeader[3] = (unsigned char)(fileSize >> 8); fileHeader[4] = (unsigned char)(fileSize >> 16); @@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& }; // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); + infoHeader[4] = (unsigned char)(bmp_size.width); + infoHeader[5] = (unsigned char)(bmp_size.width >> 8); + infoHeader[6] = (unsigned char)(bmp_size.width >> 16); + infoHeader[7] = (unsigned char)(bmp_size.width >> 24); + infoHeader[8] = (unsigned char)(bmp_size.height); + infoHeader[9] = (unsigned char)(bmp_size.height >> 8); + infoHeader[10] = (unsigned char)(bmp_size.height >> 16); + infoHeader[11] = (unsigned char)(bmp_size.height >> 24); // Write file headers file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); @@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& // Pixel data std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 
0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { + for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < bmp_size.width; ++x) { // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; + const auto px = img.get_pixel(x, y); unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] + px[2], // BMP stores pixels in BGR format + px[1], + px[0] }; file.write(reinterpret_cast(pixel), 3); } @@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& // debug function to convert f32 to u8 static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + dst.set_size(src.get_size(), false); + const auto & src_buf = src.get_ro_buf(); + std::vector dst_buf(src.n_elements()); + for (size_t i = 0; i < src.n_elements(); ++i) { + dst_buf[i] = static_cast(std::min(std::max(int(src_buf[i] * 255.0f), 0), 255)); } + dst.cpy_buf(dst_buf); } #endif @@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : proj_type(ctx->proj_type()), img(img), patch_size(hparams.patch_size), - n_patches_x(img.nx / patch_size), - n_patches_y(img.ny / patch_size), + n_patches_x(img.nx() / patch_size), + n_patches_y(img.ny() / patch_size), n_patches(n_patches_x * n_patches_y), n_embd(hparams.n_embd), n_head(hparams.n_head), @@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx 
/ patch_size; + const int height = img.ny() / patch_size; + const int width = img.nx() / patch_size; const uint32_t mode = interpolation_mode; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); @@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() { } ggml_tensor * clip_graph::build_inp_raw(int channels) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); return inp_raw; @@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale GGML_ASSERT(scale_factor > 1); const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; + int width = img.nx() / patch_size; + int height = img.ny() / patch_size; // pad width and height to factor const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; @@ -844,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { - GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); - const clip_image_f32 & img = *imgs.entries[0]; std::unique_ptr builder; @@ -997,10 +999,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } + // TODO [QWEN_VIDEO]: improve this in the future + builder->n_batch = imgs.entries.size(); + return builder->build(); } @@ -1234,12 +1243,7 @@ struct clip_model_loader { // to form the final visual features. 
// NOTE: gguf conversions should standardize the values of the vision feature layer to // be non-negative, since we use -1 to mark values as unset here. - std::vector vision_feature_layer; - get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false); - // convert std::vector to std::unordered_set - for (auto & layer : vision_feature_layer) { - hparams.vision_feature_layer.insert(layer); - } + get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false); // model-specific params switch (model.proj_type) { @@ -1627,6 +1631,23 @@ struct clip_model_loader { hparams.image_pad_color = {127, 127, 127}; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // SigLIP tower. + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_resize_pad = PAD_CEIL; + + get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer); + get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets); + if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) { + throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d", + hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size())); + } + + get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side); + get_u32(KEY_PROJ_SAMPLE_WINDOW_SIDE, hparams.downsample_window_side); + hparams.warmup_image_size = hparams.image_size; + } break; default: throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str())); } @@ -2628,46 +2649,105 @@ struct clip_model_loader { layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); } - model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY); - model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight")); - model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias")); - model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight")); 
- model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias")); + model.qf_proj_blocks.resize(1); + auto & qf = model.qf_proj_blocks[0]; + qf.qf_proj_query = get_tensor(string_format(TN_QF_PROJ_QUERY, prefix)); + qf.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "weight")); + qf.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "bias")); + qf.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "weight")); + qf.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "bias")); const int n_proj_layers = 2; - model.qf_proj_layers.resize(n_proj_layers); + qf.qf_proj_layers.resize(n_proj_layers); for (int il = 0; il < n_proj_layers; ++il) { - auto & pl = model.qf_proj_layers[il]; - - pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight")); - pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias")); - pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight")); - pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias")); - pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight")); - pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias")); - pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight")); - pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias")); - pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight")); - pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias")); - - pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight")); - pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias")); - pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight")); - pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias")); - pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight")); - pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias")); - pl.cross_attn_o_w = 
get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight")); - pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias")); - pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight")); - pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias")); - - pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight")); - pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, il, "bias")); - pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight")); - pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias")); - pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight")); - pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias")); + auto & pl = qf.qf_proj_layers[il]; + + pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "weight")); + pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "bias")); + pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "weight")); + pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "bias")); + pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "weight")); + pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "bias")); + pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "weight")); + pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "bias")); + pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "weight")); + pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "bias")); + + pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "weight")); + pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "bias")); + pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "weight")); + pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "bias")); + pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, 
il, "weight")); + pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "bias")); + pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "weight")); + pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "bias")); + pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "weight")); + pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "bias")); + + pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "weight")); + pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "bias")); + pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "weight")); + pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "bias")); + pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "weight")); + pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "bias")); + } + } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // image_newline lives at the top-level. 
+ model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + + // Load separate layerwise and spatial projector tensors + const auto projector_count = hparams.vision_feature_layer.size(); + model.qf_proj_blocks.resize(projector_count); + for (size_t bid = 0; bid < projector_count; ++bid) { + auto & b = model.qf_proj_blocks[bid]; + + // non-layerwise tensors + b.qf_proj_img_pos = get_tensor(string_format(TN_MULTI_PROJ_IMG_POS, bid)); + b.qf_proj_query = get_tensor(string_format(TN_MULTI_PROJ_QUERY, prefix, bid)); + b.qf_proj_linear_w = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "weight")); + b.qf_proj_linear_b = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "bias")); + b.qf_proj_norm_w = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "weight")); + b.qf_proj_norm_b = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "bias")); + b.qf_proj_post_norm_w = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "weight")); + b.qf_proj_post_norm_b = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "bias")); + + // laywerwise tensors + // NOTE: If any model uses multi-layer qformers, this will need to change + b.qf_proj_layers.resize(1); + auto & pl = b.qf_proj_layers[0]; + + pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "weight")); + pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "bias")); + pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "weight")); + pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "bias")); + pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "weight")); + pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "bias")); + pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "weight")); + pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "bias")); + pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "weight")); + pl.ln_1_b = 
get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "bias")); + + pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "weight")); + pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "bias")); + pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "weight")); + pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "bias")); + pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "weight")); + pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "bias")); + pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "weight")); + pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "bias")); + pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "weight")); + pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "bias")); + + pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "weight")); + pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "bias")); + pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "weight")); + pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "bias")); + pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "weight")); + pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "bias")); } + } break; default: GGML_ASSERT(false && "unknown projector type"); @@ -2730,13 +2810,12 @@ struct clip_model_loader { clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { - img->nx = hparams.warmup_image_size; - img->ny = hparams.warmup_image_size; - LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny); + const int sz = hparams.warmup_image_size; + img->set_size({sz, sz}, false, false); + LOG_INF("%s: warmup with image size = 
%d x %d\n", __func__, sz, sz); } else { - img->nx = hparams.warmup_audio_size; - img->ny = hparams.n_mel_bins; - LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx); + img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false); + LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size); } batch.entries.push_back(std::move(img)); warmup(ctx_clip, batch); @@ -3033,12 +3112,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() { return new clip_image_f32_batch(); } -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { - if (nx) *nx = img->nx; - if (ny) *ny = img->ny; - return img->buf.data(); -} - void clip_image_size_free(struct clip_image_size * load_image_size) { if (load_image_size == nullptr) { return; @@ -3059,7 +3132,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id LOG_ERR("%s: invalid index %d\n", __func__, idx); return 0; } - return batch->entries[idx]->nx; + return batch->entries[idx]->nx(); } size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { @@ -3067,7 +3140,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id LOG_ERR("%s: invalid index %d\n", __func__, idx); return 0; } - return batch->entries[idx]->ny; + return batch->entries[idx]->ny(); } clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { @@ -3078,17 +3151,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc return batch->entries[idx].get(); } -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { - img->nx = nx; - img->ny = ny; - img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), rgb_pixels, img->buf.size()); -} - -ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->model.image_newline; -} - void clip_free(clip_ctx * ctx) { if (ctx 
== nullptr) { return; @@ -3096,20 +3158,6 @@ void clip_free(clip_ctx * ctx) { delete ctx; } -// deprecated -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - const int32_t nx = ctx->model.hparams.image_size; - const int32_t ny = ctx->model.hparams.image_size; - return clip_embd_nbytes_by_img(ctx, nx, ny); -} - -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { - clip_image_f32 img; - img.nx = img_w; - img.ny = img_h; - return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->model.hparams.image_size; } @@ -3140,9 +3188,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: - return (img->nx / params.patch_size) / 2; + return (img->nx() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: - return img->nx / (params.patch_size * params.n_merge); + return img->nx() / (params.patch_size * params.n_merge); default: break; } @@ -3162,9 +3210,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: - return (img->ny / params.patch_size) / 2; + return (img->ny() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: - return img->ny / (params.patch_size * params.n_merge); + return img->ny() / (params.patch_size * params.n_merge); default: break; } @@ -3176,7 +3224,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // for models with fixed size image, the input image is already pre-processed and resized to square int patch_size = params.patch_size; - int n_patches = (img->nx / patch_size) * (img->ny / patch_size); + int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size); projector_type proj = ctx->proj_type(); @@ -3242,14 +3290,14 @@ int 
clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_YOUTUVL: { // dynamic size (2 conv, so double patch size) - int x_patch = img->nx / (params.patch_size * 2); - int y_patch = img->ny / (params.patch_size * 2); + int x_patch = img->nx() / (params.patch_size * 2); + int y_patch = img->ny() / (params.patch_size * 2); n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_STEP3VL: { - int x_patch = img->nx / (params.patch_size * params.n_merge); - int y_patch = img->ny / (params.patch_size * params.n_merge); + int x_patch = img->nx() / (params.patch_size * params.n_merge); + int y_patch = img->ny() / (params.patch_size * params.n_merge); n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_GEMMA3: @@ -3276,8 +3324,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; - int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; - int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size; n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_PADDLEOCR: @@ -3293,8 +3341,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int n_merge = ctx->model.hparams.n_merge; - int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? 
n_merge : 1); if (ctx->model.token_embd_img_break) { n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } else { @@ -3307,7 +3355,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_MERALION: case PROJECTOR_TYPE_MUSIC_FLAMINGO: { - n_patches = img->nx; + n_patches = img->nx(); const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; if (ctx->model.audio_has_stack_frames()) { @@ -3329,11 +3377,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk const int chunk_size = 100; const int tokens_per_chunk = 13; - n_patches = (img->nx / chunk_size) * tokens_per_chunk; + n_patches = (img->nx() / chunk_size) * tokens_per_chunk; } break; case PROJECTOR_TYPE_GLMA: { - n_patches = img->nx; + n_patches = img->nx(); // whisper downscales input token by half after conv1d n_patches /= 2; // reshape by merge_factor @@ -3360,8 +3408,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; - int ow = (img->nx / patch_size) / merge; - int oh = (img->ny / patch_size) / merge; + int ow = (img->nx() / patch_size) / merge; + int oh = (img->ny() / patch_size) / merge; n_patches = (ow + 1) * oh + 2; } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: @@ -3375,13 +3423,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_LFM2A: { - n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2; } break; case PROJECTOR_TYPE_GEMMA4A: { // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2 // O = floor((I - 1) / 2) + 1 - int n = img->nx; + int n = img->nx(); for (int i = 0; i < 2; i++) { n = (n - 1) / 2 + 1; } @@ -3389,13 +3437,30 @@ int 
clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_GEMMA4UA: { - n_patches = img->nx; // no downsampling: one token per raw waveform frame + n_patches = img->nx(); // no downsampling: one token per raw waveform frame } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { const int ws = ctx->model.hparams.audio_proj_window_size; const int ds = ctx->model.hparams.audio_proj_downsample_rate; - n_patches = ((img->nx + ws - 1) / ws) * (ws / ds); + n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds); + } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // Per-tile output token count: each projector block outputs + // query_side^2 tokens per window × n^2 windows. + // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144. + const int window_side = ctx->model.hparams.downsample_window_side; + const int query_side = ctx->model.hparams.downsample_query_side; + const int side = img->nx() / params.patch_size; + const int n = side / window_side; + n_patches = (query_side * n) * (query_side * n); + if (img->add_newline) { + // For single-tile case: append 1 newline row. + // For multi-tile rowwise: handled by caller, but here we + // report the per-tile count including one trailing newline. 
+ n_patches += 1; + } } break; default: GGML_ABORT("unsupported projector type"); @@ -3415,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; - int batch_size = imgs.entries.size(); + int n_batch_cur = imgs.entries.size(); + + // maximum supported batch size, usually == 2 for qwen-vl-based models + int n_batch_max = clip_model_n_batch_max(ctx); // TODO @ngxson : implement batch size > 1 as a loop // we don't need true batching support because the cgraph will gonna be big anyway - if (batch_size != 1) { - return false; // only support batch size of 1 + if (n_batch_cur > n_batch_max) { + return false; } // if buffers are not allocated, we need to do a warmup run to allocate them @@ -3437,8 +3505,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const auto & model = ctx->model; const auto & hparams = model.hparams; - const int image_size_width = imgs.entries[0]->nx; - const int image_size_height = imgs.entries[0]->ny; + const int image_size_width = imgs.entries[0]->nx(); + const int image_size_height = imgs.entries[0]->ny(); const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); @@ -3458,7 +3526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return inp; }; - auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector & values) { ggml_tensor * cur = get_inp_tensor(name); GGML_ASSERT(cur->type == GGML_TYPE_F32); GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); @@ -3476,7 +3544,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (!imgs.is_audio) { size_t nelem 
= 0; for (const auto & img : imgs.entries) { - nelem += img->nx * img->ny * 3; + nelem += img->nx() * img->ny() * 3; } std::vector inp_raw(nelem); @@ -3491,20 +3559,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // └─────┘ │ // ──────┘ x B - for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx; - const int ny = imgs.entries[i]->ny; - const int n = nx * ny; + // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models + // All entries must have the same spatial size (enforced by can_batch_with() during merging) + { + const int nx = imgs.entries[0]->nx(); + const int ny = imgs.entries[0]->ny(); + const int n = nx * ny; - for (int b = 0; b < batch_size; b++) { + for (int b = 0; b < n_batch_cur; b++) { + const auto & buf = imgs.entries[b]->get_ro_buf(); float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - size_t base_src = 3*(y * nx + x); // idx of the first channel - size_t base_dst = y * nx + x; // idx of the first channel - batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; - batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; - batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + size_t base_src = 3*(y * nx + x); + size_t base_dst = y * nx + x; + batch_entry[ base_dst] = buf[base_src ]; + batch_entry[1*n + base_dst] = buf[base_src + 1]; + batch_entry[2*n + base_dst] = buf[base_src + 2]; } } } @@ -3514,12 +3585,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } else { // audio input GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; - const int n_step = mel_inp->nx; - const int n_mel = mel_inp->ny; - std::vector inp_raw(n_step * n_mel); - std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); - set_input_f32("inp_raw", inp_raw); + const auto & buf = 
mel_inp->get_ro_buf(); + const int n_step = mel_inp->nx(); + const int n_mel = mel_inp->ny(); + GGML_ASSERT((size_t)n_step * n_mel == buf.size()); + + set_input_f32("inp_raw", buf); } // set input per projector @@ -4130,7 +4203,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ASSERT(imgs.entries.size() == 1); const auto & img0 = imgs.entries.front(); // Compute n_pos matching SSCP output: two stride-2 convs - int n_pos = img0->nx; + int n_pos = img0->nx(); for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; } // Chunked local attention: blocked causal mask and RPE @@ -4229,6 +4302,82 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_f32("attn_mask", mask); } } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // Granite Vision 4.1 uses precomputed permutation index + // tensors to express the _win / _unwin / spatial sampling + // reshapes as ggml_get_rows gathers. The names are set + // by g4v_gather() in models/granite4-vision.cpp. + const int patch_size = model.hparams.patch_size; + const int image_side = imgs.entries.front()->nx() / patch_size; + const int window_side = hparams.downsample_window_side; + const int query_side = hparams.downsample_query_side; + const int n = image_side / window_side; + const int new_side = n * query_side; + + // Builds the raster→window permutation indices for a + // (side, side) grid split into (n × n) windows of (win × win) + // tokens each. dst[w * win*win + p] = source raster index. 
+ auto make_win_idx = [](int side, int win) { + const int nn = side / win; + std::vector idx(static_cast(side) * side); + for (int wy = 0; wy < nn; ++wy) { + for (int wx = 0; wx < nn; ++wx) { + for (int iy = 0; iy < win; ++iy) { + for (int ix = 0; ix < win; ++ix) { + const int w = wy * nn + wx; + const int p = iy * win + ix; + const int y = wy * win + iy; + const int x = wx * win + ix; + idx[static_cast(w) * (win*win) + p] = y * side + x; + } + } + } + } + return idx; + }; + + auto make_unwin_idx = [&](int side, int win) { + const std::vector fwd = make_win_idx(side, win); + std::vector inv(fwd.size()); + for (size_t i = 0; i < fwd.size(); ++i) { + inv[fwd[i]] = static_cast(i); + } + return inv; + }; + + auto make_spatial_idx = [](int side, int offset) { + const int off_y = (offset >> 1) & 1; + const int off_x = offset & 1; + const int new_s = side / 2; + std::vector idx(static_cast(new_s) * new_s); + for (int y = 0; y < new_s; ++y) { + for (int x = 0; x < new_s; ++x) { + idx[y * new_s + x] = (y * 2 + off_y) * side + (x * 2 + off_x); + } + } + return idx; + }; + + auto upload = [&](const std::string & name, const std::vector & idx) { + ggml_tensor * t = ggml_graph_get_tensor(gf, name.c_str()); + GGML_ASSERT(t); + ggml_backend_tensor_set(t, idx.data(), 0, idx.size() * sizeof(int32_t)); + }; + + // Stage 1b only uses block 0's permutations; future stages + // will upload all blocks. 
+ for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) { + const std::string prefix = "g4v_blk" + std::to_string(bid) + "_"; + upload(prefix + "win_idx", make_win_idx(image_side, window_side)); + upload(prefix + "qwin_idx", make_win_idx(new_side, query_side)); + upload(prefix + "unwin_idx", make_unwin_idx(new_side, query_side)); + const auto spatial_offset = hparams.proj_spatial_offsets[bid]; + if (spatial_offset >= 0) { + upload(prefix + "spatial_idx", make_spatial_idx(image_side,spatial_offset)); + } + } + } break; default: GGML_ABORT("Unknown projector type"); } @@ -4384,7 +4533,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; case PROJECTOR_TYPE_GRANITE_SPEECH: - return ctx->model.qf_proj_linear_w->ne[1]; + return ctx->model.qf_proj_blocks[0].qf_proj_linear_w->ne[1]; + case PROJECTOR_TYPE_GRANITE4_VISION: + return ctx->model.qf_proj_blocks.size() * ctx->model.hparams.projection_dim; case PROJECTOR_TYPE_GLM4V: return ctx->model.mm_ffn_down_w->ne[1]; default: @@ -4404,17 +4555,15 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { - clip_image_f32 clip_img; - clip_img.buf.resize(h * w * 3); - for (int i = 0; i < h*w*3; i++) - { - clip_img.buf[i] = img[i]; +int clip_model_n_batch_max(const struct clip_ctx * ctx) { + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + return 2; + default: + return 1; } - clip_img.nx = w; - clip_img.ny = h; - clip_image_encode(ctx, n_threads, &clip_img, vec); - return true; } // @@ -4425,17 +4574,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) { return ctx->proj_type(); } -void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, 
float * mel) { - clip_image_f32 * audio = new clip_image_f32; - audio->nx = n_frames; - audio->ny = n_mel; - audio->buf.resize(n_frames * n_mel); - std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); - - batch->entries.push_back(clip_image_f32_ptr(audio)); - batch->is_audio = true; -} - const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 9b807ffa77b..18c7a1d1a7c 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -17,6 +17,15 @@ struct clip_ctx; struct clip_image_size { int width; int height; + bool operator==(const clip_image_size & other) const { + return width == other.width && height == other.height; + } + bool operator!=(const clip_image_size & other) const { + return !(*this == other); + } + int area() const { + return width * height; + } }; struct clip_image_f32; @@ -54,9 +63,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params void clip_free(struct clip_ctx * ctx); -size_t clip_embd_nbytes(const struct clip_ctx * ctx); -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); - int32_t clip_get_image_size (const struct clip_ctx * ctx); int32_t clip_get_patch_size (const struct clip_ctx * ctx); int32_t clip_get_hidden_size(const struct clip_ctx * ctx); @@ -79,9 +85,6 @@ struct clip_image_u8 * clip_image_u8_init (void); struct clip_image_f32 * clip_image_f32_init(void); struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava -// nx, ny are the output image dimensions -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); - void clip_image_size_free (struct clip_image_size * img_size); void clip_image_u8_free (struct clip_image_u8 * img); void clip_image_f32_free(struct clip_image_f32 * img); @@ -94,14 +97,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id size_t 
clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data -/** - * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. - * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes - */ -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); - -struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); - bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); @@ -109,14 +104,11 @@ bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated // do NOT add new functions like this -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); - -// use by audio input -void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel); - bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); +int clip_model_n_batch_max(const struct clip_ctx * ctx); + std::map clip_get_mem_usage(const struct clip_ctx * ctx); struct clip_cap { diff --git a/tools/mtmd/models/conformer.cpp b/tools/mtmd/models/conformer.cpp index f58c5048f59..5f2c7b97314 100644 --- a/tools/mtmd/models/conformer.cpp +++ b/tools/mtmd/models/conformer.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_conformer::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int n_pos = n_frames / 2; const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; 
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); diff --git a/tools/mtmd/models/exaone4_5.cpp b/tools/mtmd/models/exaone4_5.cpp index 7bfbaca996b..bd9e8c74886 100644 --- a/tools/mtmd/models/exaone4_5.cpp +++ b/tools/mtmd/models/exaone4_5.cpp @@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); { ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 623d2e384b6..0e1d596b41b 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() { ggml_set_name(positions, "positions"); ggml_set_input(positions); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp index c7e3794a49e..0bd4d75ac51 100644 --- a/tools/mtmd/models/granite-speech.cpp +++ b/tools/mtmd/models/granite-speech.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_granite_speech::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int context_size = hparams.audio_chunk_size; const int ctc_layer = n_layer / 2; const int conv_kernel = hparams.audio_conv_kernel_size; @@ -199,8 +199,8 @@ ggml_cgraph * clip_graph_granite_speech::build() { ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj); - ggml_tensor * queries = 
build_norm(model.qf_proj_query, - model.qf_proj_norm_w, model.qf_proj_norm_b, + ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query, + model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b, NORM_TYPE_NORMAL, proj_eps, -1); { ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1); @@ -209,8 +209,8 @@ ggml_cgraph * clip_graph_granite_speech::build() { queries = ggml_repeat(ctx0, q_3d, q_shape); } - for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) { - const auto & pl = model.qf_proj_layers[il]; + for (int il = 0; il < (int)model.qf_proj_blocks[0].qf_proj_layers.size(); il++) { + const auto & pl = model.qf_proj_blocks[0].qf_proj_layers[il]; // self-attention { @@ -265,7 +265,7 @@ ggml_cgraph * clip_graph_granite_speech::build() { } cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj); - cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b); + cur = ggml_add(ctx0, build_mm(model.qf_proj_blocks[0].qf_proj_linear_w, cur), model.qf_proj_blocks[0].qf_proj_linear_b); cb(cur, "projector_out", -1); } diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp new file mode 100644 index 00000000000..9adb6f0fdbf --- /dev/null +++ b/tools/mtmd/models/granite4-vision.cpp @@ -0,0 +1,339 @@ +#include "models.h" +#include "../clip-impl.h" +#include "../clip-model.h" + +#include +#include +#include +#include +#include + +/* + * Granite Vision 4.1 clip graph + * + * Stage 1a: SigLIP vision tower (N layers, post-norm) + * Stage 1b: WindowQFormer blocks (deepstack + spatial) + * Stage 1c: Concatenate and pack outputs + * Stage 1d: Append newline tokens if add_newline is set + */ + +// --------------------------------------------------------------------------- +// Member method implementations +// --------------------------------------------------------------------------- + +ggml_tensor * clip_graph_granite4_vision::gather( + 
ggml_tensor * src, + const std::string & name, + int idx_len) { + ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len); + ggml_set_name(idx, name.c_str()); + ggml_set_input(idx); + return ggml_get_rows(ctx0, src, idx); +} + +ggml_tensor * clip_graph_granite4_vision::interp_down( + ggml_tensor * src, + int side, + int new_side) { + const int n_embd = src->ne[0]; + ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1); + t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3)); + const int kernel = side / new_side; + t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0); + t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3)); + return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side); +} + +// --------------------------------------------------------------------------- +// build_block - WindowQFormer block implementation +// --------------------------------------------------------------------------- + +ggml_tensor * clip_graph_granite4_vision::build_block( + const qf_block & blk, + ggml_tensor * h, + int bid, + int spatial_offset, + int image_side, + int window_side, + int query_side, + float qformer_eps) { + + const int n_embd = h->ne[0]; + GGML_ASSERT(h->ne[1] == image_side * image_side); + const int n = image_side / window_side; + const int new_side = n * query_side; + const int n_windows = n * n; + const int enc_len = window_side * window_side; + const int query_len = query_side * query_side; + + auto cbx = [&](ggml_tensor * & t, const char * step) { + const std::string name = "g4v_blk" + std::to_string(bid) + "_" + step; + ggml_set_name(t, name.c_str()); + }; + + // 1. Top-level LN + cbx(h, "inp"); + ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid); + cbx(x, "norm"); + + // 2. 
enc = _win(x, image_side, window_side) + ggml_tensor * enc; + { + ggml_tensor * enc_flat = gather(x, + "g4v_blk" + std::to_string(bid) + "_win_idx", + image_side * image_side); + enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows); + } + cbx(enc, "enc"); + + // 3. downsampled = downsampler(x) + ggml_tensor * d; + (void) spatial_offset; + if (spatial_offset >= 0) { + d = gather(x, + "g4v_blk" + std::to_string(bid) + "_spatial_idx", + new_side * new_side); + } else { + d = interp_down(x, image_side, new_side); + } + cbx(d, "downsampled"); + + // 4. query_embeds = query + _win(d, new_side, query_side) + ggml_tensor * q_in; + { + ggml_tensor * dw_flat = gather(d, + "g4v_blk" + std::to_string(bid) + "_qwin_idx", + new_side * new_side); + ggml_tensor * dw = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows); + q_in = ggml_add(ctx0, dw, blk.qf_proj_query); + } + cbx(q_in, "query_embeds"); + + // 5. encoder_embeds = enc + image_positions → (C, enc_len, n_windows) + ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos); + cbx(e_in, "encoder_embeds"); + + // 6. Qformer forward. + ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid); + + // Helper for linear projections with window batching + auto linear = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * { + ggml_tensor * t = ggml_reshape_2d(ctx0, x, x->ne[0], x->ne[1] * x->ne[2]); + t = build_mm(w, t); + if (b) t = ggml_add(ctx0, t, b); + return t; + }; + + // Get the single QFormer layer + GGML_ASSERT(blk.qf_proj_layers.size() == 1); + const auto & pl = blk.qf_proj_layers[0]; + + // 6a. 
Self-attention + ggml_tensor * sa_out; + { + const int d_h = 64; + const int n_head = n_embd / d_h; + const int nq = q->ne[1]; + const float scale = 1.0f / std::sqrt((float) d_h); + + ggml_tensor * Q = linear(q, pl.q_w, pl.q_b); + ggml_tensor * K = linear(q, pl.k_w, pl.k_b); + ggml_tensor * V = linear(q, pl.v_w, pl.v_b); + + Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows); + K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows); + V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows); + + sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid); + sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows); + + sa_out = ggml_add(ctx0, sa_out, q); + sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b, + NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(sa_out, "sa_out"); + + // 6b. Cross-attention + ggml_tensor * ca_out; + { + const int d_h = 64; + const int n_head = n_embd / d_h; + const int nq = sa_out->ne[1]; + const int nkv = e_in->ne[1]; + const float scale = 1.0f / std::sqrt((float) d_h); + + ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b); + ggml_tensor * K = linear(e_in, pl.cross_attn_k_w, pl.cross_attn_k_b); + ggml_tensor * V = linear(e_in, pl.cross_attn_v_w, pl.cross_attn_v_b); + + Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows); + K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows); + V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows); + + ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b, + Q, K, V, nullptr, scale, bid); + ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows); + + ca_out = ggml_add(ctx0, ca_out, sa_out); + ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b, + NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(ca_out, "ca_out"); + + // 6c. 
FFN + ggml_tensor * ffn; + { + ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows); + t = build_mm(pl.ff_up_w, t); + if (pl.ff_up_b) t = ggml_add(ctx0, t, pl.ff_up_b); + t = ggml_gelu_erf(ctx0, t); + t = build_mm(pl.ff_down_w, t); + if (pl.ff_down_b) t = ggml_add(ctx0, t, pl.ff_down_b); + t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows); + ffn = ggml_add(ctx0, t, ca_out); + ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(ffn, "qformer_out"); + + // 7. _unwin back to raster + ggml_tensor * unwinned; + { + ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows); + unwinned = gather(flat, + "g4v_blk" + std::to_string(bid) + "_unwin_idx", + new_side * new_side); + } + cbx(unwinned, "unwin"); + + // 8. out_linear + ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned); + if (blk.qf_proj_linear_b) out = ggml_add(ctx0, out, blk.qf_proj_linear_b); + cbx(out, "out"); + + return out; +} + +// --------------------------------------------------------------------------- +// build() - top-level graph +// --------------------------------------------------------------------------- + +// Build the K-tiled, base-scaled newline row tensor. +// Shape: (n_mmproj_embd, 1) +ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) { + const int K = (int) model.qf_proj_blocks.size(); + GGML_ASSERT(K > 0); + GGML_ASSERT(n_mmproj_embd % K == 0); + const int projection_dim = n_mmproj_embd / K; + GGML_ASSERT(model.image_newline != nullptr); + GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim); + + // Build newline_row[k*projection_dim + d] = nl[d] * (k == 0 ? 
base : 1.0) + ggml_tensor * nl = model.image_newline; // (projection_dim,) + ggml_tensor * nl_first_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); + ggml_tensor * nl_row_2d; + if (K == 1) { + nl_row_2d = nl_first_2d; + } else { + ggml_tensor * nl_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); + ggml_tensor * rest_template = ggml_new_tensor_2d( + ctx0, GGML_TYPE_F32, projection_dim, K - 1); + ggml_tensor * nl_rest = ggml_repeat(ctx0, nl_2d, rest_template); + nl_row_2d = ggml_concat(ctx0, nl_first_2d, nl_rest, 1); // (projection_dim, K) + } + nl_row_2d = ggml_cont(ctx0, nl_row_2d); + return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1); +} + +// Append a single newline row at the end of the tile output. +ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) { + // For the single-tile case, append one newline row at the end. + // For the multi-tile rowwise case, this will be called per-tile + // (though currently only the single-tile path uses it). 
+ ggml_tensor * nl_row = build_newline_row(ctx0); + return ggml_concat(ctx0, tile_output, nl_row, 1); +} + +ggml_cgraph * clip_graph_granite4_vision::build() { + GGML_ASSERT(model.patch_embeddings_0 != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + GGML_ASSERT(!model.qf_proj_blocks.empty()); + + // --- Stage 1a: SigLIP encoder producing intermediate hidden states --- + ggml_tensor * inp = build_inp(); + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "pos_embed", -1); + + ggml_tensor * inpL = inp; + std::vector layer_outs(n_layer, nullptr); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + // Self-attention + ggml_tensor * Qcur = build_mm(layer.q_w, cur); + if (layer.q_b) Qcur = ggml_add(ctx0, Qcur, layer.q_b); + ggml_tensor * Kcur = build_mm(layer.k_w, cur); + if (layer.k_b) Kcur = ggml_add(ctx0, Kcur, layer.k_b); + ggml_tensor * Vcur = build_mm(layer.v_w, cur); + if (layer.v_b) Vcur = ggml_add(ctx0, Vcur, layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + layer_outs[il] = cur; + inpL = cur; + } + + // --- Stage 1b/1c: WindowQFormer blocks --- + const int projector_count = hparams.vision_feature_layer.size(); + const float qformer_eps = 1e-12f; + + 
ggml_tensor * mmproj = nullptr; + for (int bid = 0; bid < projector_count; ++bid) { + const auto & blk = model.qf_proj_blocks[bid]; + + int vlayer = hparams.vision_feature_layer[bid]; + GGML_ASSERT(vlayer >= 0 && vlayer < n_layer); + ggml_tensor * h = layer_outs[vlayer]; + + ggml_tensor * stream = build_block( + blk, h, bid, + hparams.proj_spatial_offsets[bid], + n_patches_x, + hparams.downsample_window_side, + hparams.downsample_query_side, + qformer_eps); + cb(stream, (std::string("proj_") + std::to_string(bid) + std::string("_v_out")).c_str(), vlayer); + mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream; + } + + // --- Stage 1d: Append newline tokens if add_newline is set --- + if (add_newline) { + mmproj = append_rowwise_newlines(ctx0, mmproj); + ggml_set_name(mmproj, "g4v_mmproj_out_nl"); + } else { + ggml_set_name(mmproj, "g4v_mmproj_out"); + } + ggml_build_forward_expand(gf, mmproj); + + return gf; +} diff --git a/tools/mtmd/models/kimik25.cpp b/tools/mtmd/models/kimik25.cpp index cf9f27f63af..cb345f0fc62 100644 --- a/tools/mtmd/models/kimik25.cpp +++ b/tools/mtmd/models/kimik25.cpp @@ -7,8 +7,8 @@ // with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3). 
ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; + const int height = img.ny() / patch_size; + const int width = img.nx() / patch_size; const uint32_t mode = interpolation_mode; GGML_ASSERT(pos_embd); diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp index 4af17ccfe85..5aa3d2f0fac 100644 --- a/tools/mtmd/models/llava.cpp +++ b/tools/mtmd/models/llava.cpp @@ -51,7 +51,6 @@ ggml_cgraph * clip_graph_llava::build() { } std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; // loop over layers for (int il = 0; il < max_feature_layer; il++) { @@ -60,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() { // If this is an embedding feature layer, save the output. // NOTE: 0 index here refers to the input to the encoder. - if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + if (hparams.is_vision_feature_layer(il)) { embedding_stack.push_back(cur); } @@ -135,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() { // process vision feature layers (used by granite) { // final layer is a vision feature layer - if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + if (hparams.is_vision_feature_layer(max_feature_layer)) { embedding_stack.push_back(inpL); } diff --git a/tools/mtmd/models/mimovl.cpp b/tools/mtmd/models/mimovl.cpp index 19db88f132a..6ff1124a02f 100644 --- a/tools/mtmd/models/mimovl.cpp +++ b/tools/mtmd/models/mimovl.cpp @@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() { patch_size, patch_size, 0, 0, 1, 1); inp = ggml_add(ctx0, inp, inp_1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // 
[w,h,c,b] -> [c,w,h,b] inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index b882f800dd7..12082a5280a 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph { struct clip_graph_qwen2vl : clip_graph { clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + ggml_tensor * build_inp_with_temporal_merge(); }; -struct clip_graph_qwen3vl : clip_graph { - clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} +struct clip_graph_qwen3vl : clip_graph_qwen2vl { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {} ggml_cgraph * build() override; }; @@ -211,3 +212,26 @@ struct clip_graph_exaone4_5 : clip_graph { clip_graph_exaone4_5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_granite4_vision : clip_graph { + clip_graph_granite4_vision(clip_ctx * ctx, const clip_image_f32 & img) + : clip_graph(ctx, img), + add_newline(img.add_newline) {} + + ggml_cgraph * build() override; + +private: + // The graph is per-tile since only batch-size 1 is supported in clip. As + // such, this value is set at construct time based on the tile that will be + // encoded, then used during build to determine how to handle newlines. 
+ const bool add_newline; + + ggml_tensor * gather(ggml_tensor * src, const std::string & name, int idx_len); + ggml_tensor * interp_down(ggml_tensor * src, int side, int new_side); + ggml_tensor * build_block(const qf_block & blk, ggml_tensor * h, int bid, + int spatial_offset, int image_side, int window_side, + int query_side, float qformer_eps); + + ggml_tensor * build_newline_row(ggml_context * ctx0); + ggml_tensor * append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output); +}; diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp index ebf10757376..2220c2692a1 100644 --- a/tools/mtmd/models/qwen2vl.cpp +++ b/tools/mtmd/models/qwen2vl.cpp @@ -1,5 +1,34 @@ #include "models.h" +ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() { + ggml_tensor * inp_raw = build_inp_raw(); + + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); + + const size_t nb1 = ggml_row_size(inp_raw->type, img.nx()); + const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny()); + + if (n_batch == 1) { + // still image input + return ggml_add(ctx0, + ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1), + ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1)); + } else if (n_batch == 2) { + // 2 frames input (video input) + ggml_tensor * inp_0 = ggml_view_3d(ctx0, inp_raw, + img.nx(), img.ny(), 3, nb1, nb2, 0); + ggml_tensor * inp_1 = ggml_view_3d(ctx0, inp_raw, + img.nx(), img.ny(), 3, nb1, nb2, + nb2 * 3); // move to the second frame + return ggml_add(ctx0, + ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1), + ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1)); + } else { + GGML_ASSERT(false && "n_batch > 2 is not supported"); + } +} + ggml_cgraph * clip_graph_qwen2vl::build() { GGML_ASSERT(model.patch_bias == nullptr); 
GGML_ASSERT(model.class_embedding == nullptr); @@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() { int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + ggml_tensor * inp = build_inp_with_temporal_merge(); // second conv dimension { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] inp = ggml_cont_4d( ctx0, inp, diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index fa1100dda8d..261e77a198a 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() { int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + ggml_tensor * inp = build_inp_with_temporal_merge(); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension + // spatial merge { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] inp = ggml_cont_4d( ctx0, inp, diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp index 2a82ae50bf5..49d5dd5add3 100644 --- a/tools/mtmd/models/whisper-enc.cpp +++ b/tools/mtmd/models/whisper-enc.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_whisper_enc::build() { - const int n_frames = img.nx; + 
const int n_frames = img.nx(); const int n_pos = n_frames / 2; GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index d6e551618e8..bd7f9871c3c 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -166,7 +166,7 @@ struct mtmd_cli_context { } bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false)); if (!bmp.ptr) { return false; } diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 40940741637..94ad01511ed 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int } // namespace audio_helpers -mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) { if (audio_helpers::is_audio_file((const char *)buf, len)) { std::vector pcmf32; const int sample_rate = mtmd_get_audio_sample_rate(ctx); @@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne LOG_ERR("Unable to read WAV audio file from buffer\n"); return nullptr; } - return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); + return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data()); } // otherwise, we assume it's an image @@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne LOG_ERR("%s: failed to decode image bytes\n", __func__); return nullptr; } - result = mtmd_bitmap_init(nx, ny, data); + result = mtmd_bitmap_init(nx, ny, placeholder ? 
nullptr : data); stbi_image_free(data); } return result; } -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) { std::vector buf; FILE * f = fopen(fname, "rb"); if (!f) { @@ -533,5 +533,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * return nullptr; } - return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); + return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder); } + diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 57da78a754f..7eecbb06723 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -29,7 +29,7 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da // it calls mtmd_helper_bitmap_init_from_buf() internally // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); // helper function to construct a mtmd_bitmap from a buffer containing a file // supported formats: @@ -38,7 +38,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con // note: audio files will be auto-detected based on magic bytes // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); diff --git a/tools/mtmd/mtmd-image.cpp 
b/tools/mtmd/mtmd-image.cpp index caf72d53621..bedf44e07cf 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -9,25 +9,12 @@ // void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - // TODO @ngxson : seems like this could be done more efficiently on cgraph - for (size_t i = 0; i < src.buf.size(); ++i) { - int c = i % 3; // rgb - dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; - } + dst.from_u8(src); + dst.normalize(mean, std); } void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(src.buf[i]); - } + dst.from_u8(src); } // set of tools to manipulate images @@ -40,13 +27,16 @@ struct img_tool { resize_algo algo, pad_style padding = PAD_CEIL, std::array pad_color = {0, 0, 0}) { - dst.nx = target_resolution.width; - dst.ny = target_resolution.height; - dst.buf.resize(3 * dst.nx * dst.ny); + dst.set_size(target_resolution, src.is_placeholder()); - if (dst.nx == src.nx && dst.ny == src.ny) { + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + if (dst.get_size() == src.get_size()) { // no resize needed, simple copy - dst.buf = src.buf; + dst.cpy_buf(src.get_ro_buf()); return; } @@ -68,17 +58,17 @@ struct img_tool { } else { // resize with padding clip_image_u8 resized_image; - float scale_w = static_cast(target_resolution.width) / src.nx; - float scale_h = static_cast(target_resolution.height) / src.ny; + float scale_w = static_cast(target_resolution.width) / src.get_size().width; + float scale_h = static_cast(target_resolution.height) / src.get_size().height; float scale = std::min(scale_w, scale_h); int new_width, new_height; 
if (padding == PAD_NEAREST) { - new_width = std::min(static_cast(std::round(src.nx * scale)), target_resolution.width); - new_height = std::min(static_cast(std::round(src.ny * scale)), target_resolution.height); + new_width = std::min(static_cast(std::round(src.get_size().width * scale)), target_resolution.width); + new_height = std::min(static_cast(std::round(src.get_size().height * scale)), target_resolution.height); } else { - new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); - new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + new_width = std::min(static_cast(std::ceil(src.get_size().width * scale)), target_resolution.width); + new_height = std::min(static_cast(std::ceil(src.get_size().height * scale)), target_resolution.height); } switch (algo) { @@ -112,18 +102,17 @@ struct img_tool { static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0); - GGML_ASSERT(x + w <= image.nx && y + h <= image.ny); - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); + GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height); + dst.set_size({w, h}, image.is_placeholder()); + + if (image.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + dst.set_pixel(j, i, image.get_pixel(x + j, y + i)); } } } @@ -181,81 +170,101 @@ struct img_tool { // draw src image into dst image at offset (offset_x, offset_y) static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { - for (int y = 0; y < src.ny; ++y) { - for (int x = 0; x < src.nx; ++x) { 
+ if (src.is_placeholder()) { + // no-op for placeholder image + return; + } + + const auto src_size = src.get_size(); + const auto dst_size = dst.get_size(); + for (int y = 0; y < src_size.height; ++y) { + for (int x = 0; x < src_size.width; ++x) { int dx = x + offset_x; int dy = y + offset_y; // skip pixels that would be out of bounds in the destination - if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) { continue; } - size_t dst_idx = 3 * (static_cast(dy) * dst.nx + static_cast(dx)); - size_t src_idx = 3 * (static_cast(y) * src.nx + static_cast(x)); - dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + dst.set_pixel(dx, dy, src.get_pixel(x, y)); } } } // fill the image with a solid color static void fill(clip_image_u8 & img, const std::array & color) { - for (size_t i = 0; i < img.buf.size(); i += 3) { - img.buf[i] = color[0]; - img.buf[i + 1] = color[1]; - img.buf[i + 2] = color[2]; + if (img.is_placeholder()) { + // no-op for placeholder image + return; + } + + const auto size = img.get_size(); + for (int y = 0; y < size.height; ++y) { + for (int x = 0; x < size.width; ++x) { + img.set_pixel(x, y, color); + } } } private: // Bilinear resize function static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { - if (src.nx == 0 || src.ny == 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; } + const auto src_size = src.get_size(); + if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; } if (target_width <= 0) target_width = 1; if (target_height <= 0) target_height = 1; - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false); - float x_ratio = target_width > 1 ? 
static_cast(src.nx - 1) / (target_width - 1) : 0.0f; - float y_ratio = target_height > 1 ? static_cast(src.ny - 1) / (target_height - 1) : 0.0f; + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + float x_ratio = target_width > 1 ? static_cast(src_size.width - 1) / (target_width - 1) : 0.0f; + float y_ratio = target_height > 1 ? static_cast(src_size.height - 1) / (target_height - 1) : 0.0f; for (int y = 0; y < target_height; ++y) { for (int x = 0; x < target_width; ++x) { float px = x * x_ratio; float py = y * y_ratio; - int x0 = std::min(static_cast(px), src.nx - 1); - int y0 = std::min(static_cast(py), src.ny - 1); - int x1 = std::min(x0 + 1, src.nx - 1); - int y1 = std::min(y0 + 1, src.ny - 1); + int x0 = std::min(static_cast(px), src_size.width - 1); + int y0 = std::min(static_cast(py), src_size.height - 1); + int x1 = std::min(x0 + 1, src_size.width - 1); + int y1 = std::min(y0 + 1, src_size.height - 1); float xf = px - x0; float yf = py - y0; + const auto p00 = src.get_pixel(x0, y0); + const auto p10 = src.get_pixel(x1, y0); + const auto p01 = src.get_pixel(x0, y1); + const auto p11 = src.get_pixel(x1, y1); + + std::array pixel; for (int c = 0; c < 3; ++c) { - float top = lerp(static_cast(src.buf[3 * (y0 * src.nx + x0) + c]), - static_cast(src.buf[3 * (y0 * src.nx + x1) + c]), - xf); - float bottom = lerp(static_cast(src.buf[3 * (y1 * src.nx + x0) + c]), - static_cast(src.buf[3 * (y1 * src.nx + x1) + c]), - xf); - dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, yf)); + float top = lerp(static_cast(p00[c]), static_cast(p10[c]), xf); + float bottom = lerp(static_cast(p01[c]), static_cast(p11[c]), xf); + pixel[c] = static_cast(lerp(top, bottom, yf)); } + dst.set_pixel(x, y, pixel); } } } // Bicubic resize function // part of image will be cropped if the aspect ratio is different - static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int 
target_height) { - const int nx = img.nx; - const int ny = img.ny; + static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const auto img_size = img.get_size(); + const int nx = img_size.width; + const int ny = img_size.height; + + dst.set_size({target_width, target_height}, false); - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + if (img.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } float Cc; float C[5] = {}; @@ -280,12 +289,13 @@ struct img_tool { dx = tx * j - x; dy = ty * i - y; + std::array pixel; for (k = 0; k < 3; k++) { for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; @@ -303,13 +313,12 @@ struct img_tool { Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; const uint8_t Cc2 = 
std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + pixel[k] = Cc2; } } + dst.set_pixel(j, i, pixel); } } - - return true; } // Bicubic resize function using Pillow's ImagingResample algorithm @@ -455,16 +464,17 @@ struct img_tool { }; // Horizontal resampling pass - // Resizes width from imIn.nx to imOut.nx, preserving height + // Resizes width from imIn to out_nx, preserving height auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int out_nx, int ksize, const std::vector & bounds, const std::vector & weights) { - imOut.ny = imIn.ny; - imOut.buf.resize(3 * imOut.nx * imOut.ny); + const int in_ny = imIn.get_size().height; + imOut.set_size({out_nx, in_ny}, false); // Process each row independently - for (int yy = 0; yy < imOut.ny; yy++) { + for (int yy = 0; yy < in_ny; yy++) { // For each output pixel in this row - for (int xx = 0; xx < imOut.nx; xx++) { + for (int xx = 0; xx < out_nx; xx++) { // Get the range of input pixels and filter coefficients int xmin = bounds[xx * 2 + 0]; // First input pixel index int xcnt = bounds[xx * 2 + 1]; // Number of input pixels @@ -476,36 +486,36 @@ struct img_tool { // Convolve: sum weighted input pixels for (int x = 0; x < xcnt; x++) { - int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel + const auto src_px = imIn.get_pixel(x + xmin, yy); + ss0 += src_px[0] * weights[xx * ksize + x]; // R channel + ss1 += src_px[1] * weights[xx * ksize + x]; // G channel + ss2 += src_px[2] * weights[xx * ksize + x]; // B channel } // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - 
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS), + clip8(ss1 >> PRECISION_BITS), + clip8(ss2 >> PRECISION_BITS)}); } } }; // Vertical resampling pass - // Resizes height from imIn.ny to imOut.ny, preserving width + // Resizes height from imIn to out_ny, preserving width auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int out_ny, int ksize, const std::vector & bounds, const std::vector & weight) { - imOut.nx = imIn.nx; - imOut.buf.resize(3 * imOut.nx * imOut.ny); + const int in_nx = imIn.get_size().width; + imOut.set_size({in_nx, out_ny}, false); // For each output row - for (int yy = 0; yy < imOut.ny; yy++) { + for (int yy = 0; yy < out_ny; yy++) { // Get the range of input rows and filter coefficients int ymin = bounds[yy * 2 + 0]; // First input row index int ycnt = bounds[yy * 2 + 1]; // Number of input rows // Process each column in this output row - for (int xx = 0; xx < imOut.nx; xx++) { + for (int xx = 0; xx < in_nx; xx++) { // Initialize accumulators for RGB channels with rounding bias int32_t ss0 = 1 << (PRECISION_BITS - 1); int32_t ss1 = 1 << (PRECISION_BITS - 1); @@ -513,27 +523,23 @@ struct img_tool { // Convolve: sum weighted input pixels vertically for (int y = 0; y < ycnt; y++) { - int src_idx = ((y + ymin) * imIn.nx + xx) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel + const auto src_px = imIn.get_pixel(xx, y + ymin); + ss0 += src_px[0] * weight[yy * ksize + y]; // R channel + ss1 += src_px[1] * weight[yy * ksize + y]; // G channel + ss2 += src_px[2] * weight[yy * ksize + y]; // B channel } // Convert back from fixed-point and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - 
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS), + clip8(ss1 >> PRECISION_BITS), + clip8(ss2 >> PRECISION_BITS)}); } } }; // Main resampling logic using separable two-pass approach - const int src_width = img.nx; - const int src_height = img.ny; - - dst.nx = target_width; - dst.ny = target_height; + const int src_width = img.get_size().width; + const int src_height = img.get_size().height; bool need_horizontal = (target_width != src_width); bool need_vertical = (target_height != src_height); @@ -555,18 +561,20 @@ struct img_tool { if (need_horizontal && need_vertical) { // Both horizontal and vertical clip_image_u8 temp; - temp.nx = target_width; - resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); - resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); + resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz); + resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert); } else if (need_horizontal) { // Only horizontal - resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); + resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz); } else if (need_vertical) { // Only vertical - resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); + resample_vertical(img, dst, target_height, ksize_vert, bounds_vert, weights_vert); } else { // No resizing needed - direct copy - dst.buf = img.buf; + dst.set_size(img.get_size(), img.is_placeholder()); + if (!img.is_placeholder()) { + dst.cpy_buf(img.get_ro_buf()); + } } return true; @@ -588,7 +596,7 @@ struct img_tool { // bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - const clip_image_size original_size{img.nx, img.ny}; + const 
clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); std::vector imgs = slice_image(img, inst); @@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0); clip_image_u8 resized_image; - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( @@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(hparams.image_longest_edge > 0); clip_image_u8 resized_image; - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge const int cur_merge = hparams.n_merge == 0 ? 
1 : hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( @@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli // multiples of image_size (always rounding up) // // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( original_size, hparams.image_size, hparams.image_longest_edge); // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", @@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(!hparams.image_res_candidates.empty()); - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); std::vector imgs = slice_image(img, inst, false); @@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - const int64_t orig_area = static_cast(img.nx) * img.ny; + const int64_t orig_area = static_cast(img.get_size().area()); size_t mode_i = 0; int64_t min_diff = std::numeric_limits::max(); @@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, // emit 768x768 local tiles when the image is larger than a tile in either // dimension, then always a 1024x1024 global view. order: [tiles..., global]. 
- if (img.nx > tile_size || img.ny > tile_size) { - const float aspect_ratio = static_cast(img.nx) / img.ny; + const auto img_size = img.get_size(); + if (img_size.width > tile_size || img_size.height > tile_size) { + const float aspect_ratio = static_cast(img_size.width) / img_size.height; const auto target_ratios = get_target_ratios(); - const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height); // stretch onto the grid (no aspect preserve), then crop tiles row-major. clip_image_u8 refined; @@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32( int target_height, const float mean[3], const float std[3]) { - if (src.nx == target_width && src.ny == target_height) { + const auto src_size = src.get_size(); + if (src_size.width == target_width && src_size.height == target_height) { img_u8_to_f32(src, dst, mean, std); return; } - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false, false); + + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + const float scale_x = static_cast(src_size.width) / target_width; + const float scale_y = static_cast(src_size.height) / target_height; - const float scale_x = static_cast(src.nx) / target_width; - const float scale_y = static_cast(src.ny) / target_height; + std::vector local_buf(3 * target_width * target_height); for (int y = 0; y < target_height; ++y) { const float src_y = (static_cast(y) + 0.5f) * scale_y - 0.5f; const int y0_floor = static_cast(std::floor(src_y)); - const int y0 = std::max(0, std::min(y0_floor, src.ny - 1)); - const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1)); + const int y0 = std::max(0, std::min(y0_floor, src_size.height - 1)); + const int y1 
= std::max(0, std::min(y0_floor + 1, src_size.height - 1)); const float ly = src_y - y0_floor; for (int x = 0; x < target_width; ++x) { const float src_x = (static_cast(x) + 0.5f) * scale_x - 0.5f; const int x0_floor = static_cast(std::floor(src_x)); - const int x0 = std::max(0, std::min(x0_floor, src.nx - 1)); - const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1)); + const int x0 = std::max(0, std::min(x0_floor, src_size.width - 1)); + const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1)); const float lx = src_x - x0_floor; - const size_t idx00 = 3 * (y0 * src.nx + x0); - const size_t idx01 = 3 * (y0 * src.nx + x1); - const size_t idx10 = 3 * (y1 * src.nx + x0); - const size_t idx11 = 3 * (y1 * src.nx + x1); - const size_t idx_dst = 3 * (y * target_width + x); + const auto p00 = src.get_pixel(x0, y0); + const auto p01 = src.get_pixel(x1, y0); + const auto p10 = src.get_pixel(x0, y1); + const auto p11 = src.get_pixel(x1, y1); + const size_t idx_dst = 3 * (y * target_width + x); for (int c = 0; c < 3; ++c) { - const float v00 = (static_cast(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c]; - const float v01 = (static_cast(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c]; - const float v10 = (static_cast(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c]; - const float v11 = (static_cast(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c]; + const float v00 = (static_cast(p00[c]) / 255.0f - mean[c]) / std[c]; + const float v01 = (static_cast(p01[c]) / 255.0f - mean[c]) / std[c]; + const float v10 = (static_cast(p10[c]) / 255.0f - mean[c]) / std[c]; + const float v11 = (static_cast(p11[c]) / 255.0f - mean[c]) / std[c]; const float top = v00 + (v01 - v00) * lx; const float bot = v10 + (v11 - v10) * lx; - dst.buf[idx_dst + c] = top + (bot - top) * ly; + local_buf[idx_dst + c] = top + (bot - top) * ly; } } } + dst.cpy_buf(local_buf); } int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) { @@ -1341,26 +1357,26 @@ 
std::vector mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) { clip_image_u8 resized = img; - const float aspect_ratio = img.ny > 0 ? static_cast(img.nx) / img.ny : 1.0f; - if (std::min(img.nx, img.ny) < 32 && + const auto img_size = img.get_size(); + const float aspect_ratio = img_size.height > 0 ? static_cast(img_size.width) / img_size.height : 1.0f; + if (std::min(img_size.width, img_size.height) < 32 && (aspect_ratio > wide_aspect_ratio_limit || aspect_ratio < 1.0f / wide_aspect_ratio_limit)) { - const int square_size = std::max(img.nx, img.ny); + const int square_size = std::max(img_size.width, img_size.height); clip_image_u8 padded; - padded.nx = square_size; - padded.ny = square_size; - padded.buf.resize(3 * square_size * square_size); + padded.set_size({square_size, square_size}, false); img_tool::fill(padded, {0, 0, 0}); img_tool::composite(padded, img, 0, 0); resized = std::move(padded); } const int max_image_size = get_image_longest_edge(params); - if (std::max(resized.nx, resized.ny) > max_image_size) { - const float scale = static_cast(max_image_size) / std::max(resized.nx, resized.ny); + const auto resized_size = resized.get_size(); + if (std::max(resized_size.width, resized_size.height) > max_image_size) { + const float scale = static_cast(max_image_size) / std::max(resized_size.width, resized_size.height); const clip_image_size new_size = { - std::max(1, static_cast(std::floor(resized.nx * scale))), - std::max(1, static_cast(std::floor(resized.ny * scale))), + std::max(1, static_cast(std::floor(resized_size.width * scale))), + std::max(1, static_cast(std::floor(resized_size.height * scale))), }; clip_image_u8 scaled; img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE); @@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 clip_image_u8 
mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) { clip_image_u8 dst; - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h, 0); + dst.set_size({w, h}, false); + img_tool::fill(dst, {0, 0, 0}); + const auto img_size = image.get_size(); const int src_x0 = std::max(0, x); const int src_y0 = std::max(0, y); - const int src_x1 = std::min(image.nx, x + w); - const int src_y1 = std::min(image.ny, y + h); + const int src_x1 = std::min(img_size.width, x + w); + const int src_y1 = std::min(img_size.height, y + h); if (src_x0 >= src_x1 || src_y0 >= src_y1) { return dst; @@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli for (int yy = 0; yy < src_y1 - src_y0; ++yy) { for (int xx = 0; xx < src_x1 - src_x0; ++xx) { - const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx)); - const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx)); - dst.buf[dst_idx + 0] = image.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy)); } } @@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { clip_image_u8 prepared = prepare_image(img, hparams); - const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny}); + const auto instructions = build_slice_instructions(hparams, prepared.get_size()); clip_image_f32_ptr overview_f32(clip_image_f32_init()); img_u8_resize_bilinear_to_f32( @@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip } clip_image_u8 img_for_crop = prepared; - if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) { + const auto 
prepared_size = prepared.get_size(); + if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) { clip_image_u8 refined; img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE); img_for_crop = std::move(refined); @@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip hparams.image_max_pixels / (patch_size * patch_size) : 256; // Linear search for optimal scale to fit within max_num_patches + const auto img_size = img.get_size(); float scale = 1.0f; - int target_height = img.ny; - int target_width = img.nx; + int target_height = img_size.height; + int target_width = img_size.width; auto get_scaled_image_size = [align_size](float scale, int size) -> int { float scaled_size = size * scale; @@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip // Linear search with 0.02 step size while (scale > 0.0f) { - target_height = get_scaled_image_size(scale, img.ny); - target_width = get_scaled_image_size(scale, img.nx); + target_height = get_scaled_image_size(scale, img_size.height); + target_width = get_scaled_image_size(scale, img_size.width); int num_patches_h = target_height / patch_size; int num_patches_w = target_width / patch_size; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 0b5caa6cb5c..c93fb1e0a4a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -24,14 +24,54 @@ #include #include -// represents raw image data, layout is RGBRGBRGB... -// length of data must be nx * ny * 3 +// for still image data, layout is RGBRGBRGB... +// length of data must be nx * ny * 3 bytes +// +// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ... 
+// length of data must be nx * sizeof(float) bytes struct mtmd_bitmap { - uint32_t nx; - uint32_t ny; - std::vector data; + uint32_t nx = 0; + uint32_t ny = 0; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking bool is_audio = false; // true if the bitmap is audio + + mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny) + : nx(nx), ny(ny), is_audio(false) { + if (data) { + size_t data_size = (size_t)nx * ny * 3; + this->data.resize(data_size); + std::memcpy(this->data.data(), data, data_size); + } + } + + mtmd_bitmap(const unsigned char * data, uint32_t n_samples) + : nx(n_samples), ny(1), is_audio(true) { + if (data) { + size_t data_size = (size_t)nx * sizeof(float); + this->data.resize(data_size); + std::memcpy(this->data.data(), data, data_size); + } + } + + const std::vector & get_ro_buf() const { + return data; + } + + bool is_placeholder() const { + return data.empty(); + } + + size_t n_bytes() const { + return data.size(); + } + + bool can_batch_with(const mtmd_bitmap & other) const { + // [QWEN_VIDEO] can batch if both are images with same size + return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny; + } + + private: + std::vector data; }; // position indexing for decoder model @@ -42,8 +82,8 @@ enum mtmd_pos_type { }; struct mtmd_image_tokens { - uint32_t nx; // number of tokens in x direction - uint32_t ny; // number of tokens in y direction + uint32_t nx = 0; // number of tokens in x direction + uint32_t ny = 0; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) uint32_t n_tokens() const { @@ -56,6 +96,16 @@ struct mtmd_image_tokens { clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + // true if one of entries in batch_f32 is a 
placeholder + bool is_placeholder() const { + for (const auto & entry : batch_f32.entries) { + if (entry->is_placeholder()) { + return true; + } + } + return false; + } + mtmd_image_tokens clone() { return mtmd_image_tokens{ nx, @@ -70,10 +120,20 @@ struct mtmd_image_tokens { using mtmd_image_tokens_ptr = std::unique_ptr; struct mtmd_audio_tokens { - uint32_t n_tokens; // number of tokens + uint32_t n_tokens = 0; // number of tokens clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + // true if one of entries in batch_f32 is a placeholder + bool is_placeholder() const { + for (const auto & entry : batch_f32.entries) { + if (entry->is_placeholder()) { + return true; + } + } + return false; + } + mtmd_audio_tokens clone() { return mtmd_audio_tokens{ n_tokens, @@ -513,6 +573,12 @@ struct mtmd_context { img_end = ""; image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + img_beg = ""; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); } @@ -690,16 +756,55 @@ struct mtmd_tokenizer { cur.entries.clear(); std::vector parts = split_text(input_text, ctx->media_marker); size_t i_bm = 0; // index of the current bitmap + + // [QWEN_VIDEO] handle frame merging for models that support it (i.e. 
qwen-vl) + int n_merge_frames = 1; + if (ctx->ctx_v) { + n_merge_frames = clip_model_n_batch_max(ctx->ctx_v); + GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more"); + } + + std::vector> merged_bitmaps; + if (n_merge_frames > 1) { + size_t i_bm_scan = 0; + for (size_t i = 0; i < parts.size(); ++i) { + if (parts[i] != ctx->media_marker) { + continue; + } + if (i + 1 < parts.size() + && parts[i + 1] == ctx->media_marker + && i_bm_scan + 1 < bitmaps.size()) { + const mtmd_bitmap * bm_a = bitmaps[i_bm_scan]; + const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1]; + if (bm_a->can_batch_with(*bm_b)) { + LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1); + merged_bitmaps.push_back({bm_a, bm_b}); + parts.erase(parts.begin() + i + 1); // remove the second marker + i_bm_scan += 2; + continue; + } + } + LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan); + merged_bitmaps.push_back({bitmaps[i_bm_scan]}); + ++i_bm_scan; + } + } else { + for (size_t i = 0; i < bitmaps.size(); ++i) { + merged_bitmaps.push_back({bitmaps[i]}); + } + } + + i_bm = 0; for (auto & part : parts) { if (part == ctx->media_marker) { // this is a marker, we should add the next bitmap - if (i_bm >= bitmaps.size()) { + if (i_bm >= merged_bitmaps.size()) { LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n", - __func__, bitmaps.size(), parts.size() - 1); + __func__, merged_bitmaps.size(), parts.size() - 1); return 1; } - const mtmd_bitmap * bitmap = bitmaps[i_bm++]; - int32_t res = add_media(bitmap); + auto & bmps = merged_bitmaps[i_bm++]; + int32_t res = add_media(bmps); if (res != 0) { return res; } @@ -734,9 +839,9 @@ struct mtmd_tokenizer { } } - if (i_bm != bitmaps.size()) { + if (i_bm != merged_bitmaps.size()) { LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n", - __func__, bitmaps.size(), 
parts.size() - 1); + __func__, merged_bitmaps.size(), parts.size() - 1); return 1; } @@ -775,8 +880,10 @@ struct mtmd_tokenizer { } } - int32_t add_media(const mtmd_bitmap * bitmap) { - if (!bitmap->is_audio) { + int32_t add_media(std::vector & bitmaps) { + GGML_ASSERT(!bitmaps.empty()); + + if (!bitmaps[0]->is_audio) { // handle image if (!ctx->ctx_v) { @@ -788,24 +895,59 @@ struct mtmd_tokenizer { add_text(ctx->img_beg, true); // add image begin token } - // sanity check - GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0); - GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3); - GGML_ASSERT(ctx->image_preproc != nullptr); - - // convert mtmd_bitmap to clip_image_u8 - clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->nx = bitmap->nx; - img_u8->ny = bitmap->ny; - img_u8->buf.resize(bitmap->data.size()); - std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); - - // preprocess image + // TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. 
by changing the preprocessor interface always take single input) + clip_image_f32_batch batch_f32; - bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32); - if (!ok) { - LOG_ERR("Unable to preprocess image\n"); - return 2; + + for (const auto * bmp : bitmaps) { + // sanity check + GGML_ASSERT(!bmp->is_audio); + GGML_ASSERT(ctx->image_preproc != nullptr); + if (bmp->nx <= 0 || bmp->ny <= 0) { + LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n", + __func__, bmp->nx, bmp->ny); + return 2; + } + + // convert mtmd_bitmap to clip_image_u8 + clip_image_u8_ptr img_u8(clip_image_u8_init()); + img_u8->set_size( + {(int)bmp->nx, (int)bmp->ny}, + bmp->is_placeholder()); + img_u8->cpy_buf(bmp->get_ro_buf()); + + // preprocess image + clip_image_f32_batch tmp_batch; + bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch); + if (!ok) { + LOG_ERR("Unable to preprocess image\n"); + return 2; + } + + // move entries and grid dimensions to the "global" batch_f32 + for (auto & entry : tmp_batch.entries) { + batch_f32.entries.emplace_back(std::move(entry)); + } + + // for llava-uhd style, we need to handle grid too + // we don't care about overwriting these values for now because llama-uhd doesn't support batching anyway + batch_f32.grid_x = tmp_batch.grid_x; + batch_f32.grid_y = tmp_batch.grid_y; + } + + // Annotate llava-next style tiles so clip_n_output_tokens accounts + // for per-tile newline injection. + if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) { + if (batch_f32.entries.size() == 1) { + // Single-tile (overview only): append one newline row. + batch_f32.entries[0]->add_newline = true; + } else { + // Multi-tile: overview gets no newline, grid tiles get one. 
+ batch_f32.entries[0]->add_newline = false; + for (size_t i = 1; i < batch_f32.entries.size(); ++i) { + batch_f32.entries[i]->add_newline = true; + } + } } // handle llava-uhd style preprocessing @@ -818,11 +960,14 @@ struct mtmd_tokenizer { || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid) ) { + // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now + GGML_ASSERT(bitmaps.size() == 1); + const int n_col = batch_f32.grid_x; const int n_row = batch_f32.grid_y; // split batch into chunks of single images // NOTE: batch_f32 will be invalidated after this call - auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id); + auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id); GGML_ASSERT(chunks.size() > 0); auto ov_chunk = std::move(chunks.front()); @@ -872,9 +1017,14 @@ struct mtmd_tokenizer { } } else { + size_t n_tokens = 0; - for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); + for (const auto & e : batch_f32.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); + if (clip_model_n_batch_max(ctx->ctx_v) == 2) { + // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image + break; + } } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); @@ -897,7 +1047,7 @@ struct mtmd_tokenizer { GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens()); } image_tokens->batch_f32 = std::move(batch_f32); - image_tokens->id = bitmap->id; // optional + image_tokens->id = bitmaps[0]->id; // optional LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); @@ -922,12 +1072,15 @@ struct mtmd_tokenizer { } else { // handle audio + GGML_ASSERT(bitmaps.size() == 1); // no batching support for now + auto & bitmap = bitmaps[0]; + if (!ctx->ctx_a) { LOG_ERR("%s: error: model does not support audio input\n", __func__); 
return 2; } - if (bitmap->data.size() == 0) { + if (bitmap->nx == 0) { LOG_ERR("%s: error: empty audio data\n", __func__); return 2; } @@ -938,26 +1091,46 @@ struct mtmd_tokenizer { // sanity check GGML_ASSERT(ctx->audio_preproc != nullptr); - GGML_ASSERT(bitmap->data.size() > sizeof(float)); - GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0); // preprocess audio std::vector mel_spec_chunks; - const float * samples = (const float *)bitmap->data.data(); - size_t n_samples = bitmap->data.size() / sizeof(float); - bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks); - if (!ok) { - LOG_ERR("Unable to preprocess audio\n"); - return 2; + { + std::vector dummy; + const float * samples = nullptr; + size_t n_samples = 0; + if (bitmap->is_placeholder()) { + // TODO @ngxson : skip underlay processing if bitmap is placeholder + GGML_ASSERT(bitmap->ny == 1); + + dummy.resize(bitmap->nx); + samples = dummy.data(); + n_samples = dummy.size(); + } else { + const auto & buf = bitmap->get_ro_buf(); + GGML_ASSERT(buf.size() > sizeof(float)); + GGML_ASSERT(buf.size() % sizeof(float) == 0); + + samples = (const float *)buf.data(); + n_samples = buf.size() / sizeof(float); + } + bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks); + if (!ok) { + LOG_ERR("Unable to preprocess audio\n"); + return 2; + } } // consider each mel_spec as a separate audio chunk // TODO: maybe support batching, but this may come with memory cost for (auto & mel_spec : mel_spec_chunks) { + const bool is_placeholder = mel_spec.data.empty(); + clip_image_f32_ptr mel_f32(clip_image_f32_init()); - mel_f32->nx = mel_spec.n_len; - mel_f32->ny = mel_spec.n_mel; - mel_f32->buf = std::move(mel_spec.data); + mel_f32->set_size( + {mel_spec.n_len, mel_spec.n_mel}, + is_placeholder, /* is_audio */ true); + mel_f32->cpy_buf(mel_spec.data); + size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get()); clip_image_f32_batch batch_f32; @@ -1076,12 +1249,28 @@ int32_t 
mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: model does not support vision input\n", __func__); return 1; } + if (chunk->tokens_image == nullptr) { + LOG_ERR("%s: image tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_image->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } return mtmd_encode(ctx, chunk->tokens_image.get()); } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); return 1; } + if (chunk->tokens_audio == nullptr) { + LOG_ERR("%s: audio tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_audio->is_placeholder()) { + LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); + return 1; + } int n_mmproj_embd = ctx->n_embd_text; ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( @@ -1111,13 +1300,18 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) || proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE || proj_type == PROJECTOR_TYPE_INTERNVL - || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2 + || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; // entries may have different token counts // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { + if (entries[i]->is_placeholder()) { + LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i); + return 1; + } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); ok = clip_image_encode( ctx_clip, @@ -1127,6 +1321,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const 
mtmd_image_tokens * image_tokens) offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { + if (image_tokens->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } ok = clip_image_batch_encode( ctx_clip, ctx->n_threads, @@ -1184,24 +1382,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) { mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data) { - mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = nx; - bitmap->ny = ny; - size_t data_size = (size_t)nx * ny * 3; - bitmap->data.resize(data_size); - std::memcpy(bitmap->data.data(), data, data_size); + mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny); return bitmap; } mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data) { - mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = n_samples; - bitmap->ny = 1; - bitmap->is_audio = true; - size_t data_size = n_samples * sizeof(float); - bitmap->data.resize(data_size); - std::memcpy(bitmap->data.data(), data, data_size); + mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, n_samples); + GGML_ASSERT(bitmap->is_audio); + if (!bitmap->is_placeholder()) { + GGML_ASSERT(bitmap->get_ro_buf().size() == n_samples * sizeof(float)); + } return bitmap; } @@ -1214,11 +1405,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) { } const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) { - return bitmap->data.data(); + return bitmap->get_ro_buf().data(); } size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { - return bitmap->data.size(); + return bitmap->get_ro_buf().size(); } bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { @@ -1512,14 +1703,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector img_buf; + img_buf.reserve(img_sz * img_sz); for (const auto & row : image) { - inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end()); + img_buf.insert(img_buf.end(), 
row.begin(), row.end()); } - LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny); + clip_image_f32 inp_image; + inp_image.set_size({img_sz, img_sz}, false, false); + inp_image.cpy_buf(img_buf); + LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz); mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image); } @@ -1529,16 +1722,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector & inpu return; } int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins; - clip_image_f32 inp_audio; - inp_audio.nx = input.size(); - inp_audio.ny = n_mel; - inp_audio.buf.resize(input.size() * n_mel); - for (size_t i = 0; i < input.size(); i++) { + const int audio_nx = (int)input.size(); + std::vector audio_buf(audio_nx * n_mel); + for (int i = 0; i < audio_nx; i++) { for (int j = 0; j < n_mel; j++) { - inp_audio.buf[j * inp_audio.nx + i] = input[i]; + audio_buf[j * audio_nx + i] = input[i]; } } - LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny); + clip_image_f32 inp_audio; + inp_audio.set_size({audio_nx, n_mel}, false, true); + inp_audio.cpy_buf(audio_buf); + LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel); mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio); } @@ -1548,9 +1742,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector return; } clip_image_u8 img_u8; - img_u8.nx = nx; - img_u8.ny = ny; - img_u8.buf = rgb_values; + img_u8.set_size({nx, ny}, false); + img_u8.cpy_buf(rgb_values); clip_image_f32_batch batch_f32; GGML_ASSERT(ctx->image_preproc != nullptr); bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32); @@ -1560,7 +1753,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector } LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size()); for (size_t i = 0; i < batch_f32.entries.size(); i++) { - LOG_INF("%s: entry %zu has nx=%d, 
ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny); + LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny()); // TODO: better way to dump entry content? } } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 5d518df799e..128fb18261b 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -133,9 +133,16 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); // if bitmap is image: // length of data must be nx * ny * 3 // the data is in RGBRGBRGB... format +// note: some video-capable models (i.e. qwen-vl) can merge consecutive bitmaps +// into one chunk, mtmd_tokenize() will automatically handle this // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) +// +// if data == nullptr: +// the bitmap is considered "empty", and will be treated as a placeholder for counting tokens +// you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens +// note: passing a placeholder bitmap to mtmd_encode() will return an error MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); diff --git a/tools/quantize/README.md b/tools/quantize/README.md index b8c225124b3..27384bebf69 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -5,62 +5,87 @@ Quantization reduces the precision of model weights (e.g., from 32-bit floats to This process however, may introduce some accuracy loss which is usually measured in [Perplexity](https://huggingface.co/docs/transformers/en/perplexity) (ppl) and/or [Kullback–Leibler Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) (kld). This can be minimized by using a suitable imatrix file. 
-You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup. +You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup. It syncs from llama.cpp `main` every 6 hours. -Note: It is synced from llama.cpp `main` every 6 hours. +## Overview -Example usage: +Quantization is done in two phases: +- Convert the original model to GGUF format. +- Quantize the converted GGUF file. -```./llama-quantize [options] input-model-f32.gguf [output-model-quant.gguf] type [threads]``` +If the model supports multimodal inputs (images or audio), you also need to convert and quantize the multimodal encoders and projectors. + +To perform these tasks, you need to install the Python requirements: ```bash -# from Hugginface, obtain the official meta-llama/Llama-3.1-8B model weights and place them in ./models -ls ./models -config.json model-00001-of-00004.safetensors model-00004-of-00004.safetensors README.md tokenizer.json -generation_config.json model-00002-of-00004.safetensors model.safetensors.index.json special_tokens_map.json USE_POLICY.md -LICENSE model-00003-of-00004.safetensors original tokenizer_config.json +python3 -m pip install -r requirements.txt +``` -# [Optional] for PyTorch .bin models like Mistral-7B -ls ./models - +Or if you use `uv`: -# install Python dependencies -python3 -m pip install -r requirements.txt +```bash +uv pip install -r requirements.txt --index-strategy unsafe-best-match +``` -# convert the model to ggml FP16 format -python3 convert_hf_to_gguf.py ./models/mymodel/ +## Prepare the input GGUF file -# quantize the model to 4-bits (using Q4_K_M method) -./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M +To convert a model from a Hugging Face repo, you can use a command like the following: -# update the gguf filetype to 
current version if older version is now unsupported -./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY ``` +python convert_hf_to_gguf.py --outfile gemma-4-E2B-it-bf16.gguf --outtype bf16 --remote google/gemma-4-E2B-it +``` + +Notes: +- In the usual case where the model is distributed in 16-bit format, `--outtype auto` (or omitting `--outtype` entirely) also works well. +- If you have previously downloaded the model locally, specify the directory and remove the `--remote` flag. +- For compatibility reasons, the Python requirements install transformers 4, but more and more models (like Gemma 4) require transformers 5. You can safely `pip install -U transformers` to get the latest version. + +## Quantize the GGUF -Run the quantized model: +After you have created a high-quality GGUF version of the model, you use `llama-quantize` to apply quantization. For example, quantize to `Q4_K_M` using a command like the following: ```bash -# start inference on a gguf model -./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant" +./build/bin/llama-quantize gemma-4-E2B-it-bf16.gguf gemma-4-E2B-it-Q4_K_M.gguf Q4_K_M ``` +Various quantization methods are described [later in this document](#quantize). + Options: -* `--allow-requantize` allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit -* `--leave-output-tensor` will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing -* `--pure` disables k-quant mixtures and quantizes all tensors to the same type -* `--imatrix` uses data in file generated by `llama-imatrix` as importance matrix for quant optimizations (highly recommended) -* `--include-weights` use an importance matrix for tensor(s) in the list. 
Cannot be used with `--include-weights` +* `--allow-requantize` allow requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit +* `--leave-output-tensor` leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing +* `--pure` disable k-quant mixtures and quantize all tensors to the same type +* `--imatrix file_name` use data in file_name as importance matrix for quant optimizations +* `--include-weights tensor_name` use importance matrix for this tensor (can be specified multiple times) +* `--exclude-weights tensor_name` use importance matrix for the tensors **not** specified (include/exclude cannot be mixed) * `--output-tensor-type` use a specific quant type for the output.weight tensor * `--token-embedding-type` use a specific quant type for the token embeddings tensor -* `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file +* `--keep-split` generate the quantized model in the same shards as the input file instead of a single quantized file Advanced options: * `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times. * `--prune-layers` prune (remove) the layers in the list -* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times +* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times. + +## (Optional) Convert the multimodal components + +llama.cpp will convert the LLM portion of the source model, which is enough for conversational applications.
If the model accepts multimodal inputs and you wish to take advantage of them, you need to create a separate GGUF file. This file is generically known as `mmproj`, for "multimodal projector"; however, it may contain various components such as vision or audio encoders in addition to projections. + +Multimodal components are usually much smaller than the LLMs they come with. In addition, their quality has a direct impact on the quality of LLM generations, because these components are in charge of preparing the inputs for the LLM: the closer inputs are to data seen during training, the better LLM results will be. + +For these reasons, multimodal components are usually kept in a high-quality format such as bf16 or q8. The impact on speed and memory from using a smaller quant is negligible, but overall quality could be impacted. + +```bash +python convert_hf_to_gguf.py --mmproj --outfile mmproj-gemma-4-E2B-it-Q8_0.gguf --outtype q8_0 --remote google/gemma-4-E2B-it +``` + +## Run the quantized model + + +```bash +./build/bin/llama cli -m ./gemma-4-E2B-it-Q4_K_M.gguf --mmproj ./mmproj-gemma-4-E2B-it-Q8_0.gguf --image --prompt "Describe this image" +``` -Examples: +## Quantization Examples ```bash # naive Q4_K_M quantization using default settings and 8 CPU threads.
Output will be "ggml-model-Q4_K_M.gguf" diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index fa7ee89dece..ce74adb4c12 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -2,6 +2,7 @@ #include "build-info.h" #include "common.h" +#include "imatrix-loader.h" #include "gguf.h" @@ -14,7 +15,6 @@ #include #include #include -#include #include #include @@ -80,11 +80,6 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; -// TODO: share with imatrix.cpp -static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; -static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; -static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; - static bool striequals(const char * a, const char * b) { while (*a && *b) { if (std::tolower(*a) != std::tolower(*b)) { @@ -183,184 +178,84 @@ static void usage(const char * executable) { exit(1); } -static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { - std::ifstream in(imatrix_file.c_str(), std::ios::binary); - if (!in) { - printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); - exit(1); - } - int n_entries; - in.read((char *)&n_entries, sizeof(n_entries)); - if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, imatrix_file.c_str()); - exit(1); - } - for (int i = 0; i < n_entries; ++i) { - int len; in.read((char *)&len, sizeof(len)); - std::vector name_as_vec(len+1); - in.read((char *)name_as_vec.data(), len); - if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str()); - exit(1); - } - name_as_vec[len] = 0; - std::string name{name_as_vec.data()}; - auto & e = 
imatrix_data[name]; - int ncall; - in.read((char *)&ncall, sizeof(ncall)); - int nval; - in.read((char *)&nval, sizeof(nval)); - if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n", __func__, i); - imatrix_data = {}; - exit(1); - } - e.resize(nval); - in.read((char *)e.data(), nval*sizeof(float)); - if (in.fail()) { - printf("%s: failed reading data for entry %d\n", __func__, i); - imatrix_data = {}; - exit(1); - } - if (ncall > 0) { - for (auto & v : e) { - v /= ncall; - } - } - - if (getenv("LLAMA_TRACE")) { - printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str()); - } - } - - // latest legacy imatrix version contains the dataset filename at the end of the file - int m_last_call = 0; - if (in.peek() != EOF) { - in.read((char *)&m_last_call, sizeof(m_last_call)); - int dataset_len; - in.read((char *)&dataset_len, sizeof(dataset_len)); - std::vector dataset_as_vec(dataset_len); - in.read(dataset_as_vec.data(), dataset_len); - imatrix_datasets.resize(1); - imatrix_datasets[0].assign(dataset_as_vec.begin(), dataset_as_vec.end()); - printf("%s: imatrix dataset='%s'\n", __func__, imatrix_datasets[0].c_str()); - } - printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); - return m_last_call; -} - static int load_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { - - struct ggml_context * ctx = nullptr; - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ false, // the data is needed - /* .ctx = */ &ctx, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params); - if (!ctx_gguf) { - fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str()); - return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data); - } - const 
int32_t n_entries = gguf_get_n_tensors(ctx_gguf); - if (n_entries < 1) { - fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str()); - gguf_free(ctx_gguf); - ggml_free(ctx); + common_imatrix loaded; + if (!common_imatrix_load(imatrix_file, loaded)) { + fprintf(stderr, "%s: failed to load imatrix from '%s'\n", __func__, imatrix_file.c_str()); exit(1); } - const int dataset_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); - const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT); - const int chunk_size_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE); - if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) { + if (!loaded.is_legacy && !loaded.has_metadata) { fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str()); - gguf_free(ctx_gguf); - ggml_free(ctx); exit(1); } - const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx); - - const std::string sums_suffix{ ".in_sum2" }; - const std::string counts_suffix{ ".counts" }; - - // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; - - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string name = cur->name; - - if (name.empty()) { continue; } + for (const auto & [name, entry] : loaded.entries) { + auto & e = imatrix_data[name]; + e.resize(entry.sums.size()); + + if (!loaded.is_legacy) { + // GGUF format: normalize by per-expert counts + const int64_t ncounts = entry.counts.size(); + const int64_t ne0 = (int64_t) entry.sums.size() / ncounts; + + for (int64_t j = 0; j < ncounts; ++j) { + const float count = (float) entry.counts[j]; + if (count > 0.0f) { + for (int64_t i = 0; i < ne0; ++i) { + e[j*ne0 + i] = entry.sums[j*ne0 + i] / count; + } + } else { + for (int64_t i = 0; i < ne0; ++i) { + e[j*ne0 + i] = 1; + } + } + } - if (string_remove_suffix(name, sums_suffix)) { - // in_sum2 - sums_counts_for[std::move(name)].first = cur; - } else if (string_remove_suffix(name, counts_suffix)) { - // counts - sums_counts_for[std::move(name)].second = cur; + if (getenv("LLAMA_TRACE")) { + float max_count = 0.0f; + for (int64_t j = 0; j < ncounts; ++j) { + const float count = (float) entry.counts[j]; + if (count > max_count) { + max_count = count; + } + } + printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", + __func__, int(e.size()), int(max_count), int(max_count / loaded.chunk_size), name.c_str()); + } } else { - // ignore other tensors - } - } - - for (const auto & sc : sums_counts_for) { - const std::string & name = sc.first; - const struct ggml_tensor * sums = sc.second.first; - const struct ggml_tensor * counts = sc.second.second; - - if (!sums || !counts) { - fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); - gguf_free(ctx_gguf); - ggml_free(ctx); - exit(1); - } - - const int64_t ne0 = sums->ne[0]; - const int64_t ne1 = sums->ne[1]; - - auto & e = imatrix_data[name]; - e.resize(ggml_nelements(sums)); - float max_count = 0.0f; - for (int64_t j = 
0; j < ne1; ++j) { - const float count = ((const float *) counts->data)[j]; - if (count > 0.0f) { - for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; + // Legacy format: sums contain (raw/count)*ncall, divide by ncall + const int64_t ncall = entry.counts.empty() ? 0 : entry.counts[0]; + if (ncall > 0) { + for (size_t i = 0; i < entry.sums.size(); ++i) { + e[i] = entry.sums[i] / ncall; } } else { - // Partial imatrix data, this tensor never got any input during calibration - for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = 1; + for (size_t i = 0; i < entry.sums.size(); ++i) { + e[i] = entry.sums[i]; } } - if (count > max_count) { - max_count = count; + + if (getenv("LLAMA_TRACE")) { + printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", + __func__, int(e.size()), int(ncall), name.c_str()); } } - if (getenv("LLAMA_TRACE")) { - printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str()); - } } - int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx); + imatrix_datasets = std::move(loaded.datasets); - int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx); - imatrix_datasets.reserve(n_datasets); - for (int64_t i = 0; i < n_datasets; ++i) { - imatrix_datasets.push_back(gguf_get_arr_str(ctx_gguf, dataset_idx, i)); - } - printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str()); - for (size_t i = 1; i < imatrix_datasets.size(); ++i) { - printf(", '%s'", imatrix_datasets[i].c_str()); + if (!imatrix_datasets.empty()) { + printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str()); + for (size_t i = 1; i < imatrix_datasets.size(); ++i) { + printf(", '%s'", imatrix_datasets[i].c_str()); + } + printf("]\n"); } - printf("]\n"); - - printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), 
imatrix_file.c_str(), m_last_chunk); - gguf_free(ctx_gguf); - ggml_free(ctx); + printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), loaded.chunk_count); - return m_last_chunk; + return loaded.chunk_count; } static int prepare_imatrix(const std::string & imatrix_file, diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 3c80de9fddd..420a1dc3a0e 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -717,10 +717,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) { return std::to_string(hash); } -server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) { +server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder) { mtmd::bitmaps bitmaps; for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder)); if (!bmp.ptr) { throw std::runtime_error("Failed to load image or audio file"); } diff --git a/tools/server/server-common.h b/tools/server/server-common.h index c6b6945a10e..693b07b4701 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -259,7 +259,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, size_t validate_utf8(const std::string& text); // process mtmd prompt, return the server_tokens containing both text tokens and media chunks -server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files); +// if is_placeholder is true, the media chunk will be treated as placeholder for counting tokens; the output tokens are not usable for actual inference (e.g. 
for submitting a task to server_queue) +server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder = false); /** * break the input "prompt" object into multiple prompt if needed, then tokenize them diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 74ce6dfa6bb..21b2cceeb85 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1,4 +1,3 @@ - #include "server-context.h" #include "server-chat.h" #include "server-common.h" @@ -16,6 +15,11 @@ #include "mtmd.h" #include "mtmd-helper.h" +#include "ggml-cpp.h" + +// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING] +#include "../../src/llama-ext.h" + #include #include #include @@ -884,7 +888,7 @@ struct server_context_impl { has_draft ? "draft model" : "MTP context", total / (1024.0 * 1024.0)); } catch (const std::exception & e) { - SRV_ERR("[spec] failed to measure %s memory: %s\n", + SRV_WRN("[spec] failed to measure %s memory: %s\n", has_draft ? "draft model" : "MTP context", e.what()); } } @@ -952,6 +956,7 @@ struct server_context_impl { const bool spec_mtp = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } @@ -962,15 +967,22 @@ struct server_context_impl { // note: for small models maybe we can set this to the maximum possible draft from all speculative types // the extra memory for small models is likely negligible? - cparams.n_rs_seq = 0; + cparams.n_rs_seq = 0; + cparams.ctx_other = ctx_tgt; + ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); if (params_base.speculative.dflash) { llama_set_dflash(ctx_tgt, model_dft.get()); } + // note: MTP target wiring uses cparams.ctx_other set before + // llama_init_from_model above — no explicit call needed here. 
+ (void) spec_mtp; + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), @@ -984,6 +996,7 @@ struct server_context_impl { cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; cparams_mtp.n_rs_seq = 0; cparams_mtp.n_outputs_max = params_base.n_parallel; + cparams_mtp.ctx_other = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { @@ -991,8 +1004,6 @@ struct server_context_impl { return false; } - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); - params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); } @@ -1080,6 +1091,10 @@ struct server_context_impl { } } + if (ctx_dft) { + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + } + if (spec) { SRV_INF("%s", "speculative decoding context initialized\n"); } else { @@ -2557,7 +2572,7 @@ struct server_context_impl { llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id)); if (use_ckpt_dft) { - slot.spec_ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + slot.spec_ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } slot.spec_prompt = slot.prompt.tokens.get_text_tokens(); @@ -2596,7 +2611,7 @@ struct server_context_impl { if (ctx_dft) { if (use_ckpt_dft) { - ckpt.load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } common_context_seq_rm(ctx_dft.get(), slot.id, ckpt.pos_max + 1, -1); @@ -2613,7 +2628,7 @@ struct server_context_impl { if (use_ckpt_tgt) { //const int64_t t_start = ggml_time_us(); - ckpt.update_tgt(ctx_tgt, slot.id, 
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); //const int64_t t_total = ggml_time_us() - t_start; //printf("checkpoint total: %f ms\n", t_total / 1000.0); @@ -2625,7 +2640,7 @@ struct server_context_impl { } if (use_ckpt_dft) { - ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } } } @@ -2837,8 +2852,11 @@ struct server_context_impl { llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); + // ref: https://github.com/ggml-org/llama.cpp/pull/24110 + const bool has_new_tokens = (n_past < slot.task->n_tokens()); + // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); + const auto pos_min_thold = std::max(0, pos_next - n_swa - (has_new_tokens ? 0 : 1)); if (n_past > 0 && n_past <= slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); @@ -3045,10 +3063,11 @@ struct server_context_impl { continue; } - if (ctx_dft) { + if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) { // TODO: in the future, figure out how to infuse target embeddings to the images // for now, we skip this for simplicity // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? 
+ // [TAG_MTMD_DRAFT_PROCESSING] res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); if (res != 0) { GGML_ABORT("failed to process multi-modal data on draft context\n"); @@ -3518,13 +3537,13 @@ struct server_context_impl { SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size()); { - ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); common_context_seq_rm(slot.ctx_tgt, slot.id, ckpt.pos_max + 1, -1); } if (slot.ctx_dft) { - ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); common_context_seq_rm(slot.ctx_dft, slot.id, ckpt.pos_max + 1, -1); } @@ -4404,6 +4423,10 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_CHAT); }; + this->post_chat_completions_tok = [this](const server_http_req & req) { + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT); + }; + this->post_control = [this](const server_http_req & req) { auto res = create_response(); const json body = json::parse(req.body); @@ -4459,6 +4482,10 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_RESP); }; + this->post_responses_tok_oai = [this](const server_http_req & req) { + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP); + }; + this->post_transcriptions_oai = [this](const server_http_req & req) { auto res = create_response(); @@ -4506,20 +4533,7 @@ void server_routes::init_routes() { }; this->post_anthropic_count_tokens = [this](const server_http_req & req) { - auto res = create_response(); - std::vector files; - json body = 
server_chat_convert_anthropic_to_oai(json::parse(req.body)); - SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); - SRV_DBG("converted request: %s\n", body.dump().c_str()); - json body_parsed = oaicompat_chat_params_parse( - body, - meta->chat_params, - files); - - json prompt = body_parsed.at("prompt"); - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true); - res->ok({{"input_tokens", static_cast(tokens.size())}}); - return res; + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_ANTHROPIC); }; // same with handle_chat_completions, but without inference part @@ -4999,3 +5013,54 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons res->ok(root); return res; } + +std::unique_ptr server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) { + auto res = create_response(); + std::vector files; + json body = json::parse(req.body); + bool is_oai = false; + + switch (res_type) { + case TASK_RESPONSE_TYPE_OAI_CHAT: + { + is_oai = true; + } break; + case TASK_RESPONSE_TYPE_OAI_RESP: + { + is_oai = true; + body = server_chat_convert_responses_to_chatcmpl(body); + } break; + case TASK_RESPONSE_TYPE_ANTHROPIC: + { + body = server_chat_convert_anthropic_to_oai(body); + } break; + default: + res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json body_parsed = oaicompat_chat_params_parse( + body, + meta->chat_params, + files); + json prompt = body_parsed.at("prompt"); + // SRV_DBG("prompt = %s\n", prompt.dump().c_str()); + + // TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places + size_t n_tokens; + if (mctx != nullptr) { + if (!prompt.is_string()) { + throw std::runtime_error("for mtmd, input prompt must be a string."); + } + n_tokens = process_mtmd_prompt(mctx, prompt.get(), files, true).size(); + } else { + 
n_tokens = tokenize_mixed(vocab, prompt, true, true).size(); + } + + json response = {{"input_tokens", static_cast(n_tokens)}}; + if (is_oai) { + response["object"] = "response.input_tokens"; + } + res->ok(response); + return res; +} diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 73caff54a46..72a1f40e014 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -110,8 +110,10 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_chat_completions_tok; server_http_context::handler_t post_control; server_http_context::handler_t post_responses_oai; + server_http_context::handler_t post_responses_tok_oai; server_http_context::handler_t post_transcriptions_oai; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; @@ -139,6 +141,7 @@ struct server_routes { std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot); std::unique_ptr handle_slots_erase(const server_http_req &, int id_slot); std::unique_ptr handle_embeddings_impl(const server_http_req & req, task_response_type res_type); + std::unique_ptr handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type); // using unique_ptr to allow late initialization of const std::unique_ptr meta; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 49de7eecc33..38aca29226f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -167,6 +167,8 @@ int llama_server(int argc, char ** argv) { routes.post_tokenize = models_routes->proxy_post; routes.post_detokenize = models_routes->proxy_post; routes.post_apply_template = models_routes->proxy_post; + routes.post_chat_completions_tok = models_routes->proxy_post; + 
routes.post_responses_tok_oai = models_routes->proxy_post; routes.get_lora_adapters = models_routes->proxy_get; routes.post_lora_adapters = models_routes->proxy_post; routes.get_slots = models_routes->proxy_get; @@ -198,7 +200,6 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API - ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting ctx_http.post("/infill", ex_wrapper(routes.post_infill)); ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); @@ -210,6 +211,12 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); + // token counting + ctx_http.post("/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok)); + ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok)); + ctx_http.post("/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai)); + ctx_http.post("/v1/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai)); + ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting // LoRA adapters hotswap ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index f80e46133c7..fe55dc5ab17 100644 --- 
a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices(): for choice in res.body["choices"]: assert "assistant" == choice["message"]["role"] assert choice["finish_reason"] == "length" + + +def test_chat_completions_token_count(): + global server + server.start() + # make sure cache can be reused across multiple choices and multiple requests + # ref: https://github.com/ggml-org/llama.cpp/pull/18663 + for _ in range(2): + res = server.make_request("POST", "/chat/completions/input_tokens", data={ + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + }) + assert res.status_code == 200 + assert res.body["input_tokens"] > 5 diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py index fb77084c89b..d74cc3a43ed 100644 --- a/tools/server/tests/unit/test_vision_api.py +++ b/tools/server/tests/unit/test_vision_api.py @@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content): assert res.status_code != 200 +def test_vision_chat_completion_token_count(): + global server + server.start() + res = server.make_request("POST", "/chat/completions/input_tokens", data={ + "temperature": 0.0, + "top_k": 1, + "messages": [ + {"role": "user", "content": [ + {"type": "text", "text": "What is this:"}, + {"type": "image_url", "image_url": { + "url": get_img_url("IMG_URL_0"), + }}, + ]}, + ], + }) + assert res.status_code == 200 + assert res.body["input_tokens"] > 10 + + @pytest.mark.parametrize( "prompt, image_data, success, re_content", [ diff --git a/tools/ui/src/lib/hooks/use-throttle.svelte.ts b/tools/ui/src/lib/hooks/use-throttle.svelte.ts new file mode 100644 index 00000000000..0795519787b --- /dev/null +++ b/tools/ui/src/lib/hooks/use-throttle.svelte.ts @@ -0,0 +1,32 @@ +/** + * Creates a reactive throttle key that increments when 
`getValue()` changes + * and the throttle window has elapsed since the last increment. + * + * Useful for throttling animations that should not fire on every rapid update. + * + * @param getValue - A reactive getter for the value to watch + * @param ms - Throttle window in milliseconds + * @returns A reactive number that increments when the throttled value changes + */ +export function useThrottle(getValue: () => string | undefined, ms: number) { + let key = $state(0); + let throttleEnd = $state(0); + let lastValue: string | undefined = getValue(); + + $effect(() => { + const value = getValue(); + if (value === lastValue) return; + const now = Date.now(); + if (now >= throttleEnd) { + lastValue = value; + key++; + throttleEnd = now + ms; + } + }); + + return { + get key() { + return key; + } + }; +} diff --git a/tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte b/tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte new file mode 100644 index 00000000000..20f5e057b0c --- /dev/null +++ b/tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte @@ -0,0 +1,34 @@ + + + { + const before = await canvas.findByRole('button', { name: 'before' }); + const target = await canvas.findByRole('button', { name: 'Copy' }); + + before.focus(); + await userEvent.tab(); + + await expect(target).toHaveFocus(); + }} +> +
+ + {}} /> +
+
diff --git a/tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte b/tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte new file mode 100644 index 00000000000..4aaf60cd656 --- /dev/null +++ b/tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte @@ -0,0 +1,50 @@ + + + { + const reading = await canvas.findByRole('button', { name: 'Reading' }); + const generation = await canvas.findByRole('button', { name: 'Generation' }); + const tools = await canvas.findByRole('button', { name: 'Tools' }); + const summary = await canvas.findByRole('button', { name: 'Summary' }); + + reading.focus(); + await expect(reading).toHaveFocus(); + + await userEvent.tab(); + await expect(generation).toHaveFocus(); + + await userEvent.tab(); + await expect(tools).toHaveFocus(); + + await userEvent.tab(); + await expect(summary).toHaveFocus(); + }} +/> diff --git a/tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte b/tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte new file mode 100644 index 00000000000..937d7ab1094 --- /dev/null +++ b/tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte @@ -0,0 +1,69 @@ + + + { + const before = await canvas.findByRole('button', { name: 'before' }); + const after = await canvas.findByRole('button', { name: 'after' }); + const leftArrow = await canvas.findByRole('button', { name: 'Scroll left' }); + + await waitFor(() => { + expect(leftArrow).toBeDisabled(); + }); + + before.focus(); + await userEvent.tab(); + + await expect(after).toHaveFocus(); + }} +> +
+ + +
+
+
+ +
+
+ + { + const before = await canvas.findByRole('button', { name: 'before' }); + const rightArrow = await canvas.findByRole('button', { name: 'Scroll right' }); + + await waitFor(() => { + expect(rightArrow).not.toBeDisabled(); + }); + + before.focus(); + await userEvent.tab(); + + await expect(rightArrow).toHaveFocus(); + }} +> +
+ + + {#each [...Array(20).keys()] as i (i)} +
{i}
+ {/each} +
+
+
diff --git a/tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte b/tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte new file mode 100644 index 00000000000..1fc42608f72 --- /dev/null +++ b/tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte @@ -0,0 +1,36 @@ + + + { + const row = await canvas.findByRole('button', { name: /Forked Conversation/ }); + const forkIcon = await canvas.findByRole('link'); + + row.focus(); + await userEvent.tab(); + + await expect(forkIcon).toHaveFocus(); + }} +/>