From 4d742877b2631bd9094bc7603bc59b65940563e2 Mon Sep 17 00:00:00 2001 From: Gerard Martinez Date: Thu, 4 Jun 2026 03:58:25 -0700 Subject: [PATCH 01/71] build : use umbrella Headers directory for XCFramework module map (#23974) The XCFramework generated by build-xcframework.sh creates a module map that manually lists public headers. That list can fall out of sync with the framework's Headers directory. The module map is currently missing ggml-opt.h, which is present in the framework headers. This can cause downstream Apple builds to fail with: Include of non-modular header inside framework module 'llama' Use the framework's Headers directory itself as the module map umbrella instead of maintaining a manual header list. This makes all public headers under the generated framework's Headers directory part of the llama module. --- build-xcframework.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/build-xcframework.sh b/build-xcframework.sh index 5d289922a84..180c01a88e9 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -130,14 +130,7 @@ setup_framework_structure() { # Create module map (common for all platforms) cat > ${module_path}module.modulemap << EOF framework module llama { - header "llama.h" - header "ggml.h" - header "ggml-alloc.h" - header "ggml-backend.h" - header "ggml-metal.h" - header "ggml-cpu.h" - header "ggml-blas.h" - header "gguf.h" + umbrella "Headers" link "c++" link framework "Accelerate" From 4586479852e40f0ce8d4a38b8ec3b98c042d5a04 Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 4 Jun 2026 13:09:49 +0200 Subject: [PATCH 02/71] webui: fix tool selector toggle/counter, key tools by stable identity (#24065) * webui: fix tool selector toggle/counter, key tools by stable identity Key the disabled set, counts and toggles by a stable per-tool key instead of bare function name, deduped from one canonical list. 
Per-tool checkboxes become presentational (single row handler, no nested button), category checkboxes drop the tristate (n/total carries partial). One getEnabledToolsForLLM keeps normalized MCP schemas and dedupes by name. * ui: use SvelteSet and SvelteMap for local tool collections to satisfy svelte/prefer-svelte-reactivity --- .../ChatFormActionAddSheet.svelte | 3 +- .../ChatFormActionAddToolsSubmenu.svelte | 39 +- .../SettingsChat/SettingsChatToolsTab.svelte | 18 +- tools/ui/src/lib/constants/storage.ts | 3 + .../src/lib/hooks/use-tools-panel.svelte.ts | 27 +- tools/ui/src/lib/stores/tools.svelte.ts | 349 ++++++++---------- tools/ui/src/lib/types/tools.d.ts | 4 +- 7 files changed, 203 insertions(+), 240 deletions(-) diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte index 9adb9eb89d8..c4069163f61 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte @@ -231,7 +231,7 @@
{#each toolsPanel.activeGroups as group (group.label)} - {@const { checked, indeterminate } = toolsPanel.getGroupCheckedState(group)} + {@const checked = toolsPanel.isGroupChecked(group)} {@const enabledCount = toolsPanel.getEnabledToolCount(group)} {@const favicon = toolsPanel.getFavicon(group)} @@ -259,7 +259,6 @@ e.stopPropagation()} onCheckedChange={() => toolsPanel.toggleGroupByLabel(group.label)} diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte index 813227fbce0..9a5b0cbe862 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte @@ -1,5 +1,5 @@ - - - -

Reading (prompt processing)

-
- - {/if} - - +{#snippet viewButton(opts: { + view: ChatMessageStatsView; + icon: Component; + label: string; + tooltipText: string; + disabled?: boolean; +})} + {@const IconComponent = opts.icon} + + + + {#snippet child({ props })} - + {/snippet} + - -

- {isGenerationDisabled - ? 'Generation (waiting for tokens...)' - : 'Generation (token output)'} -

-
-
+ +

{opts.tooltipText}

+
+ +{/snippet} - {#if hasAgenticStats} - - - - + {@render viewButton({ + view: ChatMessageStatsView.GENERATION, + icon: Sparkles, + label: 'Generation', + tooltipText: isGenerationDisabled + ? 'Generation (waiting for tokens...)' + : 'Generation (token output)', + disabled: isGenerationDisabled + })} - -

Tool calls

-
-
+ {#if hasAgenticStats} + {@render viewButton({ + view: ChatMessageStatsView.TOOLS, + icon: Wrench, + label: 'Tools', + tooltipText: 'Tool calls' + })} {#if !hideSummary} - - - - - - -

Agentic summary

-
-
+ {@render viewButton({ + view: ChatMessageStatsView.SUMMARY, + icon: Layers, + label: 'Summary', + tooltipText: 'Agentic summary' + })} {/if} {/if}
diff --git a/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte index eea7da7b2f1..db7d01690a5 100644 --- a/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte @@ -21,13 +21,16 @@ {#if tooltipLabel} - - {#snippet icon()} - - {/snippet} + + {#snippet child({ props })} + + {#snippet icon()} + + {/snippet} - {value} - + {value} + + {/snippet}

{tooltipLabel}

diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte index c43bee3e3c3..a22c491adac 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte @@ -41,16 +41,13 @@ }); -
+
+ {/snippet} + - {#if ms.updating} - + {#if selectedOption} + +

{selectedOption.model}

+
{/if} - + {/if} {/if}
diff --git a/tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte b/tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte index 83d856d10ea..951831149fc 100644 --- a/tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte +++ b/tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte @@ -34,24 +34,28 @@ - e.stopPropagation()} - > - {#if triggerTooltip} - - + + + + {#snippet child({ props })} + e.stopPropagation()} + > {@render iconComponent(triggerIcon, 'h-3 w-3')} - {triggerTooltip} - - -

{triggerTooltip}

-
-
- {:else} - {@render iconComponent(triggerIcon, 'h-3 w-3')} + {#if triggerTooltip} + {triggerTooltip} + {/if} +
+ {/snippet} + + {#if triggerTooltip} + +

{triggerTooltip}

+
{/if} - +
{#each actions as action, index (action.label)} diff --git a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte index dad8d954cbb..e38a937385a 100644 --- a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte +++ b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte @@ -105,6 +105,12 @@ onclick={handleSelect} onmouseover={handleMouseOver} onmouseleave={handleMouseLeave} + onfocusin={handleMouseOver} + onfocusout={(e) => { + if (!e.currentTarget.contains(e.relatedTarget as Node | null)) { + handleMouseLeave(); + } + }} >
0} - - - + + {#snippet child({ props })} + + + + {/snippet} @@ -195,7 +205,8 @@ opacity: 0; } - &:is(:hover) :global([data-slot='dropdown-menu-trigger']) { + &:is(:hover) :global([data-slot='dropdown-menu-trigger']), + &:focus-within :global([data-slot='dropdown-menu-trigger']) { opacity: 1; } @media (max-width: 768px) { diff --git a/tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte b/tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte new file mode 100644 index 00000000000..20f5e057b0c --- /dev/null +++ b/tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte @@ -0,0 +1,34 @@ + + + { + const before = await canvas.findByRole('button', { name: 'before' }); + const target = await canvas.findByRole('button', { name: 'Copy' }); + + before.focus(); + await userEvent.tab(); + + await expect(target).toHaveFocus(); + }} +> +
+ + {}} /> +
+
diff --git a/tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte b/tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte new file mode 100644 index 00000000000..4aaf60cd656 --- /dev/null +++ b/tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte @@ -0,0 +1,50 @@ + + + { + const reading = await canvas.findByRole('button', { name: 'Reading' }); + const generation = await canvas.findByRole('button', { name: 'Generation' }); + const tools = await canvas.findByRole('button', { name: 'Tools' }); + const summary = await canvas.findByRole('button', { name: 'Summary' }); + + reading.focus(); + await expect(reading).toHaveFocus(); + + await userEvent.tab(); + await expect(generation).toHaveFocus(); + + await userEvent.tab(); + await expect(tools).toHaveFocus(); + + await userEvent.tab(); + await expect(summary).toHaveFocus(); + }} +/> diff --git a/tools/ui/tests/stories/ChatScreenForm.a11y.stories.svelte b/tools/ui/tests/stories/a11y/ChatScreenForm.a11y.stories.svelte similarity index 100% rename from tools/ui/tests/stories/ChatScreenForm.a11y.stories.svelte rename to tools/ui/tests/stories/a11y/ChatScreenForm.a11y.stories.svelte diff --git a/tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte b/tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte new file mode 100644 index 00000000000..937d7ab1094 --- /dev/null +++ b/tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte @@ -0,0 +1,69 @@ + + + { + const before = await canvas.findByRole('button', { name: 'before' }); + const after = await canvas.findByRole('button', { name: 'after' }); + const leftArrow = await canvas.findByRole('button', { name: 'Scroll left' }); + + await waitFor(() => { + expect(leftArrow).toBeDisabled(); + }); + + before.focus(); + await userEvent.tab(); + + await expect(after).toHaveFocus(); + }} +> +
+ + +
+
+
+ +
+
+ + { + const before = await canvas.findByRole('button', { name: 'before' }); + const rightArrow = await canvas.findByRole('button', { name: 'Scroll right' }); + + await waitFor(() => { + expect(rightArrow).not.toBeDisabled(); + }); + + before.focus(); + await userEvent.tab(); + + await expect(rightArrow).toHaveFocus(); + }} +> +
+ + + {#each [...Array(20).keys()] as i (i)} +
{i}
+ {/each} +
+
+
diff --git a/tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte b/tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte new file mode 100644 index 00000000000..1fc42608f72 --- /dev/null +++ b/tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte @@ -0,0 +1,36 @@ + + + { + const row = await canvas.findByRole('button', { name: /Forked Conversation/ }); + const forkIcon = await canvas.findByRole('link'); + + row.focus(); + await userEvent.tab(); + + await expect(forkIcon).toHaveFocus(); + }} +/> From 260862b8ca2c9a652c297488c623997c492310cf Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 4 Jun 2026 18:23:48 +0200 Subject: [PATCH 12/71] arg: fix double mtp downloads (#24128) --- common/arg.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index f53b4798105..1ffaf704858 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -446,6 +446,12 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) opts.download_mtp = spec_type_draft_mtp; opts.download_mmproj = !params.no_mmproj; + // sub-models (draft, mmproj, vocoder) are explicitly specified by the user, + // so we should not auto-discover mtp/mmproj siblings for them + common_download_opts sub_opts = opts; + sub_opts.download_mtp = false; + sub_opts.download_mmproj = false; + try { auto res = common_params_handle_model(params.model, opts); if (params.no_mmproj) { @@ -457,7 +463,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) // only download mmproj if the current example is using it for (const auto & ex : mmproj_examples) { if (curr_ex == ex) { - common_params_handle_model(params.mmproj, opts); + common_params_handle_model(params.mmproj, sub_opts); break; } } @@ -470,8 +476,8 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) params.speculative.draft.mparams.url.empty()) 
{ params.speculative.draft.mparams.path = res.mtp.path; } - common_params_handle_model(params.speculative.draft.mparams, opts); - common_params_handle_model(params.vocoder.model, opts); + common_params_handle_model(params.speculative.draft.mparams, sub_opts); + common_params_handle_model(params.vocoder.model, sub_opts); return true; } catch (const common_skip_download_exception &) { return false; From 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Jun 2026 19:30:59 +0300 Subject: [PATCH 13/71] server : disable on-device spec checkpoints (#24108) --- examples/speculative-simple/speculative-simple.cpp | 10 +++++----- tools/server/server-context.cpp | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 5325bcc9e3f..d87ba48beb1 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -175,7 +175,7 @@ int main(int argc, char ** argv) { llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), seq_id)); if (use_ckpt_dft) { - ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } // generate a new draft @@ -196,12 +196,12 @@ int main(int argc, char ** argv) { // this allows us to restore the state if partial draft acceptance occurs if (!draft.empty()) { if (use_ckpt_tgt) { - ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } } { - ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), 
seq_id, ckpt.pos_max + 1, -1); } @@ -261,13 +261,13 @@ int main(int argc, char ** argv) { draft = std::move(ids); { - ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, ckpt.pos_max + 1, -1); } { - ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1); } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 28f738c3feb..ab0d5944763 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2512,7 +2512,7 @@ struct server_context_impl { llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id)); if (use_ckpt_dft) { - slot.spec_ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + slot.spec_ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } slot.spec_prompt = slot.prompt.tokens.get_text_tokens(); @@ -2551,7 +2551,7 @@ struct server_context_impl { if (ctx_dft) { if (use_ckpt_dft) { - ckpt.load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } common_context_seq_rm(ctx_dft.get(), slot.id, ckpt.pos_max + 1, -1); @@ -2568,7 +2568,7 @@ struct server_context_impl { if (use_ckpt_tgt) { //const int64_t t_start = ggml_time_us(); - ckpt.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); //const int64_t t_total = ggml_time_us() - t_start; //printf("checkpoint total: %f ms\n", t_total / 1000.0); @@ -2580,7 
+2580,7 @@ struct server_context_impl { } if (use_ckpt_dft) { - ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } } } @@ -3447,13 +3447,13 @@ struct server_context_impl { SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size()); { - ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); common_context_seq_rm(slot.ctx_tgt, slot.id, ckpt.pos_max + 1, -1); } if (slot.ctx_dft) { - ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); common_context_seq_rm(slot.ctx_dft, slot.id, ckpt.pos_max + 1, -1); } From 7fe2ae45ab644c5b9b5740dff5068442f64fabce Mon Sep 17 00:00:00 2001 From: Mason Milburn Date: Fri, 5 Jun 2026 01:10:31 -0400 Subject: [PATCH 14/71] sycl : port multi-column MMVQ from CUDA backend (#21845) mmvq: Port the ncols_dst optimization from ggml-cuda/mmvq.cu to SYCL. Read weights once per dispatch instead of once per column. Covers all standard quant types + reorder paths for Q4_0, Q8_0, Q3_K, Q4_K, Q5_K, Q6_K. IQ types (except IQ4_XS) excluded due to incompatible vec_dot signatures. ggml-sycl: The weight reorder was only bootstrapped on single-token mat-vec (ne[1] == 1). Speculative / MTP verify issues only multi-column mat-vec, so it never triggered the reorder and ran on the slower non-reorder kernel. Bootstrap it on small multi-column batches (ne[1] <= 8) too. 
--- ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +- ggml/src/ggml-sycl/mmvq.cpp | 1118 +++++++++++++++++++++++++++++- 2 files changed, 1095 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 96138f57ebe..3f246e8672d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3971,7 +3971,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf. dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases. - dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1; + // ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder; + // all reorderable types have a _switch_ncols kernel. + dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1; } static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */, diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index abd1e49a70e..cf2b59576aa 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -56,6 +56,65 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r } } +template +static void mul_mat_vec_q_reorder_ncols(const void * __restrict__ vx, const void * __restrict__ vy, + float * __restrict__ dst, const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + const sycl::nd_item<3> & nd_item) { + using block_type = ggml_sycl_reordered::block_q_t; + using block_traits = typename block_type::traits; + + const auto sg = nd_item.get_sub_group(); + const int sg_range = sg.get_group_linear_range(); + const int workgroup_id = nd_item.get_group_linear_id(); + const int sg_id = 
sg.get_group_linear_id(); + const int row = workgroup_id * sg_range + sg_id; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / block_traits::qk; + constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi); + constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq; + const int nblocks = nrows * (ncols / block_traits::qk); + + static_assert(blocks_per_subgroup > 0); + static_assert(block_elements_per_subgroup > 0); + + float partial_sum[ncols_dst] = {0.0f}; + for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) { + const int ibx = row * blocks_per_row + i; + + const auto bx_offset = block_type::get_block_offset(ibx, nblocks); + const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx); + const int iby = i * block_type::block_to_q8_1_ratio(); + +#pragma unroll + for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) { + const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup); + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const char * vy_j = (const char *)vy + j * stride_col_y_bytes; + const int8_t * q8_1_quant_ptr = (const int8_t *)vy_j + iby * QK8_1; + const sycl::half2* q8_1_ds_ptr = (const sycl::half2 *)(vy_j + ncols + iby * sizeof(sycl::half2)); + + partial_sum[j] += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs); + } + } + } + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + float sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum[j], std::plus<>()); + + if (sg.leader()) { + dst[j * stride_col_dst + row] = sum; + } + } +} + template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) { @@ -100,6 +159,70 @@ static void 
mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ } } +template +static void mul_mat_vec_q_ncols( + const void * __restrict__ vx, + const void * __restrict__ vy, + float * __restrict__ dst, + const int ncols, + const int nrows, + const int stride_col_y, + const int stride_col_dst, + const sycl::nd_item<3> & item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; + + // partial sums: one per output column + float tmp[ncols_dst] = {0.0f}; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); + i < blocks_per_row; + i += blocks_per_warp) { + + const int ibx = row * blocks_per_row + i; + const int iby = i * (qk / QK8_1); + + // read weight block once, dot against all columns + for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) { + const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr)); + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + tmp[j] += vec_dot_q_sycl(&x[ibx], &y[j * stride_col_y + iby], iqs); + } + } + } + + // reduce within subgroup +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp[j] += dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), tmp[j], mask); + } + } + + if (item_ct1.get_local_id(2) == 0) { +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + dst[j * stride_col_dst + row] = tmp[j]; + } + } +} + template static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, const void *__restrict__ vy, @@ -553,6 +676,45 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols( + const void * vx, const void * 
vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q4_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: 
reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_0 reorder multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK4_0 == 0); @@ -571,6 +733,45 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * } } +template +static void mul_mat_vec_q4_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q4_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q4_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q4_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q4_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: 
mul_mat_vec_q4_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q4_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q4_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q4_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -595,6 +796,45 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q4_1_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q4_1_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q4_1_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q4_1_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + 
case 3: mul_mat_vec_q4_1_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q4_1_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q4_1_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q4_1_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q4_1_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q4_1_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_1 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_MXFP4 == 0); @@ -613,6 +853,45 @@ static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float } } +template +static void mul_mat_vec_mxfp4_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_MXFP4 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_mxfp4_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, 
const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_mxfp4_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_mxfp4_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_mxfp4_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_mxfp4_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_mxfp4_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_mxfp4_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_mxfp4_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_mxfp4_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for MXFP4 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_nvfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_NVFP4 == 0); @@ -631,6 +910,45 @@ static void mul_mat_vec_nvfp4_q8_1_sycl(const void * vx, const void * vy, float } } +template +static void mul_mat_vec_nvfp4_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_NVFP4 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, 
block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_nvfp4_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_nvfp4_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_nvfp4_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_nvfp4_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_nvfp4_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_nvfp4_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_nvfp4_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_nvfp4_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_nvfp4_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for NVFP4 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -655,6 +973,45 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q5_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + 
GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q5_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q5_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q5_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q5_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q5_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q5_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q5_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q5_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q5_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -679,6 +1036,45 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, } } +template +static void 
mul_mat_vec_q5_1_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q5_1_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q5_1_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q5_1_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q5_1_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q5_1_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q5_1_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q5_1_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q5_1_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q5_1_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_1 multi-col MMVQ", ncols_dst); 
+ } +} + static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK8_0 == 0); @@ -698,6 +1094,45 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q8_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: 
reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q8_0 reorder multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -722,6 +1157,45 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q8_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q8_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q8_0_q8_1_sycl_ncols<2>(vx, vy, 
dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q8_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q8_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q8_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q8_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q8_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q8_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q8_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -746,6 +1220,45 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q2_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q2_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int 
ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q2_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q2_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q2_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q2_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q2_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q2_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q2_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q2_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q2_K multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -790,6 +1303,85 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + 
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q3_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q3_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q3_K reorder multi-col MMVQ", ncols_dst); + } +} + +template +static void mul_mat_vec_q3_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / 
GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q3_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q3_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q3_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q3_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q3_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q3_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q3_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q3_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q3_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q3_K multi-col MMVQ", ncols_dst); + } +} + + static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -814,6 +1406,51 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q4_K_q8_1_sycl_ncols( + const 
void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q4_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q4_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q4_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q4_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q4_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q4_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q4_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q4_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q4_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_K multi-col MMVQ", ncols_dst); + } +} + static void 
reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); @@ -834,6 +1471,44 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q4_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q4_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: 
reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q4_K reorder multi-col MMVQ", ncols_dst); + } +} static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, @@ -859,6 +1534,51 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q5_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q5_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q5_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q5_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, 
nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q5_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q5_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q5_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q5_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q5_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q5_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_K multi-col MMVQ", ncols_dst); + } +} + static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); @@ -879,6 +1599,45 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, }); } +template +static void reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, 
stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q5_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q5_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q5_K reorder multi-col MMVQ", ncols_dst); + } +} + static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); @@ -897,6 +1656,46 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, }); }); } + +template +static void reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { 
+ GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder_ncols, ncols_dst>( + vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); + }); + }); +} + +static void reorder_mul_mat_vec_q6_k_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y_bytes, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: reorder_mul_mat_vec_q6_k_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 3: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 4: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 5: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 6: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 7: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + case 8: reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, stream); break; + default: 
GGML_ABORT("unsupported ncols_dst=%d for Q6_K reorder multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -921,6 +1720,51 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_q6_K_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q6_K_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q6_K_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q6_K_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q6_K_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q6_K_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q6_K_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q6_K_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, 
stream); break; + case 7: mul_mat_vec_q6_K_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q6_K_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q6_K multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, @@ -1117,6 +1961,51 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, } } +template +static void mul_mat_vec_iq4_xs_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, + stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_iq4_xs_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_iq4_xs_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, 
stride_col_dst, stream); break; + case 5: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_iq4_xs_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for IQ4_XS multi-col MMVQ", ncols_dst); + } +} + void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, @@ -1143,42 +2032,135 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q4_0: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n"); - reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n"); + reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const 
int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q4_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n"); mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_1_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q4_1_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q5_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i 
== 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_1_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q5_1_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q8_0: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n"); - reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n"); + reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling 
mul_mat_vec_q8_0_q8_1_sycl\n"); mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q2_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q2_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q3_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, - stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q3_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl_switch_ncols ncols=%d\n", 
(int)src1_ncols); + mul_mat_vec_q3_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl\n"); mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1186,9 +2168,27 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q4_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q4_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q4_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n"); mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, 
dst_dd_i_bs, ne00, row_diff, stream); } @@ -1196,9 +2196,27 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q5_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q5_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q5_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n"); mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1206,9 +2224,27 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q6_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - GGML_SYCL_DEBUG("Calling 
reorder_mul_mat_vec_q6_k_q8_1_sycl\n"); - reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - } else { + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y_bytes = src1_padded_col_size * q8_1_ts / q8_1_bs; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + reorder_mul_mat_vec_q6_k_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y_bytes, stride_col_dst, stream); + return; + } else { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + } else if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_K_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q6_K_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n"); mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } @@ -1238,13 +2274,43 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_iq4_xs_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + 
mul_mat_vec_iq4_xs_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_MXFP4: - mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_mxfp4_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_mxfp4_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_NVFP4: - mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_nvfp4_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_nvfp4_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; default: GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type)); From 46fa662b1f4cd3f00d774512cd50044b6d17bc2c Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 5 Jun 2026 07:57:36 +0200 Subject: [PATCH 15/71] ci : build-msys job slimming [no ci] (#24157) This PR attempts to slim down the dependencies for build-msys 
jobs making the same changes that we applied in whisper.cpp to reduce the size of the github actions cache, and should also improve the run time due to fewer dependencies that need to be installed. I realize this is a scheduled job but I think it would still make sense to apply these changes. Refs: https://github.com/ggml-org/whisper.cpp/pull/3858 --- .github/workflows/build-msys.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-msys.yml b/.github/workflows/build-msys.yml index c2633c151a5..15c55cf12cc 100644 --- a/.github/workflows/build-msys.yml +++ b/.github/workflows/build-msys.yml @@ -27,8 +27,8 @@ jobs: fail-fast: false matrix: include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } + - { sys: UCRT64, env: ucrt-x86_64, compiler: gcc, build: Release } + - { sys: CLANG64, env: clang-x86_64, compiler: clang, build: Release } steps: - name: Clone @@ -48,9 +48,7 @@ jobs: update: true msystem: ${{matrix.sys}} install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain + mingw-w64-${{matrix.env}}-${{matrix.compiler}} mingw-w64-${{matrix.env}}-cmake mingw-w64-${{matrix.env}}-openblas From 2154a0fdcf3a28d38038cd12a8b26fca724e9485 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Fri, 5 Jun 2026 08:37:34 +0200 Subject: [PATCH 16/71] CUDA: enroll mul_mat_vec_q_moe into pdl (#24087) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Enroll mul_mat_vec_q_moe into PDL, boosting MTP performance on BW Data collected on a B4500: Before ``` (llama.cpp) ➜ llama.cpp git:(master) ✗ python mtp-bench.py code_python pred= 192 draft= 150 acc= 116 rate=0.773 tok/s=202.8 code_cpp pred= 192 draft= 147 acc= 117 rate=0.796 tok/s=212.8 explain_concept pred= 192 draft= 161 acc= 110 rate=0.683 tok/s=196.4 summarize pred= 192 draft= 138 acc= 122 rate=0.884 tok/s=226.6 qa_factual pred= 192 draft= 138 acc= 121 rate=0.877 
tok/s=225.1 translation pred= 192 draft= 158 acc= 112 rate=0.709 tok/s=201.5 creative_short pred= 192 draft= 160 acc= 110 rate=0.688 tok/s=197.2 stepwise_math pred= 192 draft= 150 acc= 115 rate=0.767 tok/s=209.2 long_code_review pred= 192 draft= 148 acc= 116 rate=0.784 tok/s=208.9 ``` After ``` (llama.cpp) ➜ llama.cpp git:(master) ✗ python mtp-bench.py code_python pred= 192 draft= 150 acc= 116 rate=0.773 tok/s=211.9 code_cpp pred= 192 draft= 147 acc= 117 rate=0.796 tok/s=224.6 explain_concept pred= 192 draft= 161 acc= 110 rate=0.683 tok/s=207.8 summarize pred= 192 draft= 138 acc= 122 rate=0.884 tok/s=240.2 qa_factual pred= 192 draft= 138 acc= 121 rate=0.877 tok/s=238.5 translation pred= 192 draft= 158 acc= 112 rate=0.709 tok/s=213.4 creative_short pred= 192 draft= 160 acc= 110 rate=0.688 tok/s=208.8 stepwise_math pred= 192 draft= 150 acc= 115 rate=0.767 tok/s=221.7 long_code_review pred= 192 draft= 148 acc= 116 rate=0.784 tok/s=220.7 ``` Server launched with: ``` ➜ llama.cpp git:(osimons/enroll_mul_mat_vec_q_moe_into_PDL) ✗ ./build-x64-linux-gcc-reldbg/bin/llama-server \ -m /mnt/share/gguf/unsloth/Qwen3.6-35B-A3B-MTP-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf -dio \ --spec-type draft-mtp \ --spec-draft-n-max 2 \ -ngl all \ -fa on \ --host 0.0.0.0 \ --port 8080 -np 1 --chat-template-kwargs "{\"preserve_thinking\": true}" ``` * LC to overlap with following kernels --- ggml/src/ggml-cuda/mmvq.cu | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 4b0426590ac..bdfbfd2d387 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -682,12 +682,16 @@ static __global__ void mul_mat_vec_q( template __launch_bounds__(get_mmvq_mmid_max_batch_for_device()*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q_moe( - const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, - float * __restrict__ dst, + const void * 
vx_ptr, const void * vy_ptr, const int32_t * ids_ptr, + float * dst_ptr, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst, const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint32_t ncols_dst, const uint32_t ids_stride) { + const void * GGML_CUDA_RESTRICT vx = vx_ptr; + const void * GGML_CUDA_RESTRICT vy = vy_ptr; + const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr; + float * GGML_CUDA_RESTRICT dst = dst_ptr; constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; @@ -707,6 +711,7 @@ static __global__ void mul_mat_vec_q_moe( return; } + ggml_cuda_pdl_sync(); const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride]; const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y); @@ -726,6 +731,8 @@ static __global__ void mul_mat_vec_q_moe( } } + ggml_cuda_pdl_lc(); + // Warp-level reduction only - no shared memory needed #pragma unroll for (int i = 0; i < c_rows_per_block; ++i) { @@ -794,8 +801,9 @@ static void mul_mat_vec_q_moe_launch( const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block; const dim3 block_nums(nblocks_rows, nchannels_dst); const dim3 block_dims(warp_size, ncols_dst); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); - mul_mat_vec_q_moe<<>>( + ggml_cuda_kernel_launch(mul_mat_vec_q_moe, launch_params, vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x, stride_row_x, stride_col_y, stride_col_dst, stride_channel_x, stride_channel_y, stride_channel_dst, From 3ecfb150a4bd2d92b2a7974bb1af954c8a5e2985 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Fri, 5 Jun 2026 09:11:47 +0200 Subject: [PATCH 17/71] kleidiai : dynamic chunck-based scheduling for hybrid execution (#23819) --- ggml/src/ggml-cpu/kleidiai/kleidiai.cpp | 272 ++++++++++++------------ 1 
file changed, 141 insertions(+), 131 deletions(-) diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 0ecf7ae02ac..9e54b676b93 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -38,6 +38,7 @@ #include "kleidiai.h" #include "ggml-cpu.h" +#include "ggml-cpu-impl.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-threading.h" @@ -61,7 +62,8 @@ struct ggml_kleidiai_context { ggml_kleidiai_kernels * kernels_q8; int sme_thread_cap; // <= 0 means “SME disabled/unknown”; int thread_hint; // <= 0 means “no hint” -} static ctx = { CPU_FEATURE_NONE, nullptr, nullptr, 0, -1 }; + int chunk_multiplier; +} static ctx = { CPU_FEATURE_NONE, nullptr, nullptr, 0, -1, 4 }; static const char* cpu_feature_to_string(cpu_feature f) { if (f == CPU_FEATURE_NONE) { @@ -186,8 +188,9 @@ static void init_kleidiai_context(void) { if (!initialized) { initialized = true; - const char *env_sme = getenv("GGML_KLEIDIAI_SME"); - const char *env_threads = getenv("GGML_TOTAL_THREADS"); + const char *env_sme = getenv("GGML_KLEIDIAI_SME"); + const char *env_threads = getenv("GGML_TOTAL_THREADS"); + const char *env_chunk_mult = getenv("GGML_KLEIDIAI_CHUNK_MULTIPLIER"); const bool cpu_has_sme = ggml_cpu_has_sme(); size_t detected_smcus = 0; @@ -204,6 +207,14 @@ static void init_kleidiai_context(void) { } } + if (env_chunk_mult) { + bool ok = false; + int multiplier = parse_uint_env(env_chunk_mult, "GGML_KLEIDIAI_CHUNK_MULTIPLIER", &ok); + if (ok && multiplier > 0) { + ctx.chunk_multiplier = multiplier; + } + } + // SME policy: // - If CPU doesn't support SME: SME always off. // - Else: @@ -296,6 +307,50 @@ static inline size_t align_up(size_t value, size_t alignment) { return remainder == 0 ? 
value : value + (alignment - remainder); } +static inline size_t gcd_size(size_t a, size_t b) { + while (b != 0) { + const size_t t = a % b; + a = b; + b = t; + } + return a; +} + +static inline bool lcm_size(size_t a, size_t b, size_t & result) { + if (a == 0 || b == 0) { + result = 0; + return false; + } + const size_t g = gcd_size(a, b); + const size_t q = a / g; + if (q > SIZE_MAX / b) { + return false; + } + result = q * b; + return true; +} + +static inline size_t ceil_div_size(size_t a, size_t b) { + return b == 0 ? 0 : (a + b - 1) / b; +} + +struct kleidiai_block_args { + size_t lhs_bl; + size_t rhs_bl; + size_t pack_bl; +}; + +static inline kleidiai_block_args kleidiai_get_block_args(ggml_type rhs_type) { + switch (rhs_type) { + case GGML_TYPE_Q4_0: + return { QK4_0, QK4_0, QK4_0 }; + case GGML_TYPE_Q8_0: + return { 0, 0, QK8_0 }; + default: + return { 0, 0, 0 }; + } +} + static inline bool kleidiai_pack_fallback_allowed() { if (ctx.sme_thread_cap <= 0) { return false; @@ -746,8 +801,10 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t n_step; size_t lhs_packed_size; size_t lhs_offset; - size_t n_offset; - size_t n_cols; + size_t lhs_bl; + size_t rhs_bl; + size_t pack_bl; + size_t lhs_packed_offset0; int assigned_threads; int thread_begin; int thread_end; @@ -772,6 +829,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { continue; } + const kleidiai_block_args block_args = kleidiai_get_block_args(kernels->rhs_type); + runtime[runtime_count] = { slot, kernels, @@ -784,7 +843,9 @@ class tensor_traits : public ggml::cpu::tensor_traits { kinfo->get_n_step(), 0, 0, - 0, + block_args.lhs_bl, + block_args.rhs_bl, + block_args.pack_bl, 0, 0, 0, @@ -795,45 +856,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { } if (runtime_count == 0) { - ggml_kleidiai_kernels * fallback = ggml_kleidiai_select_kernels(ctx.features, dst); - if (!fallback) { - return false; - } - kernel_info * kinfo = is_gemv ? 
&fallback->gemv : &fallback->gemm; - lhs_packing_info * linfo = is_gemv ? &fallback->gemv_lhs_info : &fallback->gemm_lhs_info; - rhs_packing_info * rinfo = &fallback->rhs_info; - if (!kinfo || !linfo || !linfo->packed_size_ex || !linfo->pack_func_ex || - !kinfo->get_rhs_packed_offset_ex || !kinfo->run_kernel_ex || !kinfo->get_dst_offset || - !rinfo || !rinfo->pack_func_ex || !rinfo->packed_size_ex) { - return false; - } - kernel_chain[0] = fallback; - runtime[0] = { - 0, - fallback, - kinfo, - linfo, - kinfo->get_mr(), - kinfo->get_nr(), - kinfo->get_kr(), - kinfo->get_sr(), - kinfo->get_n_step(), - 0, - 0, - 0, - 0, - 0, - 0, - 0, - nullptr - }; - size_t rhs_size_fallback = 0; - const uint8_t * rhs_base = weight_for_slot(0, rhs_size_fallback); - if (!rhs_base) { - rhs_base = static_cast(src0->data); - } - runtime[0].rhs_base = rhs_base; - runtime_count = 1; + GGML_LOG_WARN("kleidiai: no runtime kernel slot available for supported op %s\n", dst->name); + return false; } const int nth_total = params->nth > 0 ? params->nth : 1; @@ -846,6 +870,13 @@ class tensor_traits : public ggml::cpu::tensor_traits { break; } } + int non_sme_slot = -1; + for (int i = 0; i < runtime_count; ++i) { + if ((runtime[i].kernels->required_cpu & CPU_FEATURE_SME) != CPU_FEATURE_SME) { + non_sme_slot = i; + break; + } + } const int sme_cap_limit = ctx.sme_thread_cap; const bool use_hybrid = sme_cap_limit > 0 && @@ -864,12 +895,15 @@ class tensor_traits : public ggml::cpu::tensor_traits { if (!hybrid_enabled) { int chosen_slot = 0; if (too_small_for_hybrid && sme_slot != -1) { - chosen_slot = sme_slot; + chosen_slot = nth_total > sme_cap_limit && non_sme_slot != -1 ? 
non_sme_slot : sme_slot; } else if (runtime_count > 1 && ctx.sme_thread_cap > 0 && nth_total > ctx.sme_thread_cap) { chosen_slot = 1; } if (chosen_slot != 0 && chosen_slot < runtime_count) { runtime[0] = runtime[chosen_slot]; + runtime[0].assigned_threads = 0; + runtime[0].thread_begin = 0; + runtime[0].thread_end = 0; } runtime_count = runtime_count > 0 ? 1 : 0; @@ -896,6 +930,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { int fallback_indices[GGML_KLEIDIAI_MAX_KERNEL_SLOTS]; int fallback_count = 0; + // The current hybrid chain is bounded to SME + one non-SME fallback slot. + GGML_ASSERT(GGML_KLEIDIAI_MAX_KERNEL_SLOTS == 2); for (int i = 0; i < runtime_count; ++i) { if (i == sme_slot) { continue; @@ -952,73 +988,67 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t cursor = 0; for (int i = 0; i < runtime_count; ++i) { - const ggml_type slot_rhs_type = runtime[i].kernels->rhs_type; - const size_t slot_pack_size_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? QK8_0 : 0; - runtime[i].lhs_packed_size = runtime[i].lhs_info->packed_size_ex(m, k, slot_pack_size_arg, runtime[i].mr, runtime[i].kr, runtime[i].sr); + runtime[i].lhs_packed_size = runtime[i].lhs_info->packed_size_ex(m, k, runtime[i].pack_bl, runtime[i].mr, runtime[i].kr, runtime[i].sr); cursor = align_up(cursor, GGML_KLEIDIAI_PACK_ALIGN); runtime[i].lhs_offset = cursor; + runtime[i].lhs_packed_offset0 = runtime[i].lhs_info->get_packed_offset_ex(0, k, runtime[i].lhs_bl, runtime[i].mr, runtime[i].kr, runtime[i].sr); cursor += runtime[i].lhs_packed_size; } GGML_ASSERT(cursor <= params->wsize); uint8_t * scratch = static_cast(params->wdata); - size_t assigned_cols = 0; - uint64_t weighted_total = 0; - if (runtime_count > 1 && sme_slot != -1) { - for (int i = 0; i < runtime_count; ++i) { - const uint64_t weight = (i == sme_slot) ? 
(sme_cap << 1) : 1; - weighted_total += (uint64_t)runtime[i].assigned_threads * weight; - } - } + size_t common_step = 1; for (int i = 0; i < runtime_count; ++i) { - runtime[i].n_offset = assigned_cols; if (runtime[i].assigned_threads == 0) { - runtime[i].n_cols = 0; continue; } - const size_t remaining_cols = n - assigned_cols; - if (remaining_cols == 0) { - runtime[i].n_cols = 0; - continue; - } - const size_t step = runtime[i].n_step ? runtime[i].n_step : 1; - size_t target = 0; - if (weighted_total > 0) { - const uint64_t weight = (i == sme_slot) ? (sme_cap << 1) : 1; - target = (size_t)(((uint64_t)n * runtime[i].assigned_threads * weight) / weighted_total); - } else { - target = (size_t)(((uint64_t)n * runtime[i].assigned_threads) / nth_total); - } - target = std::min(target, remaining_cols); - size_t aligned = round_down(target, step); - if (aligned == 0 && remaining_cols >= step) { - aligned = step; + size_t next_step = 0; + if (!lcm_size(common_step, runtime[i].n_step ? runtime[i].n_step : 1, next_step)) { + return false; } - runtime[i].n_cols = aligned; - assigned_cols += aligned; + common_step = next_step; } - - if (assigned_cols < n) { - for (int i = runtime_count - 1; i >= 0; --i) { - if (runtime[i].assigned_threads > 0) { - runtime[i].n_cols += n - assigned_cols; - break; - } - } + GGML_ASSERT(common_step > 0); + + const bool disable_chunking = ggml_is_numa(); + const size_t chunk_multiplier = std::max(1, ctx.chunk_multiplier); + const size_t chunk_divisor = (nth_total == 1 || disable_chunking) ? (size_t)nth_total : (size_t)nth_total * chunk_multiplier; + size_t chunk_cols = align_up(std::max(1, ceil_div_size(n, chunk_divisor)), common_step); + if (chunk_cols == 0) { + chunk_cols = common_step; } + // If common_step is larger than n, the loop below runs one valid tail chunk + // with cols == n. 
+ const size_t nchunk_size = std::max<size_t>(1, ceil_div_size(n, chunk_cols)); + GGML_ASSERT(nchunk_size <= (size_t)INT_MAX); + const int nchunk = (int)nchunk_size; const size_t dst_stride = dst->nb[1]; + auto run_chunk = [&](runtime_slot & slot, size_t global_start, size_t cols, uint8_t * dst_batch_base) { + const size_t rhs_packed_offset = slot.kernel->get_rhs_packed_offset_ex(global_start, k, slot.rhs_bl); + const size_t dst_offset = slot.kernel->get_dst_offset(0, global_start, dst_stride); + + const uint8_t * lhs_ptr = scratch + slot.lhs_offset + slot.lhs_packed_offset0; + const uint8_t * rhs_ptr = slot.rhs_base + rhs_packed_offset; + float * dst_ptr = reinterpret_cast<float *>(dst_batch_base + dst_offset); + + slot.kernel->run_kernel_ex(m, cols, k, slot.rhs_bl, + lhs_ptr, + rhs_ptr, + dst_ptr, + dst_stride, + sizeof(float), + -FLT_MAX, + FLT_MAX); + }; + for (int64_t batch_idx = 0; batch_idx < ne12; ++batch_idx) { const uint8_t * lhs_batch_base = static_cast<const uint8_t *>(src1->data) + batch_idx * src1->nb[2]; uint8_t * dst_batch_base = static_cast<uint8_t *>(dst->data) + batch_idx * dst->nb[2]; if (runtime[local_slot].assigned_threads > 0) { runtime_slot & slot = runtime[local_slot]; - const ggml_type slot_rhs_type = slot.kernels->rhs_type; - const size_t slot_lhs_exec_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0; const int64_t m_roundup_mr = kai_roundup((int64_t)m, (int64_t)slot.mr); int64_t max_threads = slot.mr ? (m_roundup_mr / (int64_t)slot.mr) : slot.assigned_threads; max_threads = std::max<int64_t>(1, max_threads); @@ -1031,8 +1061,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { const int64_t m_start = (int64_t)local_ith * num_m_per_thread0; const int64_t m_count = (local_ith == use_threads - 1) ?
num_m_per_threadN_1 : num_m_per_thread0; - const size_t base_packed_off = slot.lhs_info->get_packed_offset_ex(m_start, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr); - const size_t next_block_off = slot.lhs_info->get_packed_offset_ex(m_start + slot.mr, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr); + const size_t base_packed_off = slot.lhs_info->get_packed_offset_ex(m_start, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr); + const size_t next_block_off = slot.lhs_info->get_packed_offset_ex(m_start + slot.mr, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr); const size_t row_stride_bytes = slot.mr ? (next_block_off - base_packed_off) / slot.mr : 0; int64_t remaining = m_count; @@ -1049,7 +1079,7 @@ class tensor_traits : public ggml::cpu::tensor_traits { const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes; void * dst_ptr = lhs_packed + dst_off; - slot.lhs_info->pack_func_ex(take, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr, 0, src_ptr, src1->nb[1], dst_ptr); + slot.lhs_info->pack_func_ex(take, k, slot.lhs_bl, slot.mr, slot.kr, slot.sr, 0, src_ptr, src1->nb[1], dst_ptr); cur += take; remaining -= take; @@ -1057,49 +1087,29 @@ class tensor_traits : public ggml::cpu::tensor_traits { } } + if (ith_total == 0) { + ggml_threadpool_chunk_set(params->threadpool, nth_total); + } + + // Publishes both LHS packing and the initialized dynamic chunk queue. ggml_barrier(params->threadpool); runtime_slot & slot = runtime[local_slot]; - if (slot.n_cols > 0 && slot.assigned_threads > 0) { - int64_t active_threads = slot.assigned_threads; - const int64_t max_threads = slot.n_step ? 
(slot.n_cols / slot.n_step) : slot.assigned_threads; - if (max_threads > 0) { - active_threads = std::min<int64_t>(active_threads, std::max<int64_t>(1, max_threads)); + int current_chunk = ith_total; + while (current_chunk < nchunk) { + const size_t global_start = (size_t)current_chunk * chunk_cols; + if (global_start >= n) { + break; } - active_threads = std::max<int64_t>(1, active_threads); - - if (local_ith < active_threads) { - const size_t step = slot.n_step ? slot.n_step : 1; - const size_t chunk0 = round_down((size_t)(slot.n_cols / active_threads), step); - const size_t chunkN = slot.n_cols - (active_threads - 1) * chunk0; - const size_t local_start = (size_t)local_ith * chunk0; - const size_t cols = (local_ith == active_threads - 1) ? chunkN : chunk0; - - if (cols > 0) { - const ggml_type slot_rhs_type = slot.kernels->rhs_type; - const size_t slot_lhs_exec_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ? 0 : 0; - const size_t slot_rhs_block_arg = slot_rhs_type == GGML_TYPE_Q4_0 ? QK4_0 : - slot_rhs_type == GGML_TYPE_Q8_0 ?
0 : 0; - const size_t global_start = slot.n_offset + local_start; - const size_t lhs_packed_offset = slot.lhs_info->get_packed_offset_ex(0, k, slot_lhs_exec_arg, slot.mr, slot.kr, slot.sr); - const size_t rhs_packed_offset = slot.kernel->get_rhs_packed_offset_ex(global_start, k, slot_rhs_block_arg); - const size_t dst_offset = slot.kernel->get_dst_offset(0, global_start, dst_stride); - - const uint8_t * lhs_ptr = scratch + slot.lhs_offset + lhs_packed_offset; - const uint8_t * rhs_ptr = slot.rhs_base + rhs_packed_offset; - float * dst_ptr = reinterpret_cast<float *>(dst_batch_base + dst_offset); - - slot.kernel->run_kernel_ex(m, cols, k, slot_rhs_block_arg, - lhs_ptr, - rhs_ptr, - dst_ptr, - dst_stride, - sizeof(float), - -FLT_MAX, - FLT_MAX); - } + + const size_t cols = std::min(chunk_cols, n - global_start); + if (cols > 0) { + // KleidiAI GEMM/GEMV kernels accept arbitrary final tail widths; + // only non-tail chunks are guaranteed to be n_step-aligned. + run_chunk(slot, global_start, cols, dst_batch_base); } + + current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); } if (batch_idx != ne12 - 1) { From 7acb4e8cd2ce21f457d1298e75fad729520d263c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Jun 2026 11:09:36 +0300 Subject: [PATCH 18/71] hparams : refactor `hparams.n_layer` (#24060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * hparams : refactor hparams.n_layer * cont : remove `n_layer_kv()`, use n_layer_all instead * cont : type consistency * pi : update SYSTEM.md * models : fix Step3.5 MTP * cont : remove duplicate switch cases * cont : explicitly set `false` to extra layers for `is_swa` and `is_recr` * cont : fix nextn layer count handling Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- .pi/gg/SYSTEM.md | 4 +- src/llama-adapter.cpp | 8 ++-- src/llama-context.cpp | 10 ++-- src/llama-graph.cpp | 2 +- src/llama-hparams.cpp | 83 ++++++++++++++++------------------
src/llama-hparams.h | 17 ++++--- src/llama-kv-cache.cpp | 8 ++-- src/llama-memory-recurrent.cpp | 8 ++-- src/llama-model-loader.cpp | 6 +-- src/llama-model-saver.cpp | 6 +-- src/llama-model.cpp | 79 ++++++++++++++++---------------- src/llama-model.h | 3 +- src/llama-quant.cpp | 4 +- src/models/afmoe.cpp | 2 +- src/models/apertus.cpp | 11 +++-- src/models/arcee.cpp | 2 +- src/models/arctic.cpp | 2 +- src/models/arwkv7.cpp | 2 +- src/models/baichuan.cpp | 2 +- src/models/bailingmoe.cpp | 2 +- src/models/bailingmoe2.cpp | 21 ++++----- src/models/bert.cpp | 4 +- src/models/bitnet.cpp | 2 +- src/models/bloom.cpp | 2 +- src/models/chameleon.cpp | 2 +- src/models/chatglm.cpp | 3 +- src/models/codeshell.cpp | 3 +- src/models/cogvlm.cpp | 3 +- src/models/cohere2.cpp | 4 +- src/models/command-r.cpp | 3 +- src/models/dbrx.cpp | 12 ++--- src/models/deci.cpp | 3 +- src/models/deepseek2.cpp | 11 ++--- src/models/deepseek2ocr.cpp | 2 +- src/models/deepseek32.cpp | 22 ++++----- src/models/dots1.cpp | 3 +- src/models/dream.cpp | 3 +- src/models/ernie4-5.cpp | 2 +- src/models/eurobert.cpp | 2 +- src/models/exaone-moe.cpp | 22 ++++----- src/models/exaone.cpp | 2 +- src/models/exaone4.cpp | 22 ++++----- src/models/falcon-h1.cpp | 2 +- src/models/falcon.cpp | 2 +- src/models/gemma-embedding.cpp | 2 +- src/models/gemma.cpp | 2 +- src/models/gemma2.cpp | 2 +- src/models/gemma3.cpp | 2 +- src/models/gemma3n.cpp | 6 +-- src/models/gemma4.cpp | 6 +-- src/models/glm-dsa.cpp | 17 +++---- src/models/glm4-moe.cpp | 26 +++++------ src/models/glm4.cpp | 20 ++++---- src/models/gpt2.cpp | 3 +- src/models/gptneox.cpp | 3 +- src/models/granite-hybrid.cpp | 2 +- src/models/granite-moe.cpp | 2 +- src/models/granite.cpp | 2 +- src/models/grok.cpp | 2 +- src/models/grovemoe.cpp | 2 +- src/models/hunyuan-moe.cpp | 2 +- src/models/internlm2.cpp | 3 +- src/models/jais.cpp | 2 +- src/models/jais2.cpp | 2 +- src/models/jamba.cpp | 4 +- src/models/jina-bert-v2.cpp | 2 +- src/models/jina-bert-v3.cpp | 2 +- 
src/models/kimi-linear.cpp | 4 +- src/models/lfm2.cpp | 10 ++-- src/models/lfm2moe.cpp | 4 +- src/models/llada-moe.cpp | 5 +- src/models/llada.cpp | 4 +- src/models/llama.cpp | 4 +- src/models/llama4.cpp | 2 +- src/models/maincoder.cpp | 3 +- src/models/mamba.cpp | 2 +- src/models/mamba2.cpp | 2 +- src/models/mellum.cpp | 4 +- src/models/mimo2.cpp | 22 ++++----- src/models/minicpm.cpp | 4 +- src/models/minicpm3.cpp | 2 +- src/models/minimax-m2.cpp | 2 +- src/models/mistral3.cpp | 2 +- src/models/modern-bert.cpp | 2 +- src/models/mpt.cpp | 2 +- src/models/nemotron-h.cpp | 4 +- src/models/nemotron.cpp | 3 +- src/models/neo-bert.cpp | 2 +- src/models/nomic-bert-moe.cpp | 2 +- src/models/nomic-bert.cpp | 2 +- src/models/olmo.cpp | 2 +- src/models/olmo2.cpp | 2 +- src/models/olmoe.cpp | 3 +- src/models/openai-moe.cpp | 2 +- src/models/openelm.cpp | 12 ++--- src/models/orion.cpp | 2 +- src/models/pangu-embed.cpp | 3 +- src/models/phi2.cpp | 2 +- src/models/phi3.cpp | 2 +- src/models/phimoe.cpp | 2 +- src/models/plamo.cpp | 2 +- src/models/plamo2.cpp | 4 +- src/models/plamo3.cpp | 2 +- src/models/plm.cpp | 3 +- src/models/qwen.cpp | 2 +- src/models/qwen2.cpp | 3 +- src/models/qwen2moe.cpp | 3 +- src/models/qwen3.cpp | 3 +- src/models/qwen35.cpp | 33 ++++++-------- src/models/qwen35moe.cpp | 33 ++++++-------- src/models/qwen3moe.cpp | 6 +-- src/models/qwen3next.cpp | 8 ++-- src/models/qwen3vl.cpp | 3 +- src/models/qwen3vlmoe.cpp | 3 +- src/models/refact.cpp | 3 +- src/models/rnd1.cpp | 5 +- src/models/rwkv6.cpp | 2 +- src/models/rwkv6qwen2.cpp | 2 +- src/models/rwkv7.cpp | 2 +- src/models/seed-oss.cpp | 3 +- src/models/smallthinker.cpp | 4 +- src/models/smollm3.cpp | 2 +- src/models/stablelm.cpp | 2 +- src/models/starcoder.cpp | 3 +- src/models/starcoder2.cpp | 3 +- src/models/step35.cpp | 36 +++++++-------- src/models/t5.cpp | 4 +- src/models/talkie.cpp | 2 +- src/models/xverse.cpp | 3 +- 129 files changed, 412 insertions(+), 431 deletions(-) diff --git a/.pi/gg/SYSTEM.md 
b/.pi/gg/SYSTEM.md index 06d97ae78ee..197173faed8 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -16,12 +16,12 @@ Pull requests (PRs): - New branch names are prefixed with "gg/" - Before opening a pull request, ask the user to confirm the description - When creating a pull request, look for the repository's PR template and follow it -- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]" +- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]" - Ask the user to tell you what model was used and write it in place of [MODEL] - Always create the pull requests in draft mode Commits: -- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag +- On every commit that you make, include a "Assisted-by: pi:llama.cpp/[MODEL]" tag - Do not explicitly set the git author in commits - rely on the default git config - Always use `--no-gpg-sign` when committing - Never `git push` without explicit confirmation from the user diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 4a1aaa955a8..3e0fe66afff 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), + /*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) { }; // make tensors - tensors.reserve(hparams.n_layer); + tensors.reserve(hparams.n_layer()); tensors.push_back(nullptr); // there's never a tensor for layer 0 - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { ggml_backend_buffer_type_t buft = model.select_buft(il); ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { @@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply( layer_start = il_start; 
layer_end = il_end; - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { assert(tensors[il] != nullptr); const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f59381a4d75..eff1d8f89f2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -341,7 +341,7 @@ llama_context::llama_context( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = model.n_devices() > 1 && - model.n_gpu_layers() > model.hparams.n_layer && + model.n_gpu_layers() > model.hparams.n_layer() && model.split_mode() == LLAMA_SPLIT_MODE_LAYER && cparams.offload_kqv && !model.has_tensor_overrides(); @@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const { // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; + const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer(); if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); @@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model( if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) { const uint32_t blck_size = ggml_blck_size(params.type_k); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { if (model->hparams.n_embd_head_k(il) % blck_size != 0) { LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n", __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il)); @@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model( if 
(params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) { const uint32_t blck_size = ggml_blck_size(params.type_v); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { if (model->hparams.n_embd_head_v(il) % blck_size != 0) { LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n", __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il)); @@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model( } if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && - model->hparams.nextn_predict_layers == 0) { + model->hparams.n_layer_nextn == 0) { LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); return nullptr; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f910528d21b..172edf24cb1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cparams (params.cparams), ubatch (params.ubatch), n_embd (hparams.n_embd), - n_layer (hparams.n_layer), + n_layer (hparams.n_layer()), n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head (hparams.n_head()), diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 087afec55c6..e1e49d1cc1f 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -7,31 +7,38 @@ void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0); } } else { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); } } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_swa_impl[il] = false; + } } -// TODO: implement -//void 
llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { -// if (dense_first) { -// for (uint32_t il = 0; il < n_layer; ++il) { -// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); -// } -// } else { -// for (uint32_t il = 0; il < n_layer; ++il) { -// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); -// } -// } -//} +void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { + if (dense_first) { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); + } + } else { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + } + } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_recr_impl[il] = false; + } +} bool llama_hparams::is_swa_any() const { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (is_swa_impl[il]) { return true; } @@ -41,7 +48,7 @@ bool llama_hparams::is_swa_any() const { } uint32_t llama_hparams::n_head(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_arr[il]; } @@ -49,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const { } uint32_t llama_hparams::n_head_kv(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_kv_arr[il]; } @@ -57,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const { } uint32_t llama_hparams::n_ff(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_ff_arr[il]; } @@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { } uint32_t llama_hparams::n_rot(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_rot_swa : n_rot_full; } @@ -98,7 +105,7 @@ uint32_t llama_hparams::n_embd_out() const { } uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? 
n_embd_head_k_swa : n_embd_head_k_full; } @@ -106,7 +113,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { } uint32_t llama_hparams::n_embd_head_v(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full; } @@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { bool llama_hparams::is_n_embd_k_gqa_variable() const { const uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_k_gqa(il)) { return true; } @@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const { bool llama_hparams::is_n_embd_v_gqa_variable() const { const uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_v_gqa(il)) { return true; } @@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const { uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_k_gqa(il)); } @@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t llama_hparams::n_embd_v_gqa_max() const { uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_v_gqa(il)); } @@ -207,11 +214,11 @@ uint32_t llama_hparams::n_embd_s() const { } bool llama_hparams::is_recr(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_recr_impl[il]; } - GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } uint32_t llama_hparams::n_pos_per_embd() const { @@ -219,11 +226,11 @@ uint32_t llama_hparams::n_pos_per_embd() const { } bool 
llama_hparams::is_swa(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa_impl[il]; } - GGML_ABORT("fatal error"); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } bool llama_hparams::is_mla() const { @@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const { } bool llama_hparams::has_kv(uint32_t il) const { - if (kv_only_nextn) { - // MTP head: only the trailing nextn_predict_layers blocks own a KV cache; - // the leading trunk blocks are not executed in this graph. - return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers); - } - if (n_layer_kv_from_start >= 0) { if (il < (uint32_t) n_layer_kv_from_start) { return true; @@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const { return true; } -uint32_t llama_hparams::n_layer_kv() const { - uint32_t res = 0; - - for (uint32_t il = 0; il < n_layer; ++il) { - if (has_kv(il)) { - res++; - } - } - - return res; +uint32_t llama_hparams::n_layer() const { + return n_layer_all - n_layer_nextn; } bool llama_hparams::use_mrope() const { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index e8ed4dd74de..fde6183e878 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -48,12 +48,15 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_layer; - int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + uint32_t n_layer_all; + uint32_t n_layer_nextn = 0; uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; + // TODO: this needs to be reworked + int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + // different head size for full_attention and SWA layers uint32_t n_embd_head_k_full; // dimension of keys (d_k). 
d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head @@ -96,9 +99,6 @@ struct llama_hparams { uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t moe_every_n_layers = 0; uint32_t moe_latent_size = 0; - uint32_t nextn_predict_layers = 0; - - bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches) float f_norm_eps; float f_norm_rms_eps; @@ -272,8 +272,7 @@ struct llama_hparams { bool is_swa(uint32_t il) const; - // TODO: implement - //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); + void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); // whether or not the given layer is recurrent (for hybrid models) bool is_recr(uint32_t il) const; @@ -329,8 +328,8 @@ struct llama_hparams { bool has_kv(uint32_t il) const; - // number of layers for which has_kv() returns true - uint32_t n_layer_kv() const; + // number of effective layers (excludes nextn layers) + uint32_t n_layer() const; // note that this function uses different SWA parameters from those in the hparams // note: inlined on purpose for performance reasons diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 82da38e0b61..60ae42e3786 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache( GGML_ASSERT(kv_size % n_pad == 0); - const uint32_t n_layer_kv = hparams.n_layer_kv(); + const uint32_t n_layer = hparams.n_layer_all; // define a comparator for the buft -> ctx map to ensure that the order is well-defined: struct ggml_backend_buft_comparator { @@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(2u*(1 + 
n_stream)*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); continue; @@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache( if (reuse) { LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer; il++) { const int32_t il_reuse = reuse(il); if (il_reuse < 0) { diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index ec5dc5835dd..6a4892fb471 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent( uint32_t n_seq_max, uint32_t n_rs_seq, const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) { - const int32_t n_layer = hparams.n_layer; + const int32_t n_layer = hparams.n_layer(); head = 0; size = mem_size; @@ -863,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std:: void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t s_trans = 0; - const uint32_t n_layer = hparams.n_layer; + const uint32_t n_layer = hparams.n_layer(); io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); @@ -1047,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell io.read(&s_trans, sizeof(s_trans)); io.read(&n_layer, sizeof(n_layer)); - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + if (n_layer != hparams.n_layer()) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, 
n_layer, hparams.n_layer()); return false; } if (cell_count > size) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4d7b11067c9..ba08a19ac76 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1050,10 +1050,10 @@ struct ggml_tensor * llama_model_loader::create_tensor( if (it == ctx_map.end()) { // one ggml context per buffer type int max_n_tensors = n_tensors; - max_n_tensors += 1; // duplicated output tensor - max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors + max_n_tensors += 1; // duplicated output tensor + max_n_tensors += hparams.n_layer()*2; // duplicated rope freq tensors if (files.empty()) { - max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses + max_n_tensors += hparams.n_layer()*256; // this should be well above what any model actually uses } const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 26fda1abfae..b0522878090 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -77,7 +77,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) { template void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) { GGML_ASSERT(model != nullptr || !per_layer); - const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size(); + const size_t n_values = per_layer ? 
size_t(model->hparams.n_layer()) : value.size(); GGML_ASSERT(n_values <= value.size()); if (n_values == 0) { @@ -206,7 +206,7 @@ void llama_model_saver::add_kv_from_model() { if (hparams.n_embd_out_impl > 0) { add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl); } - add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); + add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); @@ -227,7 +227,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); - add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers); + add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn); add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bc7a83b15f5..c98cb27e4d4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -398,7 +398,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str rotation = get_il_eff(il) % ud->n_devices; } else { il = 0; - rotation = hparams.n_layer % ud->n_devices; + rotation = hparams.n_layer() % ud->n_devices; } const ggml_tensor * tensor_axis_0 = suffix.empty() ? 
tensor : ud->model->get_tensor((prefix + suffix).c_str()); if (tensor_axis_0 == nullptr) { @@ -1034,7 +1034,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); @@ -1089,13 +1089,13 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false); bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -1194,7 +1194,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { const auto & use_mlock = params.use_mlock; const auto & tensor_split = params.tensor_split; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer_all; const int n_gpu_layers = 
this->n_gpu_layers(); const bool use_mmap_buffer = true; @@ -1251,10 +1251,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { splits[i] /= split_sum; } - const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); + const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); + const bool is_swa = il < n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -1557,7 +1557,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + const int n_gpu = std::min(n_gpu_layers, n_layer); int n_repeating = n_gpu; if (n_repeating > 0) { @@ -1566,8 +1566,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_backend_supported_layers = n_layer + 1; + const int max_offloadable_layers = n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); } @@ -1636,7 +1636,7 @@ const float * llama_model::tensor_split() const { } uint32_t llama_model::n_gpu_layers() const { - return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1; + return params.n_gpu_layers >= 0 ? 
params.n_gpu_layers : hparams.n_layer() + 1; } llama_split_mode llama_model::split_mode() const { @@ -1707,17 +1707,17 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); + LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); - LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa 
= %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -1725,7 +1725,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); - LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); @@ -1852,7 +1852,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + LLAMA_LOG_INFO("%s: n_layer_nextn = %d\n", __func__, hparams.n_layer_nextn); } if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { @@ -2034,22 +2034,21 @@ llama_memory_i * llama_model::create_memory(const 
llama_memory_params & params, llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; if (arch == LLM_ARCH_FALCON_H1) { - filter_attn = [&](int32_t) { return true; }; - filter_recr = [&](int32_t) { return true; }; + filter_attn = [&](uint32_t) { return true; }; + filter_recr = [&](uint32_t) { return true; }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { - filter_attn = [&](int32_t il) { + filter_attn = [&](uint32_t il) { return !hparams.is_recr(il) && hparams.n_ff(il) == 0; }; - filter_recr = [&](int32_t il) { + filter_recr = [&](uint32_t il) { return hparams.is_recr(il) && hparams.n_ff(il) == 0; }; } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter_attn = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && !hparams.is_recr(il); + filter_attn = [&](uint32_t il) { + return il < hparams.n_layer() && !hparams.is_recr(il); }; - filter_recr = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && hparams.is_recr(il); + filter_recr = [&](uint32_t il) { + return il < hparams.n_layer() && hparams.is_recr(il); }; } @@ -2098,9 +2097,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_kv_cache::layer_filter_cb filter = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { - reuse = [&](int32_t il) { - if (il >= (int32_t) hparams.n_layer_kv_from_start) { - return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); + reuse = [&](uint32_t il) { + GGML_ASSERT(hparams.n_layer_kv_from_start >= 2); + + if (il >= (uint32_t)hparams.n_layer_kv_from_start) { + return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 
2 : 1); } return -1; @@ -2108,16 +2109,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } if (mtp_on_hybrid_qwen35) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; } - if (arch == LLM_ARCH_STEP35 && hparams.nextn_predict_layers > 0) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) { if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) { - filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; } else { - filter = [n_main](int32_t il) { return (uint32_t)il < n_main; }; + filter = [&](uint32_t il) { return il < hparams.n_layer(); }; } } @@ -2242,7 +2241,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) { } int32_t llama_model_n_layer(const llama_model * model) { - return model->hparams.n_layer; + return model->hparams.n_layer(); } int32_t llama_model_n_head(const llama_model * model) { diff --git a/src/llama-model.h b/src/llama-model.h index a561374ed95..884cfdf5c3a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -700,7 +700,8 @@ const char * llm_type_name(llm_type type); // convenience macro for loading local variables for load_tensors() in llama_model_base // note: cast to int64_t since we will use these for the tensor dimensions #define LLAMA_LOAD_LOCALS \ - const int n_layer = hparams.n_layer; GGML_UNUSED(n_layer); \ + const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \ + const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \ const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \ const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \ const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \ diff --git a/src/llama-quant.cpp 
b/src/llama-quant.cpp index 43e05c3d56f..cf92ce4bb8b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vectorhparams.n_embd = desc->n_embd; model->hparams.n_embd_head_k_full = desc->n_embd_head_k; model->hparams.n_embd_head_v_full = desc->n_embd_head_v; - model->hparams.n_layer = desc->n_layer; + model->hparams.n_layer_all = desc->n_layer; model->hparams.n_expert = desc->n_expert; for (uint32_t i = 0; i < desc->n_layer; i++) { diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index a7c77ee5d28..063b214256e 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 56: type = LLM_TYPE_6B; break; case 32: type = LLM_TYPE_26B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index bec7136521c..6dfb8905fbe 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -2,12 +2,13 @@ void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer); - switch (hparams.n_layer) { + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer()); + + 
switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index d086c4717ff..9536e7c5d42 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); // Arcee uses the same structure as Llama - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index 27deadffeb7..09ee0f752f0 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 128) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 35: type = LLM_TYPE_10B_128x3_66B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 9bd04127b25..b38b2064785 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4d26081cd5d..585f3614174 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -2,7 +2,7 @@ void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; 
break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index fe1ae10864b..7faf73c835b 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_16B; break; case 88: type = LLM_TYPE_290B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index 2f0d44a6259..5000e9c6db8 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 20: type = LLM_TYPE_16B_A1B; break; - case 21: type = LLM_TYPE_16B_A1B; break; case 32: type = LLM_TYPE_100B_A6B; break; - case 33: type = LLM_TYPE_100B_A6B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -39,9 +35,9 @@ void 
llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2"); GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2"); - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); @@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/bert.cpp b/src/models/bert.cpp index 3c28f419ccf..53ce29f23ca 100644 --- 
a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -1,9 +1,9 @@ #include "models.h" void llama_model_bert::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 3: type = LLM_TYPE_17M; break; // bge-micro case 6: diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp index 7e8125deec4..c8330274580 100644 --- a/src/models/bitnet.cpp +++ b/src/models/bitnet.cpp @@ -3,7 +3,7 @@ void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index 30b0f3d07d0..609d2ddf998 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -3,7 +3,7 @@ void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 30: switch (hparams.n_embd) { diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 4bceaefd63b..4f45acecf84 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) { hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_34B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index 6766fa71c15..7ae5b938fde 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -2,7 +2,8 @@ void 
llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: { if (hparams.n_head(0) == 16) { type = LLM_TYPE_1_5B; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index 274dd3342a7..de53bb98184 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -2,7 +2,8 @@ void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 42: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 2e231bb3f93..750f57a394e 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -2,7 +2,8 @@ void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index a514cf88fc6..61a5945a194 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { uint32_t swa_period = 4; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; @@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch 
(hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index adf7fcaa20f..94a46188bb8 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -3,7 +3,8 @@ void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index af71c775365..4f5ac4d06a4 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -1,14 +1,14 @@ #include "models.h" void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) { -ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); -switch (hparams.n_layer) { - case 40: type = LLM_TYPE_16x12B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 40: type = LLM_TYPE_16x12B; break; + default: type = LLM_TYPE_UNKNOWN; + } } - } void llama_model_dbrx::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 567e3535276..cdfcf29e02f 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -2,7 +2,8 @@ void llama_model_deci::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; case 162: type = LLM_TYPE_405B; break; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 1fe54adc13e..a9e8bc51403 
100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -5,7 +5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false); // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); + const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); @@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { // for compatibility with existing DeepSeek V2 and V2.5 GGUFs // that have no expert_gating_func model parameter set - if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) { + if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) { // GLM 4.7 Lite hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } else { @@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { hparams.f_attn_temp_offset = 0.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_16B; break; case 47: type = LLM_TYPE_30B_A3B; break; case 60: type = LLM_TYPE_236B; break; @@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); - int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < effective_n_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const 
llama_model & model, const llm_graph_p Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } } - if (il == effective_n_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp index f9e4c98785c..65d31c31b93 100644 --- a/src/models/deepseek2ocr.cpp +++ b/src/models/deepseek2ocr.cpp @@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/deepseek32.cpp b/src/models/deepseek32.cpp index c92ab60d166..9a20e2ce907 100644 --- a/src/models/deepseek32.cpp +++ b/src/models/deepseek32.cpp @@ -31,7 +31,7 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); // Expert gating function - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] @@ -40,13 +40,10 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - 
hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_685B_A37B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -82,9 +79,9 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; @@ -142,7 +139,7 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -205,8 +202,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); - int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < effective_n_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -427,7 +423,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il); } } - if (il == effective_n_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 435d27281c6..07d6ab1b7cd 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_142B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 12ac6f1ce88..abe737c335a 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -2,8 +2,9 @@ void llama_model_dream::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // Dream models are primarily 7B with 28 layers - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_7B; break; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index 9b39c605e35..895cf690bd2 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_0_3B; break; case 28: type = LLM_TYPE_21B_A3B; break; case 54: type = LLM_TYPE_300B_A47B; break; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp index ddf13c3028f..0948d7de656 100644 --- a/src/models/eurobert.cpp +++ b/src/models/eurobert.cpp @@ -3,7 +3,7 @@ void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer 
== 12) { + if (hparams.n_layer() == 12) { type = LLM_TYPE_SMALL; // 0.2B } } diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 76d91982fc5..bccf169f8c0 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_30B_A3B; break; - case 48: - case 49: type = LLM_TYPE_235B_A22B; break; + case 48: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end - if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers)) { + if (i < 
(int) hparams.n_layer_dense_lead || (i >= n_layer)) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); @@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags); @@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers @@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index c7e9960d718..676fb37b5a6 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -3,7 +3,7 @@ void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index b5030eb0545..863268abcef 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -1,7 +1,7 @@ #include "models.h" void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { - if (hparams.n_layer == 64) { // 32B + if (hparams.n_layer() == 64) { // 32B hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; uint32_t swa_period = 4; @@ -15,11 +15,11 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - switch (hparams.n_layer) { + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); + + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_1_2B; break; case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; @@ -40,8 +40,8 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { - const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers; + for (int i = 0; i < n_layer_all; ++i) { + const bool is_nextn = i >= n_layer; int flags = 0; if (is_nextn) { // NextN/MTP layers are preserved in GGUF but are not executed yet. 
@@ -109,11 +109,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra } ggml_tensor * inp_out_ids = build_inp_out_ids(); - // MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe). - const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers); - GGML_ASSERT(n_layer_main > 0); - - for (int il = 0; il < n_layer_main; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers or non-SWA models @@ -149,7 +145,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } - if (il == n_layer_main - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index c130ccdd49e..d6ef2d51986 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -13,7 +13,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) { std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_0_5B; break; case 24: diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index ad546ef2db5..b2ad90b3272 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -3,7 +3,7 @@ void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 60: type = LLM_TYPE_40B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp index 4e07f5f2bda..80ed3b1a460 100644 --- a/src/models/gemma-embedding.cpp +++ 
b/src/models/gemma-embedding.cpp @@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) { GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd"); GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_0_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 1519682fdf6..651cd7e64de 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -3,7 +3,7 @@ void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_2B; break; case 28: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index ae3f9ffb530..2fbfb15a94a 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_2B; break; case 42: type = LLM_TYPE_9B; break; case 46: type = LLM_TYPE_27B; break; diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index 63a2b380e71..690194529e3 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_270M; break; case 26: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_8B; break; // Rnj-1 diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 6ec3a006081..83eb8250aa9 100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(swa_period); - hparams.n_layer_kv_from_start = 20; - hparams.f_attention_scale = 1.0f; + hparams.n_layer_kv_from_start = 20; + hparams.f_attention_scale = 1.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_E2B; break; case 35: type = LLM_TYPE_E4B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 31906de33d9..7198e541116 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -2,12 +2,12 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); uint32_t n_kv_shared_layers = 0; ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); - hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers; + hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers; hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling) ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); @@ -19,7 +19,7 @@ void 
llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_26B_A4B; break; case 35: type = LLM_TYPE_E2B; break; case 42: type = LLM_TYPE_E4B; break; diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp index af2b55ef563..11d91312def 100644 --- a/src/models/glm-dsa.cpp +++ b/src/models/glm-dsa.cpp @@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 79: type = LLM_TYPE_744B_A40B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; @@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { 
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); } - // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 27654b8cba3..3105c56b530 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { - case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer) - case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open - case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer) + switch (hparams.n_layer()) { + case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air + case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open + case 92: type = LLM_TYPE_355B_A32B; break; // 
GLM-4.5 default: type = LLM_TYPE_UNKNOWN; } } @@ -54,9 +51,9 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { // Load ALL tensors including NextN layer to satisfy total tensor count // but only PROCESS up to last layer (skipping final NextN layer) in forward pass - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, 
inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 7c242fed298..b4326c5f210 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); // NextN/MTP parameters (GLM-OCR) - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 17: type = LLM_TYPE_1B; break; // GLM-OCR case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; @@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags); // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { 
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index e2dcc8b1521..45afbccc121 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -2,7 +2,8 @@ void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_SMALL; break; case 24: type = LLM_TYPE_MEDIUM; break; case 36: type = LLM_TYPE_LARGE; break; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 443e35addf2..ed5e8c50da2 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -3,7 +3,8 @@ void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, 
hparams.use_par_res); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 6: switch (hparams.n_ff()) { case 512: type = LLM_TYPE_14M; break; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 8740d9fc7d9..eb23095aece 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -19,7 +19,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) { hparams.rope_finetuned = rope_finetuned; // A layer is recurrent IFF the n_head_kv value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } diff --git a/src/models/granite-moe.cpp b/src/models/granite-moe.cpp index 0d89bc1f340..115263c418f 100644 --- a/src/models/granite-moe.cpp +++ b/src/models/granite-moe.cpp @@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/granite.cpp b/src/models/granite.cpp index cda4aa231fa..7aff942da01 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -12,7 +12,7 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 7c46ec1c0f2..42f38af6724 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -26,7 +26,7 @@ void 
llama_model_grok::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_314B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index 1cab75adc7f..643a448e59a 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index deb3c9671f3..4d55f5e7f31 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_A13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f9ee37a24b6..f6cfdfb9458 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -2,7 +2,8 @@ void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index 
2ba162605f1..415103ce23a 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_3B; break; case 40: type = LLM_TYPE_13B; break; /* TODO: add variants */ diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index 8966131441c..8610fcc9f82 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -3,7 +3,7 @@ void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; case 68: type = LLM_TYPE_70B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index a62b121b3ee..dba160b014f 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { // TODO: Jamba layers are a bit heterogeneous, so naming this is hard. 
case 12: // 900M 8x???M case 32: // 51B 16x?B diff --git a/src/models/jina-bert-v2.cpp b/src/models/jina-bert-v2.cpp index 4f8866ece4d..86ff1c84d1a 100644 --- a/src/models/jina-bert-v2.cpp +++ b/src/models/jina-bert-v2.cpp @@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); hparams.f_max_alibi_bias = 8.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jina-bert-v3.cpp b/src/models/jina-bert-v3.cpp index e0527529f56..1c974a6f16c 100644 --- a/src/models/jina-bert-v3.cpp +++ b/src/models/jina-bert-v3.cpp @@ -3,7 +3,7 @@ void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_558M; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index c13f71b5bcb..367f6990d1f 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -14,7 +14,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } @@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - switch (hparams.n_layer) { + switch 
(hparams.n_layer()) { case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 3898b56bb12..97da8a6abb8 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -5,10 +5,13 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - hparams.n_layer_dense_lead = hparams.n_layer; + + hparams.n_layer_dense_lead = hparams.n_layer(); + switch (hparams.n_ff()) { case 4608: type = LLM_TYPE_350M; break; case 6912: type = LLM_TYPE_700M; break; @@ -16,9 +19,10 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { case 10752: type = LLM_TYPE_2_6B; break; default: type = LLM_TYPE_UNKNOWN; } + if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_swa_impl[il] = !hparams.is_recr_impl[il]; } } diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp index 81ced2eaba2..490f5c223eb 100644 --- a/src/models/lfm2moe.cpp +++ b/src/models/lfm2moe.cpp @@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = 
LLM_TYPE_8B_A1B; break; case 40: type = LLM_TYPE_24B_A2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index 9722dde9f17..2ae89386447 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -2,11 +2,12 @@ void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention hparams.causal_attn = false; - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/llada.cpp b/src/models/llada.cpp index 58b2c466e17..87d4259f9a7 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -2,14 +2,16 @@ void llama_model_llada::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/llama.cpp b/src/models/llama.cpp index cef66d054b0..c0ec7e0a9ad 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 8) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8x7B; break; case 56: type = LLM_TYPE_8x22B; break; default: type = LLM_TYPE_UNKNOWN; } } else { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B case 22: type = LLM_TYPE_1B; break; case 26: type = 
LLM_TYPE_3B; break; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 8f39b3f59a5..7194c72a585 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -8,7 +8,7 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) { const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); if (found_swa && hparams.n_swa == 0) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope + hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope } else { hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED; hparams.n_swa = 8192; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 84cfe399027..ae56a26a1f6 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -2,7 +2,8 @@ void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index 887a1fa509a..0d94e98281c 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp index 3277ca53ec4..c5951cf0f7f 100644 --- a/src/models/mamba2.cpp +++ b/src/models/mamba2.cpp @@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git 
a/src/models/mellum.cpp b/src/models/mellum.cpp index 1e1e97e9fa0..28823018bc0 100644 --- a/src/models/mellum.cpp +++ b/src/models/mellum.cpp @@ -13,7 +13,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) { if (res) { hparams.set_swa_pattern(swa_period); } else { - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); } hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; @@ -24,7 +24,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_12B_A2_5B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index 1bcdf696f2e..88989160570 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -9,18 +9,17 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); float value_scale = 0.0f; if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) { hparams.f_attn_value_scale = value_scale; } - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < 
n_layer_impl"); - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_310B_A15B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -35,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const uint32_t n_nextn = hparams.nextn_predict_layers; - - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { auto & layer = layers[i]; uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); uint32_t n_head = hparams.n_head(i); // NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support - const bool is_nextn = (n_nextn > 0) && (static_cast(i) >= n_layer - n_nextn); + const bool is_nextn = i >= n_layer; const int skip = is_nextn ? TENSOR_SKIP : 0; create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip); @@ -93,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param const float v_scale = hparams.f_attn_value_scale; - // The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; uint32_t n_head_l = hparams.n_head(il); @@ -174,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param } } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp index 966d3af615c..fc3e5b171d5 100644 --- a/src/models/minicpm.cpp +++ 
b/src/models/minicpm.cpp @@ -3,7 +3,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // Backward-compatible defaults for older MiniCPM GGUFs hparams.f_embedding_scale = 12.0f; - hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer)); + hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer())); hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f; ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // MiniCPM uses rope by default, unlike Granite which uses it as a switch hparams.rope_finetuned = true; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_1B; break; case 40: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index 1ffc54fa7c6..e011b1ff0a8 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 22e291d73a3..b25435e4d97 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_230B_A10B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 1ac5a95ccdc..9a8e3f9a50b 100644 --- 
a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) { } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index 5ab51867cc0..f3e9407e012 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -22,7 +22,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) { hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU); } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_47M; break; // granite-embedding-small case 22: diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index 0229d20ed36..d094fd9f80b 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_30B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index d2c811d2497..a456269347b 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -9,7 +9,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { // A layer is recurrent IFF the n_head_kv value is set to 0 and // the n_ff value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); } @@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { 
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B case 56: type = LLM_TYPE_9B; break; case 88: type = LLM_TYPE_120B_A12B; break; diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index 5d4a3b5c69e..6e2bd9a33ca 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -2,7 +2,8 @@ void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp index f00d6eddfc9..4a08d7abd40 100644 --- a/src/models/neo-bert.cpp +++ b/src/models/neo-bert.cpp @@ -3,7 +3,7 @@ void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 28) { + if (hparams.n_layer() == 28) { type = LLM_TYPE_250M; } } diff --git a/src/models/nomic-bert-moe.cpp b/src/models/nomic-bert-moe.cpp index a17abe2c269..da4b62919bb 100644 --- a/src/models/nomic-bert-moe.cpp +++ b/src/models/nomic-bert-moe.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/nomic-bert.cpp b/src/models/nomic-bert.cpp index 5a8a5584457..e7fc72286a6 100644 --- 
a/src/models/nomic-bert.cpp +++ b/src/models/nomic-bert.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index cfcf17bcb03..9f7a2ba60ef 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 22: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 7cc262f5504..cb52cdef720 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 7976ae44a51..1e2baeb207f 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -2,7 +2,8 @@ void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/openai-moe.cpp 
b/src/models/openai-moe.cpp index 15b6c8c1205..3ab15d61f08 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_20B; break; case 36: type = LLM_TYPE_120B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index 9f76350fd4d..13120bd3236 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -3,12 +3,12 @@ void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { - case 16: type = LLM_TYPE_270M; break; - case 20: type = LLM_TYPE_450M; break; - case 28: type = LLM_TYPE_1B; break; - case 36: type = LLM_TYPE_3B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 16: type = LLM_TYPE_270M; break; + case 20: type = LLM_TYPE_450M; break; + case 28: type = LLM_TYPE_1B; break; + case 36: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; } } diff --git a/src/models/orion.cpp b/src/models/orion.cpp index bcb4bbba4b1..863a2822269 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -3,7 +3,7 @@ void llama_model_orion::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_14B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 7593f879b24..90f05c088c1 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -2,7 +2,8 @@ void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) { 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1 case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1 default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index 8f3ed5f7b7d..81b1ad12cc0 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -3,7 +3,7 @@ void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index f8a4a4d5aa5..716ff814cc1 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -3,7 +3,7 @@ void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/phimoe.cpp b/src/models/phimoe.cpp index 4575d6139cf..c332553bc7d 100644 --- a/src/models/phimoe.cpp +++ b/src/models/phimoe.cpp @@ -3,7 +3,7 @@ void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_16x3_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index c7ed1211c31..246144519e4 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -3,7 +3,7 @@ void llama_model_plamo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 2ffa0898f71..b93cf48bc5c 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -11,11 +11,11 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: if (hparams.n_embd == 2048) { diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 29f3e803d68..16d0b1dcef7 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -13,7 +13,7 @@ void llama_model_plamo3::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plm.cpp b/src/models/plm.cpp index ce050919e6a..8ca325f5e2c 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -3,7 +3,8 @@ void llama_model_plm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index 00467dbad7d..1f5dff3843c 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -3,7 +3,7 @@ void llama_model_qwen::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index a5147460bae..e9c2ea80a6b 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -2,7 +2,8 @@ void llama_model_qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break; case 32: type = LLM_TYPE_7B; break; diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7cb03859deb..e831ed11aad 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_A2_7B; break; case 28: type = LLM_TYPE_57B_A14B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 41b97fed956..1d0d2fab362 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -2,7 +2,8 @@ void llama_model_qwen3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 348650b3796..4b642cff467 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -13,22 +13,20 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. - if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break; case 32: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_9B; break; case 64: type = LLM_TYPE_27B; break; @@ -39,9 +37,7 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -122,10 +118,10 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -159,8 +155,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. 
- const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -177,7 +172,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -490,15 +485,15 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35 MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35 MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); // hparams.n_layer includes both main model layers and MTP layers. The MTP // layer is stored immediately after the main layers in model.layers[]. 
- const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 7d906191cbb..eb5e9a406a1 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -16,22 +16,20 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. 
- if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B_A3B; break; case 48: type = LLM_TYPE_122B_A10B; break; case 60: type = LLM_TYPE_397B_A17B; break; @@ -42,9 +40,7 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -145,10 +141,10 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -182,8 +178,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. - const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -200,7 +195,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -555,13 +550,13 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE 
MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index a4f8e1379c9..317e668bec7 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -1,10 +1,10 @@ #include "models.h" void llama_model_qwen3moe::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 9e09ae6f232..97200a44072 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -14,15 +14,15 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // Mark recurrent layers (linear attention layers) - if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - 
hparams.is_recr_impl[i] = ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_80B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 5defd893944..724d6140d19 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -4,7 +4,8 @@ void llama_model_qwen3vl::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false); ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; case 64: type = LLM_TYPE_32B; break; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index 5b77df57122..7c41592f772 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen3vlmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index bf3949a9092..a46c358fa68 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -2,7 +2,8 @@ void llama_model_refact::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index ca8e009615e..fc276ce591b 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -2,12 +2,13 @@ void llama_model_rnd1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index ba2a9dfa0db..0b5013dc758 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 566b8cdcb54..6c7db514435 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index 7574b252621..67c51f5b59c 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -10,7 +10,7 @@ void 
llama_model_rwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 806cba574be..57de881a091 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -2,7 +2,8 @@ void llama_model_seed_oss::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_36B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 4231cccc666..a8e3d957f1f 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -15,14 +15,14 @@ void llama_model_smallthinker::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; + hparams.n_no_rope_layer_step = hparams.n_layer(); } ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; case 52: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 90e7d473eaf..c67d967b204 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -4,7 +4,7 @@ void llama_model_smollm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); hparams.n_no_rope_layer_step = 4; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 4da7f7aefcf..bf6087b8796 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -3,7 +3,7 @@ void llama_model_stablelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_12B; break; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index e131af058bc..f73a88fd4e9 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 36: type = LLM_TYPE_3B; break; case 42: type = LLM_TYPE_7B; break; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 9c207c02885..b81b469374a 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_3B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_15B; break; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index cf9942b200f..e2218c58704 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -23,16 +23,16 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, 
hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false); // NextN/MTP (Step3p5): extra decoder block appended beyond the main stack. - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 45: type = LLM_TYPE_196B_A11B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -41,15 +41,12 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP // tensors live in a separate file (e.g. user split target/draft). Mark // MTP tensors NOT_REQUIRED so the trunk loads cleanly. - const std::string mtp_probe = "blk." 
+ std::to_string(n_main) + ".nextn.eh_proj.weight"; - const bool trunk_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight(mtp_probe.c_str()) == nullptr); + const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight"; + const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; const int mtp_flags = trunk_only ? TENSOR_NOT_REQUIRED : 0; @@ -176,7 +173,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } // Only the first MTP block (i == n_main) is required at runtime — the @@ -184,8 +181,8 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with // all MTP layers still works) but tolerated when absent via the pruning // path. See scripts/prune_step35_extra_mtp.py for the pruner. - for (int i = (int) n_main; i < n_layer; ++i) { - load_block_mtp(i, /*is_first_mtp=*/ i == (int) n_main); + for (int i = n_layer; i < n_layer_all; ++i) { + load_block_mtp(i, /*is_first_mtp=*/ i == n_layer); } } @@ -206,8 +203,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. 
- const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; const uint32_t n_head_l = hparams.n_head(il); @@ -294,7 +290,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "attn_proj", il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -374,7 +370,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para // LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE) llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0"); // Single-block MTP only: always run the first trained MTP block (Qwen // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to @@ -382,7 +378,7 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just // block 0) also work — see load_arch_tensors below and // scripts/prune_step35_extra_mtp.py. 
- const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 73e32741406..b0e3f062572 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -9,10 +9,10 @@ void llama_model_t5::load_arch_hparams(llama_model_loader & ml) { hparams.dec_start_token_id = dec_start_token_id; } - hparams.dec_n_layer = hparams.n_layer; + hparams.dec_n_layer = hparams.n_layer(); ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 6: type = LLM_TYPE_60M; break; // t5-small case 8: type = LLM_TYPE_80M; break; // flan-t5-small case 12: diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp index 1258eeb19b6..393e8f65bf4 100644 --- a/src/models/talkie.cpp +++ b/src/models/talkie.cpp @@ -4,7 +4,7 @@ void llama_model_talkie::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index d6d1c7a2e5d..3135001293a 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -2,7 +2,8 @@ void llama_model_xverse::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; case 80: type = LLM_TYPE_65B; break; From 59917d3922e976ae4d7a86eb976bd4c330fb5391 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Jun 2026 11:17:54 +0300 Subject: [PATCH 19/71] minor : fix lint issues (#24165) --- 
src/models/exaone-moe.cpp | 2 +- src/models/glm4-moe.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index bccf169f8c0..5aed9379400 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -25,7 +25,7 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) { switch (hparams.n_layer()) { case 32: type = LLM_TYPE_30B_A3B; break; - case 48: type = LLM_TYPE_235B_A22B; break; + case 48: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 3105c56b530..d60e47ddf0c 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -24,9 +24,9 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) { GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); switch (hparams.n_layer()) { - case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air - case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open - case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 + case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air + case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open + case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 default: type = LLM_TYPE_UNKNOWN; } } From ad1b88ca0d37a2171efba1c04f1a3531c78f1b52 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Fri, 5 Jun 2026 12:21:26 +0200 Subject: [PATCH 20/71] docs: Update quantization readme (#24133) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update quantization readme * install requirements * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * dos2unix suggestions --------- Co-authored-by: Sigbjørn Skjæret --- tools/quantize/README.md | 89 +++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/tools/quantize/README.md 
b/tools/quantize/README.md index b8c225124b3..27384bebf69 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -5,62 +5,87 @@ Quantization reduces the precision of model weights (e.g., from 32-bit floats to This process however, may introduce some accuracy loss which is usually measured in [Perplexity](https://huggingface.co/docs/transformers/en/perplexity) (ppl) and/or [Kullback–Leibler Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) (kld). This can be minimized by using a suitable imatrix file. -You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup. +You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup. It syncs from llama.cpp `main` every 6 hours. -Note: It is synced from llama.cpp `main` every 6 hours. +## Overview -Example usage: +Quantization is done in two phases: +- Convert the original model to GGUF format. +- Quantize the converted GGUF file. -```./llama-quantize [options] input-model-f32.gguf [output-model-quant.gguf] type [threads]``` +If the model supports multimodal inputs (images or audio), you also need to convert and quantize the multimodal encoders and projectors. 
+ +To perform these tasks, you need to install the Python requirements: ```bash -# from Hugginface, obtain the official meta-llama/Llama-3.1-8B model weights and place them in ./models -ls ./models -config.json model-00001-of-00004.safetensors model-00004-of-00004.safetensors README.md tokenizer.json -generation_config.json model-00002-of-00004.safetensors model.safetensors.index.json special_tokens_map.json USE_POLICY.md -LICENSE model-00003-of-00004.safetensors original tokenizer_config.json +python3 -m pip install -r requirements.txt +``` -# [Optional] for PyTorch .bin models like Mistral-7B -ls ./models - +Or if you use `uv`: -# install Python dependencies -python3 -m pip install -r requirements.txt +```bash +uv pip install -r requirements.txt --index-strategy unsafe-best-match +``` -# convert the model to ggml FP16 format -python3 convert_hf_to_gguf.py ./models/mymodel/ +## Prepare the input GGUF file -# quantize the model to 4-bits (using Q4_K_M method) -./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M +To convert a model from a Hugging Face repo, you can use a command like the following: -# update the gguf filetype to current version if older version is now unsupported -./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY ``` +python convert_hf_to_gguf.py --outfile gemma-4-E2B-it-bf16.gguf --outtype bf16 --remote google/gemma-4-E2B-it +``` + +Notes: +- In the usual case where the model is distributed in 16-bit format, `--outtype auto` (or omitting `--outtype` entirely) also works well. +- If you have previously downloaded the model locally, specify the directory and remove the `--remote` flag. +- For compatibility reasons, the Python requirements install transformers 4, but more and more models (like Gemma 4) require transformers 5. You can safely `pip install -U transformers` to get the latest version. 
+ +## Quantize the GGUF -Run the quantized model: +After you have created a high-quality GGUF version of the model, you use `llama-quantize` to apply quantization. For example, quantize to `Q4_K_M` using a command like the following: ```bash -# start inference on a gguf model -./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant" +./build/bin/llama-quantize gemma-4-E2B-it-bf16.gguf gemma-4-E2B-it-Q4_K_M.gguf Q4_K_M ``` +Various quantization methods are described [later in this document](#quantize). + Options: -* `--allow-requantize` allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit -* `--leave-output-tensor` will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing -* `--pure` disables k-quant mixtures and quantizes all tensors to the same type -* `--imatrix` uses data in file generated by `llama-imatrix` as importance matrix for quant optimizations (highly recommended) -* `--include-weights` use an importance matrix for tensor(s) in the list. Cannot be used with `--exclude-weights` -* `--exclude-weights` use an importance matrix for tensor(s) in the list. Cannot be used with `--include-weights` +* `--allow-requantize` allow requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit +* `--leave-output-tensor` leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing +* `--pure` disable k-quant mixtures and quantizes all tensors to the same type +* `--imatrix file_name` use data in file_name as importance matrix for quant optimizations +* `--include-weights tensor_name` use importance matrix for this tensor (can be specified multiple times) +* `--exclude-weights tensor_name` use importance matrix for the tensors **not** specified (include/exclude cannot be mixed) * `--output-tensor-type` use a specific quant type for the output.weight tensor * `--token-embedding-type` use a specific quant type for the token embeddings tensor -* `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file +* `--keep-split` generate the quantized model in the same shards as the input file instead of a single quantized file Advanced options: * `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times. * `--prune-layers` prune (remove) the layers in the list -* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times +* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times. + +## (Optional) Convert the multimodal components + +llama.cpp will convert the LLM portion of the source model, which is enough for conversational applications. If the model accepts multimodal inputs and you wish to take advantage of them, you need to create a separate GGUF file. This file is generically known as `mmproj`, for "multimedia projector"; however, it may contain various components such as vision or audio encoders in addition to projections. + +Multimodal components are usually much smaller than the LLMs they come with. 
In addition, their quality has a direct impact on the quality of LLM generations, because these components are in charge of preparing the inputs for the LLM: the closer inputs are to data seen during training, the better LLM results will be. + +For these reasons, multimodal components are usually kept in a high-quality format such as bf16 or q8. The impact on speed and memory from using a smaller quant is negligible, but overall quality could be impacted. + +```bash +python convert_hf_to_gguf.py --mmproj --outfile mmproj-gemma-4-E2B-it-Q8_0.gguf --outtype q8_0 --remote google/gemma-4-E2B-it +``` + +## Run the quantized model + + +```bash +./build/bin/llama cli -m ./gemma-4-E2B-it-Q4_K_M.gguf --mmproj ./mmproj-gemma-4-E2B-it-Q8_0.gguf --image --prompt "Describe this image" +``` -Examples: +## Quantization Examples ```bash # naive Q4_K_M quantization using default settings and 8 CPU threads. Output will be "ggml-model-Q4_K_M.gguf" From cc7bef34e2e1d7e0839d6371954106f6410c1c5a Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 5 Jun 2026 14:31:03 +0200 Subject: [PATCH 21/71] ui: add ignore-scripts=true to npmrc (#24149) --- tools/ui/.npmrc | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ui/.npmrc b/tools/ui/.npmrc index b6f27f13595..32e6012709b 100644 --- a/tools/ui/.npmrc +++ b/tools/ui/.npmrc @@ -1 +1,2 @@ engine-strict=true +ignore-scripts=true From 9c955c48b0fc6c18c703ea5cba2cacb2db6332cb Mon Sep 17 00:00:00 2001 From: Mario <191101255+wariuccio@users.noreply.github.com> Date: Fri, 5 Jun 2026 13:39:32 +0100 Subject: [PATCH 22/71] Fix link to available UI settings (#24169) The current link is to a non-existent file. 
I had a look at the repo, spotted the file containing the UI configuration key and updated the link --- tools/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/README.md b/tools/server/README.md index f1eeec36aa0..3e14f5e6a20 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1870,4 +1870,4 @@ You can specify default preferences for the web UI using `--ui-config **Note:** The old flags `--webui-config` and `--webui-config-file` are deprecated but still work as aliases. -You may find available preferences in [settings-config.ts](../ui/src/lib/constants/settings-config.ts). +You may find available preferences in [settings-keys.ts](../ui/src/lib/constants/settings-keys.ts). From 2016bf2b3bca10e49e06a00586a8a2fde9f6cc32 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 5 Jun 2026 14:57:32 +0200 Subject: [PATCH 23/71] ui: run npm install when package-lock.json is newer than node_modules (#24171) --- scripts/ui-assets.cmake | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/ui-assets.cmake b/scripts/ui-assets.cmake index ae7a1cc26d3..f85c562bd0e 100644 --- a/scripts/ui-assets.cmake +++ b/scripts/ui-assets.cmake @@ -126,8 +126,22 @@ function(npm_build out_var) return() endif() - if(NOT EXISTS "${UI_SOURCE_DIR}/node_modules") - message(STATUS "UI: running npm install (first time)") + # npm writes node_modules/.package-lock.json on every successful install, + # so a package-lock.json newer than this marker means node_modules is stale + set(NPM_MARKER "${UI_SOURCE_DIR}/node_modules/.package-lock.json") + set(need_install FALSE) + if(NOT EXISTS "${NPM_MARKER}") + set(need_install TRUE) + else() + file(TIMESTAMP "${UI_SOURCE_DIR}/package-lock.json" lock_ts) + file(TIMESTAMP "${NPM_MARKER}" marker_ts) + if(lock_ts STRGREATER marker_ts) + set(need_install TRUE) + endif() + endif() + + if(need_install) + message(STATUS "UI: running npm install") execute_process( COMMAND 
${NPM_EXECUTABLE} install WORKING_DIRECTORY "${UI_SOURCE_DIR}" From 96fbe0039337a999613a983d66e2bfcc4bb554d7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Jun 2026 17:11:42 +0300 Subject: [PATCH 24/71] model : fix llama_model::n_gpu_layers() (#24188) --- src/llama-model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c98cb27e4d4..1f442d8a322 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1636,7 +1636,8 @@ const float * llama_model::tensor_split() const { } uint32_t llama_model::n_gpu_layers() const { - return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer() + 1; + // note: plus 1 for the "output" layer + return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer_all + 1; } llama_split_mode llama_model::split_mode() const { From 86591c7536ced84cea49ee5b3e24096632a33c5a Mon Sep 17 00:00:00 2001 From: therealkenc Date: Fri, 5 Jun 2026 08:29:41 -0700 Subject: [PATCH 25/71] cli: fix model params not propagated (#23893) Fixes #23847 --- tools/cli/cli.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index af40adbb4ce..e830f262de2 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -397,6 +397,8 @@ int llama_cli(int argc, char ** argv) { return 1; } + ctx_cli.defaults.sampling = params.sampling; + console::spinner::stop(); console::log("\n"); From 6effcecd0bf3cb2209999cecfa297ed4d8523b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 5 Jun 2026 17:35:13 +0200 Subject: [PATCH 26/71] TP: round up granularity to 128 (#24180) * TP: round up granularity to 128 * remove assert --- src/llama-model.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1f442d8a322..784deb70aff 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -553,10 +553,12 @@ struct 
ggml_backend_meta_split_state llama_meta_device_get_split_state(const str }; auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector> & segments) -> std::vector { + // for better performance it may make sense to round up blck_size to a higher power of 2 so that more efficient kernels can be used if (hparams.is_recr(il)) { // linear attention - const int64_t head_dim = hparams.ssm_d_state; - const int64_t granularity_qkv = std::lcm(blck_size, head_dim); + const int64_t head_dim = hparams.ssm_d_state; + const int64_t blck_size_perf = std::lcm(blck_size, 128); + const int64_t granularity_qkv = std::lcm(blck_size_perf, head_dim); if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) { return std::vector(segments.size(), granularity_qkv); @@ -578,17 +580,24 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str // regular attention const uint32_t n_gqa = hparams.n_gqa(il); const uint32_t n_embd_q = n_gqa * hparams.n_embd_head_k(il); + + // to handle head sizes like 80, only increase granularity while it doesn't cause underutilization + int64_t blck_size_perf = blck_size; + while (blck_size_perf < 128 && blck_size_perf*ud->n_devices < n_embd_q) { + blck_size_perf *= 2; + } + if (std::regex_match(tensor_name, pattern_attn_sinks)) { GGML_ASSERT(segments.size() == 1); - return {std::lcm(n_embd_q, blck_size)/n_embd_q * n_gqa}; + return {std::lcm(n_embd_q, blck_size_perf)/n_embd_q * n_gqa}; } - const int64_t granularity_q = std::lcm(n_embd_q, blck_size); + const int64_t granularity_q = std::lcm(n_embd_q, blck_size_perf); if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_q_bias)) { GGML_ASSERT(segments.size() == 1); // some models have Q gate tensors, for those cases the granularity needs to be doubled: if 
(ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) { - return {std::lcm(2*n_embd_q, blck_size)}; + return {std::lcm(2*n_embd_q, blck_size_perf)}; } return {granularity_q}; } @@ -613,8 +622,9 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str // FFN if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) || std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) { + const int64_t blck_size_perf = std::lcm(blck_size, 128); GGML_ASSERT(segments.size() == 1); - return {blck_size}; + return {blck_size_perf}; } // everything else @@ -627,7 +637,6 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str tensor_config tc = get_tensor_config(); split_state.axis = tc.axis; if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) { - const int64_t ne_full = tensor->ne[split_state.axis]; const int64_t blck_size = ggml_blck_size(tc.tensor_axis_0->type); const float * tensor_split = ud->model->tensor_split(); std::vector tensor_split_scan; @@ -644,7 +653,6 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str const int64_t ne_s = segments[is].first; const uint32_t nr_s = segments[is].second; const int64_t g_s = granularity[is]; - GGML_ASSERT(ne_full % g_s == 0); int64_t low = 0; size_t j = 0; for (; j < ud->n_devices - 1; j++) { From 64086f2b2f222f7032611b021b0e1b6c4767b0f1 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 5 Jun 2026 09:44:59 -0600 Subject: [PATCH 27/71] model, mtmd: Granite4 Vision (#23545) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(convert): Get language model conversion working for 4.1 vision Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat(convert): Skip multimodal tensors for 
GraniteMoeHybrid (vision 4.0) Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Disable vocab padding for non-hybrid models that use GraniteMoeHybrid Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Plumb python-side vision projector names and mappings There are several awkward things here: 1. Most of these are essentially identical to the audio qformer tensors. On the c++ side, that's mapped using the prefix, so the rest of the GGUF name needs to align, but on the python side there's no prefix notion, so they all get duplicated. 2. There are a couple of net-new tensors for vision, in particular PROJ_NORM. In both speech and vision, the QF_PROJ_NORM is qualified as belonging to the qformer portion, but the GGUF name is simply proj_norm which conflicts with the ideal name for this new PROJ_NORM that is not qualified as part of the qformer. To get around this, I used "proj_layernorm" as the GGUF name. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add python side architecture name Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add python-side plumbing for setting FEATURE_LAYERS hparam Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add c++ side tensor naming defines NOTE: Usage of these hasn't been updated to include prefix yet Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat(mtmd): Convert vision_feature_layer to an ordered vector We need to preserve the ordering of these feature index values so that they can be mapped to the sub-tensors within the stacked projectors. 
Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat(mtmd): Add architecture label plumbing Branch: Granite4Vision AI-usage: full (OpenCode + qwen3.5:122b) Signed-off-by: Gabe Goodhart * feat(wip): Add partial conversion for mmproj This handles stacking the projector tensors and setting the new hparams Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add gguf_writer and constant support for new hparams and deepstack layer arr Branch: Granite4Vision AI-usage: draft (OpenCode + qwen3.5:122b) Signed-off-by: Gabe Goodhart * feat: Full conversion for mmproj w/ tensor mappings Branch: Granite4Vision AI-usage: full (OpenCode + qwen3.5:122b) Signed-off-by: Gabe Goodhart * fix: Add lm_head skip for mmproj for 4.0 Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: De-alias text_config architecture in convert_lora_to_gguf.py Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add --trust-remote-code arg to convert_lora_to_gguf.py This defaults to False, but allows a user to enable it programmatically instead of using the interactive prompt. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: De-alias model.language_model. -> model. for lora adapters Branch: Granite4Vision AI-usage: full (OpenCode + qwen3.5:122b) Signed-off-by: Gabe Goodhart * fix: Extend language model tensor dealiasing in adapters Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove unnecessary registration for GraniteSpeech in language model Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Plumb through mm prefix formatting for qformer tensors Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Refactor vision projector tensors to use predictor ID as the block This is cleaner than stacking them. The modeling file hard-codes single-layer qformers, so we can punt on the multiple multi-layer projectors problem.
Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add spatial offsets array hparam conversion Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add stub plumbing for granite vision in mtmd Branch: Granite4Vision AI-usage: draft (OpenCode + qwen3.5:122b) Signed-off-by: Gabe Goodhart * feat: Add new hparam and tensor naming in clip-impl.h New hparams: - KEY_PROJ_SAMPLE_QUERY_SIDE - KEY_PROJ_SAMPLE_WINDOW_SIDE - KEY_PROJ_SPATIAL_OFFSETS New tensors: - TN_MULTI_PROJ_IMG_POS - TN_MULTI_PROJ_QUERY - TN_MULTI_PROJ_LAYERNORM - TN_MULTI_PROJ_LINEAR - TN_MULTI_PROJ_NORM Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Move deepstack_layer_arr to llm hparam instead of mmproj Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove IS_DEEPSTACK_LAYERS This appears to have been added during Qwen3 VL (https://github.com/ggml-org/llama.cpp/pull/16780), but it was never actually used. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: n_deepstack_layers -> deepstack_layer_arr The old logic hard coded a correspondence between the first N layers of the LLM and the 1->N entries in the input embeddings. Now, that relationship is maintained at loading time if the GGUF value is single-valued. If it is multi-valued, it loads directly allowing for deepstack layers to be spaced out throughout the model. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Use try/catch for single/multi valued deepstack info The alternative would be to use get_key_or_arr, but then the single value would be populated through the entire array and we'd need to detect that and update it with the right correspondence.
Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add deepstack injection point for granite LLM The use of ggml_add here assumes that the elements of inp_embd will be pre- arranged to be the full embedding length with only the vision-mask'ed portions non-zero from the projector. This matches how Qwen3VL does it. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: add missing vision attn layernorm eps Branch: Granite4Vision AI-usage: full (OpenCode + Qwen 3.6-35B) Signed-off-by: Gabe Goodhart * refactor: Hoist qformer tensors into qf_block and hold a vector for multi-proj Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Fix missing prefix template for TN_QF_PROJ_LINEAR It's not strictly necessary since vision uses the blockwise version, but it makes the loading consistent. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Add embedding scale and image grid pinpoints hparams in conversion Also remove dead parsing for self._deepstack_layer_arr Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add mtmd KEY_ section for hparams shared with the LLM In this case, we need the EMBEDDING_SCALE so we can unscale the image embeddings to compensate for applying embedding scale to the input embeddings Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Implement c++ hparam parsing Branch: Granite4Vision AI-usage: draft (Claude Code) Co-authored-by: Eli Schwartz Signed-off-by: Gabe Goodhart * fix: Flatten pinpoints in conversion Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Add missing break Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: No reason to have modality prefix for img_pos Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add tensor loading Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix(convert): Fix confusion between 
proj.norm and proj.qformer.layernorm Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Use the right portion of speech for tensor loading! Also plumb through the layernorm -> post_norm naming change Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add logging of deepstack_layers_arr if set I also changed the print_f output type to int32_t to avoid printing overflow values for -1. This could cause overflows on the other side, but I can't imagine a value for any of the current array hparams that would trigger that. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Make sure input embeddings are cont before f_embedding_scale Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add init and mmproj_embd cases for g4v The n_mmproj_embd is 1+ to make space for the text embedding and all 8 projectors Branch: Granite4Vision AI-usage: draft (Bob) Signed-off-by: Gabe Goodhart * fix: Invert (h, w) -> (w, h) pinpoints Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Reorder projectors based on llm index and skip the first injection The multi-projector stack has a strange asymmetry based on how it's currently implemented for qwen3vl: on the mmproj side, it's all N projectors, but the output of the "first" (by inp_embd index) projector is automatically consumed as if it were a standard single-projector mmproj, so the deepstack portion needs to only contain the 1-N entries. 
Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart Co-authored-by: Eli Schwartz * fix: Fix mmproj hparams in conversion Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart Co-authored-by: Eli Schwartz * fix: Fix ordering/logic for deepstack injection in granite Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart Co-authored-by: Eli Schwartz * fix: Fix preprocessing config to match what the model needs Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart Co-authored-by: Eli Schwartz * wip: Partial port of Eli's implementation This is still pretty broken, but it's getting closer. It now happily generates tokens, but the values are quite incorrect still. I suspect it's caused by the mapping of projectors from safetensors to their respective orders here. Also, this implementation breaks encapsulation pretty badly in mtmd_encode. This will need a big refactor to put the G4V-specific encoding logic somewhere more appropriate. Branch: Granite4Vision AI-usage: draft (Claude Code, Bob) Signed-off-by: Gabe Goodhart Co-authored-by: Eli Schwartz * fix: Fix the pre-scaling on the input embeddings to correctly invert the scale We've got tokens! They still don't line up quite right, so something's a little off, but we're getting much closer now. 
Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: invert embedding multiplier -> base_scale at load Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Fix setting image_resize_pad after new enum introduced Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Add G4V to mmproj mapping in conversion Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Re-add padding disable for non-hybrid hybrid models Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Simplify G4V n_tokens computation This is slightly more efficient and flexible for when we implement the unpad cropping. IMO, it's also clearer that it is adding the number of image_newline tokens (embeddings) to the grid, rather than recomputing the entire count. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Add new clip APIs for post-tile-encoding assembly Granite 4 Vision uses llava-next style pack-and-unpad which requires injecting the learned newline after each row of the tile grid. A row here is a single row of the grid which is composed of (grid_x * cols_per_tile) * (grid_y * rows_per_tile), so the result is newlines injected in between individual tile rows, thus not something that can be handled with the standard llava-uhd block-wise encoding. Branch: Granite4Vision AI-usage: draft (Claude Code + Opus 4.7) Signed-off-by: Gabe Goodhart * feat: Add model interfaces for granite 4 vision assembler I'm on the fence about the best organization of this. These free functions allow the per-architecture logic in clip.cpp to access the model-specific graph building, but they still require a fair bit of model-specific logic in clip.cpp which is not ideal. I think a better approach may be to replicate what is done with the graph builders themselves (and possibly even make the assembler part of the model's existing graph builder).
Branch: Granite4Vision AI-usage: full (Claude Code + Opus 4.7) Signed-off-by: Gabe Goodhart * refactor: Remove all g4v-specific branching from mtmd.cpp in favor of clip assembler Branch: Granite4Vision AI-usage: full (Claude Code + Opus 4.7) Signed-off-by: Gabe Goodhart * refactor(mtmd): Consolidate assembler logic into clip_assembler class family Just like `clip_graph` is the base class for building the model-specific encoder graphs, `clip_assembler` will be the base class for building the model-specific assembler graphs. This allows the assembly pattern to follow how the encoder pattern is implemented where the model-specific logic lives in a subclass co-located with the encoder graph builder that gets constructed by a simple factory method. Branch: Granite4Vision AI-usage: full (Claude Code + Opus 4.7) Signed-off-by: Gabe Goodhart * style: Comment improvement Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: granite_vision -> granite4_vision Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove dead codepath for Qwen3VL add_vision_is_deepstack These pieces were never used on the c++ side (removed there in an earlier commit), so this is just cleanup that I missed before. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Oops! 
I did not mean to commit one of my prompt files But now it's too far back in history to effectively rebase out, even with interactive and --rebase-merges :( Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Add missing include for std::find It seems that this was already pulled in on some platforms, but not on others Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Fix Flake8 warnings in granite conversion module Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Remove clip_assembler in favor of clip_image_f32.append_token Per conversation in the PR, the clip_assembler pattern was too invasive. This is a compromise that limits model-specific blocks to add_media where each preprocessed tile is annotated with an injection type, after which all the token counting logic is generic and the newline injection itself is handled in the graph based on the value for the given tile image. Branch: Granite4Vision AI-usage: draft (Bob, OpenCode + Qwen 3.6 35b) Signed-off-by: Gabe Goodhart * refactor(convert): Split n_deepstack_layers and deepstack_layers (array) Branch: Granite4Vision AI-usage: full (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart * refactor(src): Handle n_deepstack_layers and deepstack_layers GGUF keys Branch: Granite4Vision AI-usage: draft (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart * fix: Fix GGUF key for deepstack_layers_arr Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Remove pre-scaling embeddings and skip scaling for raw embd inputs This follows how gemma3 and gemma4 handle embedding scaling by skipping the multiplier for raw input embeddings. 
Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: deepstack_layers(_arr) -> deepstack_mapping(_arr) Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Fully revert changes to n_deepstack_layers and qwen3vl* Since we're going to keep the GGUF KVs separate, it makes sense to just keep the hparams separate too to limit the scope of this branch. The down side is that n_deepstack_layers and deepstack_mapping_arr are potentially conflicting. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Revert removal of "is_deepstack_layers" GGUF KV This KV is not used at all on the c++ side, so it's fully dead, but there's also no need to conflate this cleanup with the addition of G4V. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove unnecessary ggml_cont and build_forward_expand in cbx Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * style: Clean up comments Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Tighter and more flexible code for g4v_build_block This could be refactored to look a lot more like granite-speech, but the overall block constructs before/after the qformer are pretty different, so for now I'm going to leave it as is and just tighten a bit. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove unnecessary `unordered_set` include Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Add architecture guard on deepstack_mapping_arr printout Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove unnecessary AI-gen comment Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Always initialize deepstack_mapping_arr with -1 values This was causing `test-llama-archs` to fail, likely due to trying to save the uninitialized values, then re-loading them. 
It's safer to always initialize so that other models don't forget and end up with undefined behavior. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * style: Remove TODO about block/vs non-block tensor mapping Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Move is_vision_feature_layer logic into clip_hparams Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Use a bool for append_token Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * style: Remove unnecessary comment Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Remove unused get_model api yikes! Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: Rearrange helpers for g4v to be private members and use build_attn Branch: Granite4Vision AI-usage: full (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart * fix: Fix off-by-one in vision layer index This was inherited from the Claude Code implementation that pushed the negative index inversion down into the model file. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Fix norm/post_norm mixup in conversion face. palm. 
:( Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * style: More descriptive tensor names Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * fix: Apply PR cleanup for new conversion changes AI-usage: none Signed-off-by: Gabe Goodhart Co-authored-by: Sigbjørn Skjæret * fix(convert): Remove duplicate V_ENC_EMBD_IMGNL Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * refactor: append_token -> add_newline Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * style: Comment cleanup Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart * feat: Cleaner error handling/checking NOTE: format_string is not available in granite.cpp (and including clip-impl.h to get it doesn't compile, so I think it violates the intended encapsulation), so std::stringstream is the simplest answer. Branch: Granite4Vision AI-usage: none Signed-off-by: Gabe Goodhart --------- Signed-off-by: Gabe Goodhart --- conversion/__init__.py | 1 + conversion/granite.py | 158 +++++++++++- convert_lora_to_gguf.py | 16 +- gguf-py/gguf/constants.py | 82 ++++++- gguf-py/gguf/gguf_writer.py | 20 ++ gguf-py/gguf/tensor_mapping.py | 90 ++++++- src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-graph.cpp | 7 +- src/llama-hparams.h | 10 + src/llama-model-loader.cpp | 1 + src/llama-model-saver.cpp | 1 + src/llama-model.cpp | 15 +- src/models/granite.cpp | 37 +++ tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 73 +++--- tools/mtmd/clip-model.h | 35 ++- tools/mtmd/clip.cpp | 260 ++++++++++++++++---- tools/mtmd/clip.h | 2 - tools/mtmd/models/granite-speech.cpp | 10 +- tools/mtmd/models/granite4-vision.cpp | 339 ++++++++++++++++++++++++++ tools/mtmd/models/llava.cpp | 5 +- tools/mtmd/models/models.h | 23 ++ tools/mtmd/mtmd.cpp | 29 ++- 24 files changed, 1103 insertions(+), 114 deletions(-) create mode 100644 tools/mtmd/models/granite4-vision.cpp diff --git a/conversion/__init__.py b/conversion/__init__.py index 
2c79580f8a3..c670798fc2b 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -253,6 +253,7 @@ "Glm4vMoeForConditionalGeneration": "qwen3vl", "GlmOcrForConditionalGeneration": "qwen3vl", "GlmasrModel": "ultravox", + "Granite4VisionForConditionalGeneration": "granite", "GraniteSpeechForConditionalGeneration": "granite", "HunYuanVLForConditionalGeneration": "hunyuan", "Idefics3ForConditionalGeneration": "smolvlm", diff --git a/conversion/granite.py b/conversion/granite.py index 647269ba740..53441fe5701 100644 --- a/conversion/granite.py +++ b/conversion/granite.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import Any, Callable, Iterable, TYPE_CHECKING import torch @@ -13,7 +14,7 @@ from .mamba import Mamba2Model -@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -46,11 +47,29 @@ def set_gguf_parameters(self): self.gguf_writer.add_logit_scale(logits_scale) logger.info("gguf: (granite) logits_scale = %s", logits_scale) + # If being used as the base for Granite4 Vision, add deepstack_layer_arr + if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"): + normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams) + deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels + for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map): + # Skip the first projector which is handled as the base embedding + # stream like normal + if proj_idx == 0: + continue + deepstack_mapping_arr[llm_layer] = proj_idx + self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr) + @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: name, gen = item - if 
name.startswith("encoder."): - return None + # Skip multimodal tensors + if ( + name.startswith(("encoder.")) + or "image_" in name + or "layerwise_projectors" in name + or "spatial_projectors" in name + ): + return return super().filter_tensors(item) @@ -241,7 +260,8 @@ def set_gguf_parameters(self): assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" def set_vocab(self): - self.hparams["pad_vocab_size_multiple"] = 8 + # For models with no ssm layers, don't pad for mamba2 + self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1 Mamba2Model.set_vocab(self) @@ -326,3 +346,133 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.squeeze(1) yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Granite4VisionForConditionalGeneration") +class Granite4VisionMmprojModel(MmprojModel): + has_vision_encoder = True + has_audio_encoder = False + + @staticmethod + def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]: + """Normalize both deepstack and spatial projector maps to the form: + (vision_layer, llm_layer, , type_index) + + This is then used to populate the following mappings: + - vision_feature_layers (mmproj hparam): ordered list of all + vision_layer values where order corresponds with the order of the + stacked projector tensors + NOTE: Values may appear multiple times for spatial projectors + - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to + the index of the corresponding projector in the stacked tensors + - deepstack_layer_arr (llm hparam): per-text-layer array indicating + which input vision feature should be injected at that layer + (-1 if none) + + Output: (vision_layer, llm_layer, , type_index) + """ + deepstack_map = global_config.get("deepstack_layer_map", []) # [[vis_layer, llm_layer], ...] 
+ spatial_layers = global_config.get("spatial_target_layers", []) # [llm_layer, ...] + n_text_layers = global_config["text_config"]["num_hidden_layers"] + n_vision_layers = global_config["vision_config"]["num_hidden_layers"] + normalized_projector_map = [] + if deepstack_map: + for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)): + if vision_layer < 0: + vision_layer = n_vision_layers + vision_layer + if llm_layer < 0: + llm_layer = n_text_layers + llm_layer + normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx)) + if spatial_layers: + spatial_vision_layer = global_config.get("spatial_vision_layer", -1) + if spatial_vision_layer < 0: + spatial_vision_layer = n_vision_layers + spatial_vision_layer + for spatial_idx, llm_layer in enumerate(spatial_layers): + normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx)) + return list(sorted(normalized_projector_map, key=(lambda entry: entry[1]))) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + normalized_projector_map = self.get_normalized_projector_map(self.global_config) + self._n_proj = len(normalized_projector_map) + + self._tensor_prefix_map = { + f"model.{proj_type}_projectors.{type_idx}": proj_idx + for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map) + } + self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map] + self._spatial_offsets = [ + type_idx if proj_type == "spatial" else -1 + for _, _, proj_type, type_idx in normalized_projector_map + ] + + def set_gguf_parameters(self): + assert self.hparams_vision is not None + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION) + + # SigLIP encoder hparams + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + + # 
Preprocessor + self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384)) + + # QFormer projector config + ds_rate = self.global_config["downsample_rate"] + ds_parts = ds_rate.split("/") + assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}" + query_side, window_side = [int(p) for p in ds_parts] + self.gguf_writer.add_vision_projector_query_side(query_side) + self.gguf_writer.add_vision_projector_window_side(window_side) + + # Set vision feature layers + self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers) + + # Set the spatial offests per projector + self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets) + + # Add flattened image grind pinpoints (resolution candidates internally) + if pinpoints := self.global_config.get("image_grid_pinpoints"): + # Flatten with h, w -> w, h inversion + pinpoints = [val for h, w in pinpoints for val in (w, h)] + self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + if ("vision_model.head" in name or name.startswith("lm_head")): + return None + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Detect projector tensors and bin them + projector_idx = None + for prefix, proj_idx in self._tensor_prefix_map.items(): + if name.startswith(prefix): + projector_idx = proj_idx + break + if projector_idx is not None: + # If this projector tensor has a block id within the projector, + # alias the bid to projector_idx + # + # TODO: currently, none of the Granite 4 Vision models have + # projectors with multiple QFormer layers, so the `layer.{}` index + # is always 0. This allows us to simply map to a single `bid` that + # matches the projector index. If this changes, we'll need a + # convention that merges the two IDs. 
+ id_matches = list(re.finditer(r"\.([0-9]+)\.", name)) + all_ids = [int(m.group(1)) for m in id_matches] + assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names" + # If not layer id, just use the projector index + new_bid = projector_idx + if len(all_ids) == 1: + new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:] + else: # len(all_ids) == 2 + new_bid = projector_idx # + all_ids[1] + new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:] + yield from super().modify_tensors(data_torch, new_name, new_bid) + return + yield from super().modify_tensors(data_torch, name, bid) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 9a6437beab1..45202b33387 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace: "--base-model-id", type=str, help="the model ID of the base model, if it is not available locally or in the adapter config. 
If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", ) + parser.add_argument( + "--trust-remote-code", default=False, action="store_true", + help="trust remote code in the model", + ) parser.add_argument( "lora_path", type=Path, help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", @@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]: +def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]: from huggingface_hub import try_to_load_from_cache # normally, adapter does not come with base model config, we need to load it from AutoConfig - config = AutoConfig.from_pretrained(hf_model_id) + config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code) cache_dir = try_to_load_from_cache(hf_model_id, "config.json") cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None @@ -372,13 +376,13 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None] # load base model if base_model_id is not None: logger.info(f"Loading base model from Hugging Face: {base_model_id}") - hparams, dir_base_model = load_hparams_from_hf(base_model_id) + hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code) elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") try: - hparams, dir_base_model = load_hparams_from_hf(model_id) + hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code) except OSError as e: logger.error(f"Failed to load base model config: {e}") logger.error("Please try downloading the base model and add its 
path to --base") @@ -393,7 +397,9 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None] with torch.inference_mode(): try: - model_class = get_model_class(hparams["architectures"][0]) + model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0] + logger.info("Using model architecture: %s", model_arch) + model_class = get_model_class(model_arch) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ce556ec9b65..814980ce508 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -128,6 +128,7 @@ class LLM: MOE_LATENT_SIZE = "{arch}.moe_latent_size" NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers" NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers" + DEEPSTACK_MAPPING = "{arch}.deepstack_mapping" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" @@ -325,6 +326,8 @@ class ClipVision: WA_PATTERN_MODE = "clip.vision.wa_pattern_mode" # used by mimovl, per-layer -1/0/1 IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" WINDOW_SIZE = "clip.vision.window_size" + FEATURE_LAYERS = "clip.vision.feature_layer" # Granite4 Vision + IMAGE_GRID_PINPOINTS = "clip.vision.image_grid_pinpoints" # Granite4 Vision class Attention: HEAD_COUNT = "clip.vision.attention.head_count" @@ -333,6 +336,9 @@ class Attention: class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" + QUERY_SIDE = "clip.vision.projector.query_side" + WINDOW_SIDE = "clip.vision.projector.window_side" + SPATIAL_OFFSETS = "clip.vision.projector.spatial_offsets" class SAM: BLOCK_COUNT = "clip.vision.sam.block_count" @@ -821,6 +827,31 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2 V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2 + # qformer projector (vision) - Granite4 Vision + 
V_QF_PROJ_QUERY = auto() + V_QF_PROJ_NORM = auto() + V_QF_PROJ_LINEAR = auto() + V_QF_SELF_ATTN_Q = auto() + V_QF_SELF_ATTN_K = auto() + V_QF_SELF_ATTN_V = auto() + V_QF_SELF_ATTN_O = auto() + V_QF_SELF_ATTN_NORM = auto() + V_QF_CROSS_ATTN_Q = auto() + V_QF_CROSS_ATTN_K = auto() + V_QF_CROSS_ATTN_V = auto() + V_QF_CROSS_ATTN_O = auto() + V_QF_CROSS_ATTN_NORM = auto() + V_QF_FFN_UP = auto() + V_QF_FFN_DOWN = auto() + V_QF_FFN_NORM = auto() + V_PROJ_NORM = auto() + # multi-projector (bid => projector id) - Granite4 vision + V_MULTI_PROJ_IMG_POS = auto() + V_MULTI_PROJ_QUERY = auto() + V_MULTI_PROJ_NORM = auto() + V_MULTI_PROJ_LINEAR = auto() + V_MULTI_PROJ_POST_NORM = auto() + # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_EMBD_NORM = auto() @@ -885,7 +916,7 @@ class MODEL_TENSOR(IntEnum): A_CTC_OUT = auto() A_CTC_OUT_MID = auto() A_ENC_ATTN_REL_POS_EMB = auto() - # qformer projector + # audio qformer projector A_QF_PROJ_QUERY = auto() A_QF_PROJ_NORM = auto() A_QF_PROJ_LINEAR = auto() @@ -1337,10 +1368,33 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", - MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR + MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR, Granite4Vision MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2 MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2 + # Granite4 Vision + # qformer layers (bid => proj_id) + # NOTE: Names align with A_QF_* + MODEL_TENSOR.V_QF_SELF_ATTN_Q: "v.proj_blk.{bid}.self_attn_q", + MODEL_TENSOR.V_QF_SELF_ATTN_K: "v.proj_blk.{bid}.self_attn_k", + MODEL_TENSOR.V_QF_SELF_ATTN_V: "v.proj_blk.{bid}.self_attn_v", + MODEL_TENSOR.V_QF_SELF_ATTN_O: "v.proj_blk.{bid}.self_attn_out", + MODEL_TENSOR.V_QF_SELF_ATTN_NORM: "v.proj_blk.{bid}.self_attn_norm", + 
MODEL_TENSOR.V_QF_CROSS_ATTN_Q: "v.proj_blk.{bid}.cross_attn_q", + MODEL_TENSOR.V_QF_CROSS_ATTN_K: "v.proj_blk.{bid}.cross_attn_k", + MODEL_TENSOR.V_QF_CROSS_ATTN_V: "v.proj_blk.{bid}.cross_attn_v", + MODEL_TENSOR.V_QF_CROSS_ATTN_O: "v.proj_blk.{bid}.cross_attn_out", + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: "v.proj_blk.{bid}.cross_attn_norm", + MODEL_TENSOR.V_QF_FFN_UP: "v.proj_blk.{bid}.ffn_up", + MODEL_TENSOR.V_QF_FFN_DOWN: "v.proj_blk.{bid}.ffn_down", + MODEL_TENSOR.V_QF_FFN_NORM: "v.proj_blk.{bid}.ffn_norm", + # multi-projector (bid => projector ID) + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: "v.proj_blk.{bid}.img_pos", + MODEL_TENSOR.V_MULTI_PROJ_QUERY: "v.proj_blk.{bid}.query", + MODEL_TENSOR.V_MULTI_PROJ_NORM: "v.proj_blk.{bid}.norm", + MODEL_TENSOR.V_MULTI_PROJ_LINEAR: "v.proj_blk.{bid}.linear", + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: "v.proj_blk.{bid}.post_norm", + # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1522,6 +1576,29 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_SAM_NET_3, MODEL_TENSOR.V_RESMPL_QUERY_768, MODEL_TENSOR.V_RESMPL_QUERY_1024, + MODEL_TENSOR.V_PROJ_NORM, + MODEL_TENSOR.V_QF_PROJ_QUERY, + MODEL_TENSOR.V_QF_PROJ_NORM, + MODEL_TENSOR.V_QF_PROJ_LINEAR, + MODEL_TENSOR.V_QF_SELF_ATTN_Q, + MODEL_TENSOR.V_QF_SELF_ATTN_K, + MODEL_TENSOR.V_QF_SELF_ATTN_V, + MODEL_TENSOR.V_QF_SELF_ATTN_O, + MODEL_TENSOR.V_QF_SELF_ATTN_NORM, + MODEL_TENSOR.V_QF_CROSS_ATTN_Q, + MODEL_TENSOR.V_QF_CROSS_ATTN_K, + MODEL_TENSOR.V_QF_CROSS_ATTN_V, + MODEL_TENSOR.V_QF_CROSS_ATTN_O, + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM, + MODEL_TENSOR.V_QF_FFN_UP, + MODEL_TENSOR.V_QF_FFN_DOWN, + MODEL_TENSOR.V_QF_FFN_NORM, + MODEL_TENSOR.V_QF_PROJ_NORM, + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS, + MODEL_TENSOR.V_MULTI_PROJ_QUERY, + MODEL_TENSOR.V_MULTI_PROJ_LINEAR, + MODEL_TENSOR.V_MULTI_PROJ_NORM, + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -4388,6 +4465,7 @@ 
class VisionProjectorType: MINICPMV4_6 = "minicpmv4_6" GRANITE_SPEECH = "granite_speech" # audio MIMOVL = "mimovl" + GRANITE4_VISION = "granite4_vision" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 875d0f73d96..182c9c54a53 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -959,8 +959,13 @@ def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) def add_num_deepstack_layers(self, count: int) -> None: + """Add scalar deepstack layer count (qwen3vl format)""" self.add_uint32(Keys.LLM.NUM_DEEPSTACK_LAYERS.format(arch=self.arch), count) + def add_deepstack_mapping(self, layers: Sequence[int]) -> None: + """Add per-layer deepstack projector indices (Granite4 Vision format)""" + self.add_array(Keys.LLM.DEEPSTACK_MAPPING.format(arch=self.arch), list(layers)) + def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) @@ -1184,6 +1189,15 @@ def add_vision_preproc_min_tiles(self, value: int) -> None: def add_vision_preproc_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value) + def add_vision_projector_query_side(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.QUERY_SIDE, value) + + def add_vision_projector_window_side(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.WINDOW_SIDE, value) + + def add_vision_spatial_offsets(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.Projector.SPATIAL_OFFSETS, layers) + def add_vision_image_mean(self, values: Sequence[float]) -> None: self.add_array(Keys.ClipVision.IMAGE_MEAN, values) @@ -1240,6 +1254,12 @@ def add_vision_wa_pattern_mode(self, modes: Sequence[int]) -> None: def add_vision_window_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value) + def 
add_vision_feature_layers(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.FEATURE_LAYERS, layers) + + def add_vision_image_grid_pinpoints(self, layers: Sequence[Sequence[int]]) -> None: + self.add_array(Keys.ClipVision.IMAGE_GRID_PINPOINTS, layers) + def add_vision_sam_layers_count(self, value: int) -> None: self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 82f26e7b303..3e63b216505 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1408,6 +1408,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "model.vision_tower.vision_model.embeddings.patch_embedding", # Granite4Vision "vision_tower.vision_model.embeddings.patch_embedding", "model.vision_tower.embeddings.patch_embedding", # minicpmv4_6 "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 @@ -1439,6 +1440,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_EMBD_POS: ( + "model.vision_tower.vision_model.embeddings.position_embedding", # Granite4Vision "vision_tower.vision_model.embeddings.position_embedding", "model.vision_tower.embeddings.position_embedding", # minicpmv4_6 "model.vision_tower.embeddings.position_embeddings", # Intern-S1 @@ -1456,8 +1458,9 @@ class TensorNameMap: "model.vision_embedder.pos_embedding", # gemma4 unified ), + # TODO: I think these should all be moved to mapping_cfg? 
MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( - "model.image_newline", # Deepseek-OCR + "model.image_newline", # Deepseek-OCR, Granite4Vision "vit.perceive.image_newline", # HunyuanVL ), @@ -1477,6 +1480,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_Q: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.q_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 @@ -1502,6 +1506,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_K: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.k_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 @@ -1527,6 +1532,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_V: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.v_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 @@ -1545,6 +1551,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", "model.vision_tower.encoder.layers.{bid}.layer_norm1", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL @@ -1567,6 +1574,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_O: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.out_proj", # minicpmv4_6 
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL @@ -1595,6 +1603,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", "model.vision_tower.encoder.layers.{bid}.layer_norm2", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL @@ -1618,6 +1627,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_FFN_UP: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "model.vision_tower.encoder.layers.{bid}.mlp.fc1", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 @@ -1649,6 +1659,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "model.vision_tower.encoder.layers.{bid}.mlp.fc2", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 @@ -1706,6 +1717,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_POST_NORM: ( + "model.vision_tower.vision_model.post_layernorm", # Granite4Vision "vision_tower.vision_model.post_layernorm", "model.vision_tower.post_layernorm", # minicpmv4_6 "model.vision_model.post_layernorm", # SmolVLM @@ -1952,6 +1964,82 @@ class TensorNameMap: "model.vision_tower.std_scale", # gemma4 ), + # For these tensors, bid => projector ID + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: ( + "model.layerwise_projectors.{bid}.image_positions", # Granite4 Vision + "model.spatial_projectors.{bid}.image_positions", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_QUERY: ( + "model.layerwise_projectors.{bid}.query", # Granite4 Vision + "model.spatial_projectors.{bid}.query", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_LINEAR: ( + "model.layerwise_projectors.{bid}.out_linear", # Granite4 Vision + 
"model.spatial_projectors.{bid}.out_linear", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_NORM: ( + "model.layerwise_projectors.{bid}.norm", # Granite4 Vision + "model.spatial_projectors.{bid}.norm", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: ( + "model.layerwise_projectors.{bid}.qformer.layernorm", # Granite4 Vision + "model.spatial_projectors.{bid}.qformer.layernorm", # Granite4 Vision + ), + + # For these tensors, bid => proj-id + MODEL_TENSOR.V_QF_SELF_ATTN_Q: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_K: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_V: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_O: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_Q: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_K: ( + 
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_V: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_O: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_UP: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_DOWN: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fea898deaf2..52963f8f1ed 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -196,6 +196,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" }, { 
LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" }, { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" }, + { LLM_KV_DEEPSTACK_MAPPING, "%s.deepstack_mapping" }, { LLM_KV_HIDDEN_ACT, "%s.hidden_activation" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index f364f6b0bae..dc9bca9bfc6 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -200,6 +200,7 @@ enum llm_kv { LLM_KV_MOE_LATENT_SIZE, LLM_KV_NEXTN_PREDICT_LAYERS, LLM_KV_NUM_DEEPSTACK_LAYERS, + LLM_KV_DEEPSTACK_MAPPING, LLM_KV_HIDDEN_ACT, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 172edf24cb1..3b8125cde7b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1859,7 +1859,12 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { res->t_inp_embd = cur; // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { + // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be + // multimodal inputs that should not be scaled. + if (ubatch.token && hparams.f_embedding_scale != 0.0f) { + if (!ggml_is_contiguous(cur)) { + cur = ggml_cont(ctx0, cur); + } cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index fde6183e878..87db4a0dd30 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -219,8 +219,18 @@ struct llama_hparams { uint32_t indexer_top_k = 0; // qwen3vl deepstack + // When parsed from GGUF, this implies the first N layers consume the first + // N deepstack embeddings. Use deepstack_mapping_arr if you need a more + // complex mapping. If using deepstack_mapping_arr, also make sure to set + // n_deepstack_layers to the number of unique deepstack layers so that + // n_embd_imp is accurate (see granite.cpp). 
uint32_t n_deepstack_layers = 0; + // deepstack layer array (Granite4 Vision) + // -1 => no deepstack + // >=0 => input embedding index for deepstack injection + std::array deepstack_mapping_arr; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ba08a19ac76..0d1cf3cc33b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -393,6 +393,7 @@ namespace GGUFMeta { } template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index b0522878090..67d4a9df0f0 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -229,6 +229,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn); add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); + add_kv(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 784deb70aff..6808ad044c7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1100,6 +1100,9 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); + // Populate deepstack_mapping_arr - initialized to -1 (no deepstack) + std::fill(hparams.deepstack_mapping_arr.begin(), 
hparams.deepstack_mapping_arr.end(), -1); + // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -1678,10 +1681,10 @@ uint64_t llama_model::n_elements() const { void llama_model::print_info() const { const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train); - auto print_f = [](const std::function & f, uint32_t n) { + auto print_f = [](const std::function & f, uint32_t n) { bool is_var = false; - std::vector v; + std::vector v; for (uint32_t i = 0; i < n; ++i) { v.push_back(f(i)); if (v[i] != v[0]) { @@ -1755,6 +1758,14 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + if (arch == LLM_ARCH_GRANITE && + std::any_of(hparams.deepstack_mapping_arr.begin(), + hparams.deepstack_mapping_arr.end(), + [](const auto & entry) { return entry >= 0; })) { + LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, + print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, + hparams.n_layer).c_str()); + } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]); diff --git a/src/models/granite.cpp b/src/models/granite.cpp index 7aff942da01..4a75c5ff3cc 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -1,5 +1,7 @@ #include "models.h" +#include + void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -7,6 +9,27 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { 
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false); ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false); + // Granite4 Vision uses array deepstack_mapping + ml.get_arr(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr, false); + + // Count the unique deepstack input indices + std::unordered_set unique_deepstack_idxs; + for (const auto val : hparams.deepstack_mapping_arr) { + if (val >= 0) { + unique_deepstack_idxs.insert(val); + } + } + hparams.n_deepstack_layers = unique_deepstack_idxs.size(); + + // Ensure all values are valid (avoid overflow attacks) + for (const auto val : unique_deepstack_idxs) { + if (val > hparams.n_deepstack_layers) { + std::stringstream ss; + ss << "Invalid deepstack index: " << val << " > " << hparams.n_deepstack_layers; + throw std::runtime_error(ss.str()); + } + } + // Granite uses rope_finetuned as a switch for rope, so default to true bool rope_finetuned = true; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -112,6 +135,20 @@ llama_model_granite::graph::graph( ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + + // Granite Vision 4.1 deepstack: inject the projector stream that + // targets decoder layer `il` before the decoder runs. 
+ // NOTE: skip the first deepstack layer since that's inpL + const auto & deepstack_emb_idx = hparams.deepstack_mapping_arr[il]; + if (il > 0 && deepstack_emb_idx >= 0) { + ggml_tensor * ds = ggml_view_2d(ctx0, + res->t_inp_embd, n_embd, n_tokens, + res->t_inp_embd->nb[1], + deepstack_emb_idx * n_embd * sizeof(float)); + inpL = ggml_add(ctx0, inpL, ds); + cb(inpL, "deepstack_in", il); + } + ggml_tensor * inpSA = inpL; // norm diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 93f005652b7..20c53178634 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(mtmd models/gemma4uv.cpp models/glm4v.cpp models/granite-speech.cpp + models/granite4-vision.cpp models/hunyuanvl.cpp models/internvl.cpp models/kimivl.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c055cfb7541..393e085f71e 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -35,20 +35,22 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" // vision-specific -#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" -#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" -#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" -#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" -#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_FEATURE_LAYER "clip.vision.feature_layer" -#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" -#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" -#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities 
+#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" +#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" +#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" +#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side" +#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side" +#define KEY_PROJ_SPATIAL_OFFSETS "clip.vision.projector.spatial_offsets" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -72,7 +74,6 @@ #define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate" #define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count" - // // tensor name constants // @@ -210,22 +211,28 @@ #define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s" #define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb" // qformer projector -#define TN_QF_PROJ_QUERY "a.proj_query" -#define TN_QF_PROJ_NORM "a.proj_norm.%s" -#define TN_QF_PROJ_LINEAR "a.proj_linear.%s" -#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s" -#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s" -#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s" -#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s" -#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s" -#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s" -#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s" -#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s" -#define 
TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s" -#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s" -#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s" -#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s" -#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s" +#define TN_QF_PROJ_QUERY "%s.proj_query" +#define TN_QF_PROJ_NORM "%s.proj_norm.%s" +#define TN_QF_PROJ_LINEAR "%s.proj_linear.%s" +#define TN_QF_SELF_ATTN_Q "%s.proj_blk.%d.self_attn_q.%s" +#define TN_QF_SELF_ATTN_K "%s.proj_blk.%d.self_attn_k.%s" +#define TN_QF_SELF_ATTN_V "%s.proj_blk.%d.self_attn_v.%s" +#define TN_QF_SELF_ATTN_O "%s.proj_blk.%d.self_attn_out.%s" +#define TN_QF_SELF_ATTN_N "%s.proj_blk.%d.self_attn_norm.%s" +#define TN_QF_CROSS_ATTN_Q "%s.proj_blk.%d.cross_attn_q.%s" +#define TN_QF_CROSS_ATTN_K "%s.proj_blk.%d.cross_attn_k.%s" +#define TN_QF_CROSS_ATTN_V "%s.proj_blk.%d.cross_attn_v.%s" +#define TN_QF_CROSS_ATTN_O "%s.proj_blk.%d.cross_attn_out.%s" +#define TN_QF_CROSS_ATTN_N "%s.proj_blk.%d.cross_attn_norm.%s" +#define TN_QF_FFN_UP "%s.proj_blk.%d.ffn_up.%s" +#define TN_QF_FFN_DOWN "%s.proj_blk.%d.ffn_down.%s" +#define TN_QF_FFN_NORM "%s.proj_blk.%d.ffn_norm.%s" +// multi-projector qformer (bid => projector ID) +#define TN_MULTI_PROJ_IMG_POS "v.proj_blk.%d.img_pos" +#define TN_MULTI_PROJ_QUERY "%s.proj_blk.%d.query" +#define TN_MULTI_PROJ_LINEAR "%s.proj_blk.%d.linear.%s" +#define TN_MULTI_PROJ_NORM "%s.proj_blk.%d.norm.%s" +#define TN_MULTI_PROJ_POST_NORM "%s.proj_blk.%d.post_norm.%s" // gemma4 audio conformer #define TN_A_MM_INP_PROJ "mm.a.input_projection.%s" @@ -354,6 +361,7 @@ enum projector_type { PROJECTOR_TYPE_MINICPMV4_6, PROJECTOR_TYPE_GRANITE_SPEECH, PROJECTOR_TYPE_MIMOVL, + PROJECTOR_TYPE_GRANITE4_VISION, PROJECTOR_TYPE_UNKNOWN, }; @@ -407,6 +415,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, { PROJECTOR_TYPE_MIMOVL, "mimovl"}, + { PROJECTOR_TYPE_GRANITE4_VISION, 
"granite4_vision"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -438,6 +447,8 @@ struct clip_image_f32 { // marks the global view in e.g., DeepSeek-OCR Models bool add_viewsep = false; + // whether a learned newline token should be appended after the image (eg Granite4 Vision) + bool add_newline = false; }; // diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 238f805a9aa..48796b6306f 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -4,6 +4,7 @@ #include "clip.h" #include "clip-impl.h" +#include #include #include #include @@ -90,7 +91,7 @@ struct clip_hparams { float eps = 1e-6; float rope_theta = 0.0; - std::unordered_set vision_feature_layer; + std::vector vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) @@ -101,6 +102,11 @@ struct clip_hparams { int32_t sam_n_head = 0; int32_t sam_n_embd = 0; + // Granite4 Vision + std::vector proj_spatial_offsets; + int32_t downsample_query_side = 0; // 0 until loaded from the projector query_side key + int32_t downsample_window_side = 0; // 0 until loaded from the projector window_side key + // audio int32_t n_mel_bins = 0; // whisper preprocessor int32_t proj_stack_factor = 0; // ultravox @@ -158,6 +164,10 @@ struct clip_hparams { return false; } + + bool is_vision_feature_layer(int32_t layer) const { + return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end(); + } }; struct clip_layer { @@ -325,6 +335,20 @@ struct yasa2_stage { std::vector blocks; }; +// QFormer projector block for models with 1 (or more) QFormer projectors +// Granite Speech, Granite4 Vision +struct qf_block { + ggml_tensor * qf_proj_query = nullptr; + ggml_tensor * qf_proj_norm_w = nullptr; + ggml_tensor * qf_proj_norm_b = nullptr; + ggml_tensor * qf_proj_linear_w = nullptr; 
+ ggml_tensor * qf_proj_linear_b = nullptr; + ggml_tensor * qf_proj_post_norm_w = nullptr; + ggml_tensor * 
qf_proj_post_norm_b = nullptr; + ggml_tensor * qf_proj_img_pos = nullptr; // Vision only + std::vector qf_proj_layers; +}; + struct clip_model { clip_modality modality = CLIP_MODALITY_VISION; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -589,13 +613,8 @@ struct clip_model { ggml_tensor * ctc_out_b = nullptr; ggml_tensor * ctc_out_mid_w = nullptr; ggml_tensor * ctc_out_mid_b = nullptr; - // qformer projector - ggml_tensor * qf_proj_query = nullptr; - ggml_tensor * qf_proj_norm_w = nullptr; - ggml_tensor * qf_proj_norm_b = nullptr; - ggml_tensor * qf_proj_linear_w = nullptr; - ggml_tensor * qf_proj_linear_b = nullptr; - std::vector qf_proj_layers; + // qformer projector(s) + std::vector qf_proj_blocks; bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 80136ed8667..c12c910a1c8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -997,6 +997,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1234,12 +1238,7 @@ struct clip_model_loader { // to form the final visual features. // NOTE: gguf conversions should standardize the values of the vision feature layer to // be non-negative, since we use -1 to mark values as unset here. 
- std::vector vision_feature_layer; - get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false); - // convert std::vector to std::unordered_set - for (auto & layer : vision_feature_layer) { - hparams.vision_feature_layer.insert(layer); - } + get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false); // model-specific params switch (model.proj_type) { @@ -1627,6 +1626,23 @@ struct clip_model_loader { hparams.image_pad_color = {127, 127, 127}; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // SigLIP tower. + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_resize_pad = PAD_CEIL; + + get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer); + get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets); + if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) { + throw std::runtime_error(string_format("%s: vision_feature_layer.size() %zu != proj_spatial_offsets.size() %zu", + __func__, hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size())); + } + + get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side); + get_u32(KEY_PROJ_SAMPLE_WINDOW_SIDE, hparams.downsample_window_side); + hparams.warmup_image_size = hparams.image_size; + } break; default: throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str())); } @@ -2628,47 +2644,106 @@ struct clip_model_loader { layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); } - model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY); - model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight")); - model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias")); - model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight")); - model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias")); + model.qf_proj_blocks.resize(1); + auto & qf = 
model.qf_proj_blocks[0];
+ qf.qf_proj_query = get_tensor(string_format(TN_QF_PROJ_QUERY, prefix)); + qf.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "weight")); + qf.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "bias")); + qf.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "weight")); + qf.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "bias")); const int n_proj_layers = 2; - model.qf_proj_layers.resize(n_proj_layers); + qf.qf_proj_layers.resize(n_proj_layers); for (int il = 0; il < n_proj_layers; ++il) { - auto & pl = model.qf_proj_layers[il]; - - pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight")); - pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias")); - pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight")); - pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias")); - pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight")); - pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias")); - pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight")); - pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias")); - pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight")); - pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias")); - - pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight")); - pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias")); - pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight")); - pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias")); - pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight")); - pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias")); - pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight")); - pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias")); - 
pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight")); - pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias")); - - pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight")); - pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, il, "bias")); - pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight")); - pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias")); - pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight")); - pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias")); + auto & pl = qf.qf_proj_layers[il]; + + pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "weight")); + pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "bias")); + pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "weight")); + pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "bias")); + pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "weight")); + pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "bias")); + pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "weight")); + pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "bias")); + pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "weight")); + pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "bias")); + + pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "weight")); + pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "bias")); + pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "weight")); + pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "bias")); + pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "weight")); + pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "bias")); + pl.cross_attn_o_w = 
get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "weight")); + pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "bias")); + pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "weight")); + pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "bias")); + + pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "weight")); + pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "bias")); + pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "weight")); + pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "bias")); + pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "weight")); + pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "bias")); } } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // image_newline lives at the top-level. + model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + + // Load separate layerwise and spatial projector tensors + const int projector_count = (int) hparams.vision_feature_layer.size(); + model.qf_proj_blocks.resize(projector_count); + for (int bid = 0; bid < projector_count; ++bid) { // int (not size_t): bid is formatted with %d in the tensor-name macros + auto & b = model.qf_proj_blocks[bid]; + + // non-layerwise tensors + b.qf_proj_img_pos = get_tensor(string_format(TN_MULTI_PROJ_IMG_POS, bid)); + b.qf_proj_query = get_tensor(string_format(TN_MULTI_PROJ_QUERY, prefix, bid)); + b.qf_proj_linear_w = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "weight")); + b.qf_proj_linear_b = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "bias")); + b.qf_proj_norm_w = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "weight")); + b.qf_proj_norm_b = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "bias")); + b.qf_proj_post_norm_w = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "weight")); + 
b.qf_proj_post_norm_b = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "bias")); +
// layerwise tensors + // NOTE: If any model uses multi-layer qformers, this will need to change + b.qf_proj_layers.resize(1); + auto & pl = b.qf_proj_layers[0]; + + pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "weight")); + pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "bias")); + pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "weight")); + pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "bias")); + pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "weight")); + pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "bias")); + pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "weight")); + pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "bias")); + pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "weight")); + pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "bias")); + + pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "weight")); + pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "bias")); + pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "weight")); + pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "bias")); + pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "weight")); + pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "bias")); + pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "weight")); + pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "bias")); + pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "weight")); + pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "bias")); + + pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "weight")); + pl.ff_up_b = 
get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "bias")); + pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "weight")); + pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "bias")); + pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "weight")); + pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "bias")); + } + + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -3085,10 +3160,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->model.image_newline; -} - void clip_free(clip_ctx * ctx) { if (ctx == nullptr) { return; @@ -3397,6 +3468,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im const int ds = ctx->model.hparams.audio_proj_downsample_rate; n_patches = ((img->nx + ws - 1) / ws) * (ws / ds); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // Per-tile output token count: each projector block outputs + // query_side^2 tokens per window × n^2 windows. + // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144. + const int window_side = ctx->model.hparams.downsample_window_side; + const int query_side = ctx->model.hparams.downsample_query_side; + const int side = img->nx / params.patch_size; + const int n = side / window_side; + n_patches = (query_side * n) * (query_side * n); + if (img->add_newline) { + // For single-tile case: append 1 newline row. + // For multi-tile rowwise: handled by caller, but here we + // report the per-tile count including one trailing newline. 
+ n_patches += 1; + } + } break; default: GGML_ABORT("unsupported projector type"); } @@ -4229,6 +4317,82 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_f32("attn_mask", mask); } } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // Granite Vision 4.1 uses precomputed permutation index + // tensors to express the _win / _unwin / spatial sampling + // reshapes as ggml_get_rows gathers. The names are set + // by g4v_gather() in models/granite4-vision.cpp. + const int patch_size = model.hparams.patch_size; + const int image_side = imgs.entries.front()->nx / patch_size; + const int window_side = hparams.downsample_window_side; + const int query_side = hparams.downsample_query_side; + const int n = image_side / window_side; + const int new_side = n * query_side; + + // Builds the raster→window permutation indices for a + // (side, side) grid split into (n × n) windows of (win × win) + // tokens each. dst[w * win*win + p] = source raster index. 
+ auto make_win_idx = [](int side, int win) { + const int nn = side / win; + std::vector idx(static_cast(side) * side); + for (int wy = 0; wy < nn; ++wy) { + for (int wx = 0; wx < nn; ++wx) { + for (int iy = 0; iy < win; ++iy) { + for (int ix = 0; ix < win; ++ix) { + const int w = wy * nn + wx; + const int p = iy * win + ix; + const int y = wy * win + iy; + const int x = wx * win + ix; + idx[static_cast(w) * (win*win) + p] = y * side + x; + } + } + } + } + return idx; + }; + + auto make_unwin_idx = [&](int side, int win) { + const std::vector fwd = make_win_idx(side, win); + std::vector inv(fwd.size()); + for (size_t i = 0; i < fwd.size(); ++i) { + inv[fwd[i]] = static_cast(i); + } + return inv; + }; + + auto make_spatial_idx = [](int side, int offset) { + const int off_y = (offset >> 1) & 1; + const int off_x = offset & 1; + const int new_s = side / 2; + std::vector idx(static_cast(new_s) * new_s); + for (int y = 0; y < new_s; ++y) { + for (int x = 0; x < new_s; ++x) { + idx[y * new_s + x] = (y * 2 + off_y) * side + (x * 2 + off_x); + } + } + return idx; + }; + + auto upload = [&](const std::string & name, const std::vector & idx) { + ggml_tensor * t = ggml_graph_get_tensor(gf, name.c_str()); + GGML_ASSERT(t); + ggml_backend_tensor_set(t, idx.data(), 0, idx.size() * sizeof(int32_t)); + }; + + // Upload the permutation index tensors for every projector + // block; each block looks them up by its own "g4v_blk<bid>_" name. 
+ for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) { + const std::string prefix = "g4v_blk" + std::to_string(bid) + "_"; + upload(prefix + "win_idx", make_win_idx(image_side, window_side)); + upload(prefix + "qwin_idx", make_win_idx(new_side, query_side)); + upload(prefix + "unwin_idx", make_unwin_idx(new_side, query_side)); + const auto spatial_offset = hparams.proj_spatial_offsets[bid]; + if (spatial_offset >= 0) { + upload(prefix + "spatial_idx", make_spatial_idx(image_side,spatial_offset)); + } + } + } break; default: GGML_ABORT("Unknown projector type"); } @@ -4384,7 +4548,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; case PROJECTOR_TYPE_GRANITE_SPEECH: - return ctx->model.qf_proj_linear_w->ne[1]; + return ctx->model.qf_proj_blocks[0].qf_proj_linear_w->ne[1]; + case PROJECTOR_TYPE_GRANITE4_VISION: + return ctx->model.qf_proj_blocks.size() * ctx->model.hparams.projection_dim; case PROJECTOR_TYPE_GLM4V: return ctx->model.mm_ffn_down_w->ne[1]; default: diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 9b807ffa77b..a62c9d61877 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -100,8 +100,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch */ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); -struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); - bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp index c7e3794a49e..5e66f75d0a9 100644 --- a/tools/mtmd/models/granite-speech.cpp +++ b/tools/mtmd/models/granite-speech.cpp @@ -199,8 +199,8 @@ ggml_cgraph * 
clip_graph_granite_speech::build() { ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj); - ggml_tensor * queries = build_norm(model.qf_proj_query, - model.qf_proj_norm_w, model.qf_proj_norm_b, + ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query, + model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b, NORM_TYPE_NORMAL, proj_eps, -1); { ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1); @@ -209,8 +209,8 @@ ggml_cgraph * clip_graph_granite_speech::build() { queries = ggml_repeat(ctx0, q_3d, q_shape); } - for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) { - const auto & pl = model.qf_proj_layers[il]; + for (int il = 0; il < (int)model.qf_proj_blocks[0].qf_proj_layers.size(); il++) { + const auto & pl = model.qf_proj_blocks[0].qf_proj_layers[il]; // self-attention { @@ -265,7 +265,7 @@ ggml_cgraph * clip_graph_granite_speech::build() { } cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj); - cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b); + cur = ggml_add(ctx0, build_mm(model.qf_proj_blocks[0].qf_proj_linear_w, cur), model.qf_proj_blocks[0].qf_proj_linear_b); cb(cur, "projector_out", -1); } diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp new file mode 100644 index 00000000000..9adb6f0fdbf --- /dev/null +++ b/tools/mtmd/models/granite4-vision.cpp @@ -0,0 +1,339 @@ +#include "models.h" +#include "../clip-impl.h" +#include "../clip-model.h" + +#include +#include +#include +#include +#include + +/* + * Granite Vision 4.1 clip graph + * + * Stage 1a: SigLIP vision tower (N layers, post-norm) + * Stage 1b: WindowQFormer blocks (deepstack + spatial) + * Stage 1c: Concatenate and pack outputs + * Stage 1d: Append newline tokens if add_newline is set + */ + +// --------------------------------------------------------------------------- +// Member method 
implementations +// --------------------------------------------------------------------------- + +ggml_tensor * clip_graph_granite4_vision::gather( + ggml_tensor * src, + const std::string & name, + int idx_len) { + ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len); + ggml_set_name(idx, name.c_str()); + ggml_set_input(idx); + return ggml_get_rows(ctx0, src, idx); +} + +ggml_tensor * clip_graph_granite4_vision::interp_down( + ggml_tensor * src, + int side, + int new_side) { + const int n_embd = src->ne[0]; + ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1); + t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3)); + const int kernel = side / new_side; + t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0); + t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3)); + return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side); +} + +// --------------------------------------------------------------------------- +// build_block - WindowQFormer block implementation +// --------------------------------------------------------------------------- + +ggml_tensor * clip_graph_granite4_vision::build_block( + const qf_block & blk, + ggml_tensor * h, + int bid, + int spatial_offset, + int image_side, + int window_side, + int query_side, + float qformer_eps) { + + const int n_embd = h->ne[0]; + GGML_ASSERT(h->ne[1] == image_side * image_side); + const int n = image_side / window_side; + const int new_side = n * query_side; + const int n_windows = n * n; + const int enc_len = window_side * window_side; + const int query_len = query_side * query_side; + + auto cbx = [&](ggml_tensor * & t, const char * step) { + const std::string name = "g4v_blk" + std::to_string(bid) + "_" + step; + ggml_set_name(t, name.c_str()); + }; + + // 1. Top-level LN + cbx(h, "inp"); + ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid); + cbx(x, "norm"); + + // 2. 
enc = _win(x, image_side, window_side) + ggml_tensor * enc; + { + ggml_tensor * enc_flat = gather(x, + "g4v_blk" + std::to_string(bid) + "_win_idx", + image_side * image_side); + enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows); + } + cbx(enc, "enc"); + + // 3. downsampled = downsampler(x) + ggml_tensor * d; + (void) spatial_offset; + if (spatial_offset >= 0) { + d = gather(x, + "g4v_blk" + std::to_string(bid) + "_spatial_idx", + new_side * new_side); + } else { + d = interp_down(x, image_side, new_side); + } + cbx(d, "downsampled"); + + // 4. query_embeds = query + _win(d, new_side, query_side) + ggml_tensor * q_in; + { + ggml_tensor * dw_flat = gather(d, + "g4v_blk" + std::to_string(bid) + "_qwin_idx", + new_side * new_side); + ggml_tensor * dw = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows); + q_in = ggml_add(ctx0, dw, blk.qf_proj_query); + } + cbx(q_in, "query_embeds"); + + // 5. encoder_embeds = enc + image_positions → (C, enc_len, n_windows) + ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos); + cbx(e_in, "encoder_embeds"); + + // 6. Qformer forward. + ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid); + + // Helper for linear projections with window batching + auto linear = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * { + ggml_tensor * t = ggml_reshape_2d(ctx0, x, x->ne[0], x->ne[1] * x->ne[2]); + t = build_mm(w, t); + if (b) t = ggml_add(ctx0, t, b); + return t; + }; + + // Get the single QFormer layer + GGML_ASSERT(blk.qf_proj_layers.size() == 1); + const auto & pl = blk.qf_proj_layers[0]; + + // 6a. 
Self-attention + ggml_tensor * sa_out; + { + const int d_h = 64; + const int n_head = n_embd / d_h; + const int nq = q->ne[1]; + const float scale = 1.0f / std::sqrt((float) d_h); + + ggml_tensor * Q = linear(q, pl.q_w, pl.q_b); + ggml_tensor * K = linear(q, pl.k_w, pl.k_b); + ggml_tensor * V = linear(q, pl.v_w, pl.v_b); + + Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows); + K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows); + V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows); + + sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid); + sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows); + + sa_out = ggml_add(ctx0, sa_out, q); + sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b, + NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(sa_out, "sa_out"); + + // 6b. Cross-attention + ggml_tensor * ca_out; + { + const int d_h = 64; + const int n_head = n_embd / d_h; + const int nq = sa_out->ne[1]; + const int nkv = e_in->ne[1]; + const float scale = 1.0f / std::sqrt((float) d_h); + + ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b); + ggml_tensor * K = linear(e_in, pl.cross_attn_k_w, pl.cross_attn_k_b); + ggml_tensor * V = linear(e_in, pl.cross_attn_v_w, pl.cross_attn_v_b); + + Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows); + K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows); + V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows); + + ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b, + Q, K, V, nullptr, scale, bid); + ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows); + + ca_out = ggml_add(ctx0, ca_out, sa_out); + ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b, + NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(ca_out, "ca_out"); + + // 6c. 
FFN + ggml_tensor * ffn; + { + ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows); + t = build_mm(pl.ff_up_w, t); + if (pl.ff_up_b) t = ggml_add(ctx0, t, pl.ff_up_b); + t = ggml_gelu_erf(ctx0, t); + t = build_mm(pl.ff_down_w, t); + if (pl.ff_down_b) t = ggml_add(ctx0, t, pl.ff_down_b); + t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows); + ffn = ggml_add(ctx0, t, ca_out); + ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(ffn, "qformer_out"); + + // 7. _unwin back to raster + ggml_tensor * unwinned; + { + ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows); + unwinned = gather(flat, + "g4v_blk" + std::to_string(bid) + "_unwin_idx", + new_side * new_side); + } + cbx(unwinned, "unwin"); + + // 8. out_linear + ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned); + if (blk.qf_proj_linear_b) out = ggml_add(ctx0, out, blk.qf_proj_linear_b); + cbx(out, "out"); + + return out; +} + +// --------------------------------------------------------------------------- +// build() - top-level graph +// --------------------------------------------------------------------------- + +// Build the K-tiled, base-scaled newline row tensor. +// Shape: (n_mmproj_embd, 1) +ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) { + const int K = (int) model.qf_proj_blocks.size(); + GGML_ASSERT(K > 0); + GGML_ASSERT(n_mmproj_embd % K == 0); + const int projection_dim = n_mmproj_embd / K; + GGML_ASSERT(model.image_newline != nullptr); + GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim); + + // Build newline_row[k*projection_dim + d] = nl[d] * (k == 0 ? 
base : 1.0) + ggml_tensor * nl = model.image_newline; // (projection_dim,) + ggml_tensor * nl_first_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); + ggml_tensor * nl_row_2d; + if (K == 1) { + nl_row_2d = nl_first_2d; + } else { + ggml_tensor * nl_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); + ggml_tensor * rest_template = ggml_new_tensor_2d( + ctx0, GGML_TYPE_F32, projection_dim, K - 1); + ggml_tensor * nl_rest = ggml_repeat(ctx0, nl_2d, rest_template); + nl_row_2d = ggml_concat(ctx0, nl_first_2d, nl_rest, 1); // (projection_dim, K) + } + nl_row_2d = ggml_cont(ctx0, nl_row_2d); + return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1); +} + +// Append a single newline row at the end of the tile output. +ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) { + // For the single-tile case, append one newline row at the end. + // For the multi-tile rowwise case, this will be called per-tile + // (though currently only the single-tile path uses it). 
+ ggml_tensor * nl_row = build_newline_row(ctx0); + return ggml_concat(ctx0, tile_output, nl_row, 1); +} + +ggml_cgraph * clip_graph_granite4_vision::build() { + GGML_ASSERT(model.patch_embeddings_0 != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + GGML_ASSERT(!model.qf_proj_blocks.empty()); + + // --- Stage 1a: SigLIP encoder producing intermediate hidden states --- + ggml_tensor * inp = build_inp(); + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "pos_embed", -1); + + ggml_tensor * inpL = inp; + std::vector layer_outs(n_layer, nullptr); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + // Self-attention + ggml_tensor * Qcur = build_mm(layer.q_w, cur); + if (layer.q_b) Qcur = ggml_add(ctx0, Qcur, layer.q_b); + ggml_tensor * Kcur = build_mm(layer.k_w, cur); + if (layer.k_b) Kcur = ggml_add(ctx0, Kcur, layer.k_b); + ggml_tensor * Vcur = build_mm(layer.v_w, cur); + if (layer.v_b) Vcur = ggml_add(ctx0, Vcur, layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + layer_outs[il] = cur; + inpL = cur; + } + + // --- Stage 1b/1c: WindowQFormer blocks --- + const int projector_count = hparams.vision_feature_layer.size(); + const float qformer_eps = 1e-12f; + + 
ggml_tensor * mmproj = nullptr; + for (int bid = 0; bid < projector_count; ++bid) { + const auto & blk = model.qf_proj_blocks[bid]; + + int vlayer = hparams.vision_feature_layer[bid]; + GGML_ASSERT(vlayer >= 0 && vlayer < n_layer); + ggml_tensor * h = layer_outs[vlayer]; + + ggml_tensor * stream = build_block( + blk, h, bid, + hparams.proj_spatial_offsets[bid], + n_patches_x, + hparams.downsample_window_side, + hparams.downsample_query_side, + qformer_eps); + cb(stream, (std::string("proj_") + std::to_string(bid) + std::string("_v_out")).c_str(), vlayer); + mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream; + } + + // --- Stage 1d: Append newline tokens if add_newline is set --- + if (add_newline) { + mmproj = append_rowwise_newlines(ctx0, mmproj); + ggml_set_name(mmproj, "g4v_mmproj_out_nl"); + } else { + ggml_set_name(mmproj, "g4v_mmproj_out"); + } + ggml_build_forward_expand(gf, mmproj); + + return gf; +} diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp index 4af17ccfe85..5aa3d2f0fac 100644 --- a/tools/mtmd/models/llava.cpp +++ b/tools/mtmd/models/llava.cpp @@ -51,7 +51,6 @@ ggml_cgraph * clip_graph_llava::build() { } std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; // loop over layers for (int il = 0; il < max_feature_layer; il++) { @@ -60,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() { // If this is an embedding feature layer, save the output. // NOTE: 0 index here refers to the input to the encoder. 
- if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + if (hparams.is_vision_feature_layer(il)) { embedding_stack.push_back(cur); } @@ -135,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() { // process vision feature layers (used by granite) { // final layer is a vision feature layer - if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + if (hparams.is_vision_feature_layer(max_feature_layer)) { embedding_stack.push_back(inpL); } diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index b882f800dd7..d1865103bcb 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -211,3 +211,26 @@ struct clip_graph_exaone4_5 : clip_graph { clip_graph_exaone4_5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_granite4_vision : clip_graph { + clip_graph_granite4_vision(clip_ctx * ctx, const clip_image_f32 & img) + : clip_graph(ctx, img), + add_newline(img.add_newline) {} + + ggml_cgraph * build() override; + +private: + // The graph is per-tile since only batch-size 1 is supported in clip. As + // such, this value is set at construct time based on the tile that will be + // encoded, then used during build to determine how to handle newlines. 
+ const bool add_newline; + + ggml_tensor * gather(ggml_tensor * src, const std::string & name, int idx_len); + ggml_tensor * interp_down(ggml_tensor * src, int side, int new_side); + ggml_tensor * build_block(const qf_block & blk, ggml_tensor * h, int bid, + int spatial_offset, int image_side, int window_side, + int query_side, float qformer_eps); + + ggml_tensor * build_newline_row(ggml_context * ctx0); + ggml_tensor * append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output); +}; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 0b5caa6cb5c..260f307560a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -513,6 +513,12 @@ struct mtmd_context { img_end = ""; image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + img_beg = ""; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); } @@ -808,6 +814,21 @@ struct mtmd_tokenizer { return 2; } + // Annotate llava-next style tiles so clip_n_output_tokens accounts + // for per-tile newline injection. + if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) { + if (batch_f32.entries.size() == 1) { + // Single-tile (overview only): append one newline row. + batch_f32.entries[0]->add_newline = true; + } else { + // Multi-tile: overview gets no newline, grid tiles get one. 
+ batch_f32.entries[0]->add_newline = false; + for (size_t i = 1; i < batch_f32.entries.size(); ++i) { + batch_f32.entries[i]->add_newline = true; + } + } + } + // handle llava-uhd style preprocessing const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0; if ( @@ -872,9 +893,10 @@ struct mtmd_tokenizer { } } else { + size_t n_tokens = 0; - for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); + for (const auto & e : batch_f32.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); @@ -1111,7 +1133,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) || proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE || proj_type == PROJECTOR_TYPE_INTERNVL - || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2 + || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; // entries may have different token counts From c4a278d68efa17811006f2123a84081dac03fac7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 5 Jun 2026 18:12:27 +0200 Subject: [PATCH 28/71] model: fix build failed (#24193) --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6808ad044c7..137d3501e01 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1764,7 +1764,7 @@ void llama_model::print_info() const { [](const auto & entry) { return entry >= 0; })) { LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, - hparams.n_layer).c_str()); + hparams.n_layer()).c_str()); } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const 
auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { From e82beaa60d3bef6b6234656389239facf5ba2a3a Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Fri, 5 Jun 2026 19:44:40 +0200 Subject: [PATCH 29/71] vulkan: add fwht support for Intel with shmem reduction (#23964) * vulkan: add fwht support for Intel with shmem reduction * don't use N as workgroup size * disable subgroup shuffle on MoltenVK AMD * disable fwht shader on Intel Windows due to driver bug --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 13 ++++ ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp | 78 +++++++++++++++---- .../vulkan-shaders/vulkan-shaders-gen.cpp | 1 + 3 files changed, 76 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e7d04634b8a..df410368a79 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5084,6 +5084,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } ++idx; } + } else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) { + // Disabled on Intel Windows due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147 + int idx = 0; + for (uint32_t n : {64, 128, 256, 512}) { + const uint32_t block_size = std::min(device->subgroup_size, n); + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1); + ++idx; + } } const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 
2 : 4; @@ -5630,6 +5638,11 @@ static vk_device ggml_vk_get_device(size_t idx) { #endif device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); +#ifdef __APPLE__ + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_shuffle = false; + } +#endif device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp index 72059d4afc2..a2069964adb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp @@ -1,14 +1,16 @@ #version 450 #extension GL_EXT_control_flow_attributes : require +#ifndef FWHT_SHMEM #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_shuffle : enable +#endif -layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; - -layout(constant_id = 0) const uint WARP_SIZE = 32; +layout(constant_id = 0) const uint BLOCK_SIZE = 32; layout(constant_id = 1) const uint N = 128; +layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; + layout(push_constant) uniform parameter { uint n_rows; @@ -20,35 +22,72 @@ layout(push_constant) uniform parameter layout(binding = 0, std430) readonly buffer A { float data_a[]; }; layout(binding = 1, std430) writeonly buffer D { float data_d[]; }; -const uint EL_W = N / WARP_SIZE; +const uint EL_W = N / BLOCK_SIZE; + +#ifdef FWHT_SHMEM +shared float shmem[4 * N]; +#endif void main() { - const uint lane = gl_SubgroupInvocationID; - for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID; - row < n_rows; - row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { +#ifdef FWHT_SHMEM + const uint tid = gl_LocalInvocationID.x; + const uint shmem_base = 
gl_LocalInvocationID.y * N; + const uint row_id = gl_LocalInvocationID.y; +#else + const uint tid = gl_SubgroupInvocationID; + const uint row_id = gl_SubgroupID; +#endif + + for (uint base_row = gl_WorkGroupID.x * gl_WorkGroupSize.y; + base_row < n_rows; + base_row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { + const uint row = base_row + row_id; const uint row_offset = row * N; +#ifndef FWHT_SHMEM + if (row >= n_rows) { + continue; + } +#endif + float reg[EL_W]; [[unroll]] for (uint i = 0; i < EL_W; ++i) { - reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale; + reg[i] = row < n_rows ? data_a[src_offset + row_offset + i * BLOCK_SIZE + tid] * scale : 0.0; } +#ifdef FWHT_SHMEM + [[unroll]] + for (uint h = 1; h < BLOCK_SIZE; h <<= 1) { + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + shmem[shmem_base + i * BLOCK_SIZE + tid] = reg[i]; + } + barrier(); + [[unroll]] + for (uint j = 0; j < EL_W; ++j) { + const float val = reg[j]; + const float other = shmem[shmem_base + j * BLOCK_SIZE + (tid ^ h)]; + reg[j] = (tid & h) == 0 ? val + other : other - val; + } + barrier(); + } +#else [[unroll]] - for (uint h = 1; h < WARP_SIZE; h <<= 1) { + for (uint h = 1; h < BLOCK_SIZE; h <<= 1) { [[unroll]] for (uint j = 0; j < EL_W; ++j) { const float val = reg[j]; const float val2 = subgroupShuffleXor(val, h); - reg[j] = (lane & h) == 0 ? val + val2 : val2 - val; + reg[j] = (tid & h) == 0 ? 
val + val2 : val2 - val; } } +#endif [[unroll]] - for (uint h = WARP_SIZE; h < N; h <<= 1) { - const uint step = h / WARP_SIZE; + for (uint h = BLOCK_SIZE; h < N; h <<= 1) { + const uint step = h / BLOCK_SIZE; [[unroll]] for (uint j = 0; j < EL_W; j += 2 * step) { [[unroll]] @@ -61,9 +100,16 @@ void main() { } } - [[unroll]] - for (uint i = 0; i < EL_W; ++i) { - data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i]; +#ifdef FWHT_SHMEM + if (row < n_rows) { +#endif + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + data_d[dst_offset + row_offset + i * BLOCK_SIZE + tid] = reg[i]; + } +#ifdef FWHT_SHMEM } + barrier(); +#endif } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index de7dbec2c63..d65cd12b287 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -957,6 +957,7 @@ void process_shaders() { string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}})); string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("fwht_f32", "fwht.comp", {}); + string_to_spv("fwht_shmem_f32", "fwht.comp", {{"FWHT_SHMEM", "1"}}); string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}})); string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); From da87e9b612592ccd2a2e783eff631346459b9de0 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 5 Jun 2026 21:31:56 +0200 Subject: [PATCH 30/71] common/chat : unify and fix LFM2/LFM2.5 tool parser (#24178) --- common/chat-peg-parser.cpp | 60 +++++++++++++--- common/chat-peg-parser.h 
| 7 +- common/chat.cpp | 141 +++++++------------------------------ tests/test-chat.cpp | 68 +++++++++++++++--- 4 files changed, 143 insertions(+), 133 deletions(-) diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 12e747d1ca1..9bc5ac98be6 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -87,6 +87,8 @@ static std::string normalize_quotes_to_json(const std::string & input) { bool in_single_quoted = false; bool in_double_quoted = false; + auto is_word_char = [](char ch) { return std::isalnum(static_cast(ch)) || ch == '_'; }; + for (size_t i = 0; i < input.size(); ++i) { char c = input[i]; @@ -151,6 +153,29 @@ static std::string normalize_quotes_to_json(const std::string & input) { in_single_quoted = true; result += '"'; } + } else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') && + (i == 0 || !is_word_char(input[i - 1]))) { + // Python literals -> JSON; prefix match keeps streamed partials monotonic. + static constexpr std::pair literals[] = { + { "True", "true" }, { "False", "false" }, { "None", "null" }, + }; + size_t n = 0; + while (i + n < input.size() && is_word_char(input[i + n])) { + ++n; + } + std::string_view token(input.data() + i, n); + bool matched = false; + for (const auto & [py, js] : literals) { + if (py.substr(0, n) == token) { + result += js.substr(0, n); + i += n - 1; + matched = true; + break; + } + } + if (!matched) { + result += c; + } } else { result += c; } @@ -353,12 +378,8 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) { } value_to_add += escape_json_string_inner(value_content); } else if (!value_content.empty()) { - // For potential containers, normalize Python-style single quotes to JSON double quotes - bool is_potential_container = value_content[0] == '[' || value_content[0] == '{'; - if (is_potential_container) { - value_content = normalize_container_value(value_content); - } - value_to_add += value_content; + // Pythonic 
scalars/containers -> JSON. + value_to_add += normalize_container_value(value_content); } args_target() += value_to_add; @@ -466,11 +487,34 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools( return force_tool_calls ? section : optional(section); } +// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5 +common_peg_parser common_chat_peg_builder::python_or_json_value() { + return rule("python-or-json-value", [this]() { + auto ws = space(); + auto value = python_or_json_value(); + + auto member = sequence({ python_string(), ws, literal(":"), ws, value }); + auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) }); + auto dict = rule("python-or-json-dict", [&]() { + return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws }); + }); + + auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) }); + auto array = rule("python-or-json-array", [&]() { + return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws }); + }); + + return choice({ dict, array, python_string(), python_number(), + python_bool(), python_null(), json_bool(), json_null() }); + }); +} + // Python-style tool calls: name(arg1="value1", arg2=123) // Used only by LFM2 for now, so we don't merge it into autoparser common_peg_parser common_chat_peg_builder::python_style_tool_calls( const ordered_json & tools, - bool parallel_tool_calls) { + bool parallel_tool_calls, + bool allow_json_literals) { if (!tools.is_array() || tools.empty()) { return eps(); } @@ -504,7 +548,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls( if (is_string_type) { arg_value_parser = string_value_parser; } else { - arg_value_parser = tool_arg_value(python_value()); + arg_value_parser = tool_arg_value(allow_json_literals ? 
python_or_json_value() : python_value()); } // Full argument: name="value" or name=value diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h index be92f17d909..a4643fbea86 100644 --- a/common/chat-peg-parser.h +++ b/common/chat-peg-parser.h @@ -132,9 +132,13 @@ class common_chat_peg_builder : public common_peg_parser_builder { // Helper for Python-style function call format: name(arg1="value1", arg2=123) // Used by LFM2 and similar templates common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools, - bool parallel_tool_calls); + bool parallel_tool_calls, + bool allow_json_literals); private: + // Python values plus JSON true/false/null. + common_peg_parser python_or_json_value(); + // Implementation helpers for standard_json_tools — one per JSON tool call layout mode common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools, const std::string & args_key, @@ -195,4 +199,3 @@ struct tagged_peg_parser { tagged_peg_parser build_tagged_peg_parser( const std::function & fn); - diff --git a/common/chat.cpp b/common/chat.cpp index ef151691c38..b8f248dab4e 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1608,42 +1608,40 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp return data; } -// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt -// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls. -// - Reasoning: {reasoning} (optional) -// - Content: text before a tool call (optional) -// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")] -// Tool calls can appear multiple times (parallel tool calls supported) -static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, - const autoparser::generation_params & inputs) { +// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable +// (except dotted names and JSON literals true/false/null). 
+// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional reasoning. +// tool_list_tokens preserves LFM2 system tool-list markers. +static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, + const autoparser::generation_params & inputs, + bool tool_list_tokens) { common_chat_params data; - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.supports_thinking = true; - data.preserved_tokens = { - "<|tool_list_start|>", - "<|tool_list_end|>", - "<|tool_call_start|>", - "<|tool_call_end|>", - "", - "", - }; - - auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); - auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; - auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; - const std::string TOOL_CALL_START = "<|tool_call_start|>"; const std::string TOOL_CALL_END = "<|tool_call_end|>"; + const std::string TOOL_LIST_START = "<|tool_list_start|>"; + const std::string TOOL_LIST_END = "<|tool_list_end|>"; const std::string THINK_START = ""; const std::string THINK_END = ""; const std::string GEN_PROMPT = "<|im_start|>assistant\n"; + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.supports_thinking = true; + data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END }; + if (tool_list_tokens) { + data.preserved_tokens.push_back(TOOL_LIST_START); + data.preserved_tokens.push_back(TOOL_LIST_END); + } + data.thinking_start_tag = THINK_START; data.thinking_end_tag = THINK_END; + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); + auto extract_reasoning = inputs.reasoning_format != 
COMMON_REASONING_FORMAT_NONE; + auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; + if (inputs.has_continuation()) { const auto & msg = inputs.continue_msg; @@ -1670,7 +1668,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat auto tool_calls = p.rule("tool-calls", p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) + - p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) + + p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) + p.literal(TOOL_CALL_END) ) ); @@ -1697,93 +1695,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START } }; } - return data; -} - -// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens. -// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>. -// - Reasoning: {reasoning} (optional) -// - Content: text before a tool call (optional) -// - Tool calls: Python-style, e.g. 
[function_name(arg1="value1", arg2="value2")] -// Tool calls can appear multiple times (parallel tool calls supported) -static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl, - const autoparser::generation_params & inputs) { - common_chat_params data; - - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.supports_thinking = true; - data.preserved_tokens = { - "<|tool_call_start|>", - "<|tool_call_end|>", - "", - "", - }; - - auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); - auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; - auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; - - const std::string THINK_START = ""; - const std::string THINK_END = ""; - const std::string GEN_PROMPT = "<|im_start|>assistant\n"; - - data.thinking_start_tag = THINK_START; - data.thinking_end_tag = THINK_END; - - if (inputs.has_continuation()) { - const auto & msg = inputs.continue_msg; - - data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; - if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { - data.generation_prompt += THINK_END + msg.render_content(); - } - - data.prompt += data.generation_prompt; - } - - auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto generation_prompt = p.literal(GEN_PROMPT); - auto end = p.end(); - - auto reasoning = p.eps(); - if (extract_reasoning && inputs.enable_thinking) { - reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END); - } - - if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { - return generation_prompt + reasoning + p.content(p.rest()) + end; - } - - auto tool_calls = p.rule("tool-calls", - p.trigger_rule("tool-call", - 
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) - ) - ); - - auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["})); - auto maybe_start = p.optional(p.literal("<|tool_call_start|>")); - return generation_prompt + reasoning + content + maybe_start + tool_calls + end; - }); - - data.parser = parser.save(); - - if (include_grammar) { - data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool.at("function"); - auto schema = function.at("parameters"); - builder.resolve_refs(schema); - }); - parser.build_grammar(builder, data.grammar_lazy); - }); - foreach_function(inputs.tools, [&](const json & tool) { - const std::string name = tool.at("function").at("name"); - data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" }); - }); - } return data; } @@ -2298,14 +2209,14 @@ std::optional common_chat_try_specialized_template( if (is_lfm2_template(src)) { LOG_DBG("Using specialized template: LFM2\n"); - return common_chat_params_init_lfm2(tmpl, params); + return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true); } // LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens if (src.find("List of tools: [") != std::string::npos && src.find("<|tool_list_start|>") == std::string::npos) { LOG_DBG("Using specialized template: LFM2.5\n"); - return common_chat_params_init_lfm2_5(tmpl, params); + return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false); } // GigaChatV3 format detection diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 30ea2c07213..3107045b4fc 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -684,6 +684,20 @@ static common_chat_tool config_tool{ })", }; +static common_chat_tool calendar_create_event_tool{ + /* .name = */ 
"Calendar.create_event", + /* .description = */ "Create a calendar event", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "title": { "type": "string" }, + "participants": { "type": "array", "items": { "type": "string" } }, + "metadata": { "type": "object" } + }, + "required": ["title", "participants", "metadata"] + })", +}; + static common_chat_tool imaginary_number_tool{ /* .name = */ "imaginary_number", /* .description = */ "Imaginary number converter", @@ -4130,7 +4144,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); } - // LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens + // LFM2.5 tests - format <|tool_call_start|>[name(args)]<|tool_call_end|> { auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug); @@ -4138,19 +4152,57 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); // Single tool call without reasoning - tst.test("[special_function(arg1=1)]") + tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") .tools({ special_function_tool }) .expect(message_assist_call) .run(); // Tool call with string argument - tst.test("[get_time(city=\"XYZCITY\")]") + tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>") .tools({ get_time_tool }) .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) .run(); + // Python literals become JSON. + tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>") + .tools({ toggle_tool }) + .expect(message_with_tool_calls("toggle", R"({"enabled": true})")) + .run(); + + tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>") + .tools({ nullable_tool }) + .expect(message_with_tool_calls("set_nullable", R"({"value": null})")) + .run(); + + // Nested Python literal. 
+ tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>") + .tools({ config_tool }) + .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})")) + .run(); + + // JSON literals are accepted too. + tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>") + .tools({ config_tool }) + .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})")) + .run(); + + // Dotted function name with structured args. + tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], " + "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>") + .tools({ calendar_create_event_tool }) + .expect(message_with_tool_calls( + "Calendar.create_event", + R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})")) + .run(); + + // Markdown links stay content. + tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).") + .tools({ get_time_tool }) + .expect(simple_assist_msg("Use this format: [link text](url). 
Example: [Wikipedia](https://www.wikipedia.org).")) + .run(); + // Tool call with reasoning (enable_thinking=true) - tst.test("I'm\nthinking[special_function(arg1=1)]") + tst.test("I'm\nthinking<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) @@ -4158,7 +4210,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); // Multiple tool calls (parallel) - tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]") + tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>") .parallel_tool_calls(true) .tools({ special_function_tool, special_function_tool_with_optional_param @@ -4170,7 +4222,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); // Tool call with content before tool call - tst.test("Let me check the time.[get_time(city=\"Paris\")]") + tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>") .tools({ get_time_tool }) .expect(message_with_reasoning_content_and_multiple_tool_calls( "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } } @@ -4178,14 +4230,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); // Partial tool call (streaming) - tst.test("[special_function(arg1=") + tst.test("<|tool_call_start|>[special_function(arg1=") .tools({ special_function_tool }) .is_partial(true) .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) .run(); // Tool call with empty arguments - tst.test("[empty_args()]") + tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>") .tools({ empty_args_tool }) .expect(simple_assist_msg("", "", "empty_args", "{}")) .run(); From 308f61c31f083251ce8150f10b9ef97679b500b5 Mon Sep 17 00:00:00 2001 From: lhez Date: Fri, 5 Jun 2026 13:45:25 -0700 Subject: [PATCH 31/71] opencl: 
improve get_rows, cpy, concat and q6_k flat gemv (#24160) * opencl: allow multiple workgroups for large rows * opencl: improve small cpy * opencl: packed concat for small input * opencl: tweak flat q6_K gemv, increase N_DST and remap threads --- ggml/src/ggml-opencl/ggml-opencl.cpp | 71 +++++++++-- ggml/src/ggml-opencl/kernels/concat.cl | 67 +++++++++++ ggml/src/ggml-opencl/kernels/cpy.cl | 59 +++++++++ ggml/src/ggml-opencl/kernels/get_rows.cl | 24 ++-- .../kernels/mul_mv_q6_k_f32_flat.cl | 112 ++++++++---------- 5 files changed, 247 insertions(+), 86 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index c411e4aeaec..2a41215fd13 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -558,7 +558,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32; cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16; - cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32; + cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32; cl_kernel kernel_mul_mat_f32_f32; cl_kernel kernel_mul_mat_f16_f16; cl_kernel kernel_mul_mat_f16_f32_1row; @@ -639,7 +639,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc; cl_kernel kernel_upscale; cl_kernel kernel_upscale_bilinear; - cl_kernel kernel_concat_f32; + cl_kernel kernel_concat_f32, kernel_concat_f32_pack; cl_kernel kernel_conv_2d_f16; cl_kernel kernel_conv_2d_f32; cl_kernel kernel_conv_2d_f16_f32; @@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { 
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err)); CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err)); CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err)); CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err)); GGML_LOG_CONT("."); } @@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err)); CL_CHECK(clReleaseProgram(prog)); GGML_LOG_CONT("."); } @@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c nth *= 2; } - size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12}; + int nchunks = 1; + if (src0->type == GGML_TYPE_F32) { + const int chunk_target = nth * 4; + nchunks = (ne00 + chunk_target - 1) / chunk_target; + nchunks = MAX(1, MIN(nchunks, 64)); + } + + size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12}; size_t local_work_size[] = {(size_t)nth, 1, 1}; backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); @@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con int nth = MIN(64, ne0); - cl_kernel kernel = backend_ctx->kernel_concat_f32; + const bool concat_pack = (dim == 0 && ne0 < 32); + cl_kernel kernel = concat_pack ? 
backend_ctx->kernel_concat_f32_pack + : backend_ctx->kernel_concat_f32; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); @@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim)); - size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3}; - size_t local_work_size[] = {(size_t)nth, 1, 1}; + if (concat_pack) { + // packed kernel needs the dst dims to unflatten its 1-D row index. + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3)); + + const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel); + const int base = MIN(64, maxwg); + const int tpr = MIN(ne0, base); // threads per row + const int rpw = MAX(1, base / tpr); // rows per workgroup + const int lsz = tpr * rpw; + const int nrows = ne1*ne2*ne3; + const int nwg = (nrows + rpw - 1) / rpw; + size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1}; + size_t local_work_size[] = {(size_t)lsz, 1, 1}; + backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst); + } else { + size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } } static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { @@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } else if (backend_ctx->gpu_family == ADRENO) { nth0 = 64; nth1 = 2; - ndst = 4; + ndst = 16; } else { 
GGML_ASSERT(false && "TODO: Unknown GPU"); } @@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_cpy_f32_f16; break; case GGML_TYPE_F32: - kernel = backend_ctx->kernel_cpy_f32_f32; + kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack + : backend_ctx->kernel_cpy_f32_f32; break; default: GGML_ASSERT(false && "not implemented"); @@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); - const int nth = MIN(64, ne00); + if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) { + const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel); + const int base = MIN(64, maxwg); + const int tpr = MIN(ne00, base); // threads per row + const int rpw = MAX(1, base / tpr); // rows per workgroup + const int lsz = tpr * rpw; // <= base <= maxwg + const int nrows = ne01*ne02*ne03; + const int nwg = (nrows + rpw - 1) / rpw; - size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; - size_t local_work_size[] = {(size_t)nth, 1, 1}; + size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1}; + size_t local_work_size[] = {(size_t)lsz, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1); + } else { + const int nth = MIN(64, ne00); - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); + size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); + } } static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { diff --git a/ggml/src/ggml-opencl/kernels/concat.cl b/ggml/src/ggml-opencl/kernels/concat.cl index 
0c1b3d785ca..2fbd7851d3d 100644 --- a/ggml/src/ggml-opencl/kernels/concat.cl +++ b/ggml/src/ggml-opencl/kernels/concat.cl @@ -49,3 +49,70 @@ kernel void kernel_concat_f32( *y = *x; } } + +kernel void kernel_concat_f32_pack( + global const char * src0, + ulong offset0, + global const char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3, + int dim, + int ne1, + int ne2, + int ne3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int lsz = get_local_size(0); + int tpr = min(ne0, lsz); // threads per row + int rpw = lsz / tpr; // rows per workgroup + int lid = get_local_id(0); + int row = get_group_id(0)*rpw + lid / tpr; + int lane = lid - (lid / tpr) * tpr; + + int nrows = ne1*ne2*ne3; + if (row >= nrows) { + return; + } + + int i1 = row % ne1; + int t = row / ne1; + int i2 = t % ne2; + int i3 = t / ne2; + + int o[4] = {0, 0, 0, 0}; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? 
ne02 : ne03)); + + for (int i0 = lane; i0 < ne0; i0 += tpr) { + global const float * x; + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00); + } else { + x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10); + } + + global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + *y = *x; + } +} diff --git a/ggml/src/ggml-opencl/kernels/cpy.cl b/ggml/src/ggml-opencl/kernels/cpy.cl index 820aa538a34..adbd2e766d2 100644 --- a/ggml/src/ggml-opencl/kernels/cpy.cl +++ b/ggml/src/ggml-opencl/kernels/cpy.cl @@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_cpy_f32_f32_pack( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne0, + int ne1, + int ne2, + int ne3, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = (global float*)((global char*)src0 + offset0); + dst = (global float*)((global char*)dst + offsetd); + + int lsz = get_local_size(0); + int tpr = min(ne00, lsz); // threads per row + int rpw = lsz / tpr; // rows per workgroup + int lid = get_local_id(0); + int row = get_group_id(0)*rpw + lid / tpr; + int lane = lid - (lid / tpr) * tpr; + + int nrows = ne01*ne02*ne03; + if (row >= nrows) { + return; + } + + int i01 = row % ne01; + int t = row / ne01; + int i02 = t % ne02; + int i03 = t / ne02; + + // linear index of the first element of this row, unflattened over dst dims + long n = (long)row * ne00; + int i3 = (int)(n / ((long)ne2*ne1*ne0)); + long rm = n - (long)i3*ne2*ne1*ne0; + int i2 = (int)(rm / ((long)ne1*ne0)); + rm -= (long)i2*ne1*ne0; + int i1 = (int)(rm / ne0); + int i0 = (int)(rm - (long)i1*ne0); + + global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int 
i00 = lane; i00 < ne00; i00 += tpr) { + global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + kernel void kernel_cpy_i32_i32( global int * src0, ulong offset0, diff --git a/ggml/src/ggml-opencl/kernels/get_rows.cl b/ggml/src/ggml-opencl/kernels/get_rows.cl index c2962edc983..9ae4fff09fc 100644 --- a/ggml/src/ggml-opencl/kernels/get_rows.cl +++ b/ggml/src/ggml-opencl/kernels/get_rows.cl @@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32( src1 = (global int*)((global char*)src1 + offset1); dst = (global float*)((global char*)dst + offsetd); - int i10 = get_group_id(0); - int i11 = get_group_id(1); - int i12 = get_group_id(2); + int nchunks = get_num_groups(0) / ne10; + int g = get_group_id(0); + int i10 = g / nchunks; + int chunk = g - i10 * nchunks; + int i11 = get_group_id(1); + int i12 = get_group_id(2); int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0]; int i02 = i11; int i03 = i12; - for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) { - if (ind >= ne00) { - return; - } - ((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] = - ((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind]; + global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1); + global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03); + + int span = (ne00 + nchunks - 1) / nchunks; + int start = chunk * span; + int end = min(start + span, ne00); + + for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) { + dst_row[ind] = src_row[ind]; } } diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl index 86fe09c6dd6..57b90c05ae5 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl 
@@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat( global uchar * blk_qh, global char * blk_scales, global half * blk_d, - global float * yy, int ib, int ip, int is, - int l0 + int l0, + float4 y0, + float4 y1, + float4 y2, + float4 y3 ) { - int y_offset = 128*ip + l0; int q_offset_l = 64*ip + l0; int q_offset_h = 32*ip + l0; @@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat( global uchar * qh = blk_qh + ib*64 + q_offset_h; global char * sc = blk_scales + ib*16 + is; - global float * y = yy + ib * QK_K + y_offset; - float dall = blk_d[ib]; - float sumf = 0; - float4 sums = {0.f, 0.f, 0.f, 0.f}; - - sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f); - - sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f); - - sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f); - - sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f); - sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f); - sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f); - sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f); - - sumf 
+= dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]); - - return sumf; + // Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads. + // q_offset_l/h are 4-aligned, so these are aligned vector loads. + uchar4 q1v = vload4(0, q1); + uchar4 q2v = vload4(0, q2); + uchar4 qhv = vload4(0, qh); + + int4 q1i = convert_int4(q1v); + int4 q2i = convert_int4(q2v); + int4 qhi = convert_int4(qhv); + + // Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the + // matching 2-bit plane of qh), same arithmetic as the scalar version, then dot() + // against the cached activation lanes. + float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f; + float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f; + float4 w2 = convert_float4((q1i >> 4) | ((qhi & Q6_K_MASK3) )) - 32.f; + float4 w3 = convert_float4((q2i >> 4) | ((qhi & Q6_K_MASK4) >> 2)) - 32.f; + + return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] + + dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]); } #undef N_DST @@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat( #define N_SIMDGROUP 2 #define N_SIMDWIDTH 16 #elif defined (ADRENO_GPU) -#define N_DST 4 +#define N_DST 16 #define N_SIMDGROUP 2 #define N_SIMDWIDTH 64 #endif @@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat( global half * blk_d = (global half *) src0_d + offset_src0_d; global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1; - int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0 - int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1 + int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15 + int ix = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1 int ip = tid/8; // first or second half of (super) block (0 or 1) int il = tid%8; // each half has 8 parts, one per scale int n = 4; // 
4 scales at a time (and 4 sums) int l0 = n*il; // offset into half-block, 0..28 int is = 8*ip + l0/16; // 0, 1, 8, 9 - float4 sumf = 0; + float sumf[N_DST]; + for (int row = 0; row < N_DST; row++) { + sumf[row] = 0.f; + } for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) { - if (first_row + 0 < ne01) { - sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0); - } - if (first_row + 1 < ne01) { - sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0); - } - if (first_row + 2 < ne01) { - sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0); - } - if (first_row + 3 < ne01) { - sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0); + global float * y = yy + ib * QK_K + 128*ip + l0; + float4 y0 = vload4(0, y + 0); + float4 y1 = vload4(0, y + 32); + float4 y2 = vload4(0, y + 64); + float4 y3 = vload4(0, y + 96); + + for (int row = 0; row < N_DST; row++) { + if (first_row + row < ne01) { + sumf[row] += block_q_6_K_dot_y_flat( + blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb, + ib, ip, is, l0, y0, y1, y2, y3); + } } } - float4 tot = (float4)( - sub_group_reduce_add(sumf.s0), - sub_group_reduce_add(sumf.s1), - sub_group_reduce_add(sumf.s2), - sub_group_reduce_add(sumf.s3) - ); - if (get_sub_group_local_id() == 0) { - if (first_row + 0 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; - } - if (first_row + 1 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; - } - if (first_row + 2 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; - } - if (first_row + 3 < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + for (int row = 0; row < N_DST; row++) { + float tot = sub_group_reduce_add(sumf[row]); + if 
(get_sub_group_local_id() == 0 && first_row + row < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } } From 603300b008e01934c242c541259449ad95e91a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 6 Jun 2026 06:06:47 +0200 Subject: [PATCH 32/71] context : fix off-by-one comparisons to n_gpu_layers (#24208) --- src/llama-context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index eff1d8f89f2..d0c314199b5 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -341,7 +341,7 @@ llama_context::llama_context( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = model.n_devices() > 1 && - model.n_gpu_layers() > model.hparams.n_layer() && + model.n_gpu_layers() > model.hparams.n_layer_all && model.split_mode() == LLAMA_SPLIT_MODE_LAYER && cparams.offload_kqv && !model.has_tensor_overrides(); @@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const { // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer(); + const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all; if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); From 5343f4502ab5273d7cef85012af020cad0182376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 6 Jun 2026 06:07:20 +0200 Subject: [PATCH 33/71] model : rename local n_layer_all variable (#24209) --- src/llama-model.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 137d3501e01..0d23a605ee8 100644 --- a/src/llama-model.cpp +++ 
b/src/llama-model.cpp @@ -1205,7 +1205,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { const auto & use_mlock = params.use_mlock; const auto & tensor_split = params.tensor_split; - const int n_layer = hparams.n_layer_all; + const int n_layer_all = hparams.n_layer_all; const int n_gpu_layers = this->n_gpu_layers(); const bool use_mmap_buffer = true; @@ -1262,10 +1262,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { splits[i] /= split_sum; } - const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1); + const int i_gpu_start = std::max(n_layer_all + 1 - n_gpu_layers, 0); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer_all + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < n_layer && hparams.is_swa(il); + const bool is_swa = il < n_layer_all && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -1281,13 +1281,13 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list }; // assign the repeating layers to the devices according to the splits - pimpl->dev_layer.resize(n_layer); - for (int il = 0; il < n_layer; ++il) { + pimpl->dev_layer.resize(n_layer_all); + for (int il = 0; il < n_layer_all; ++il) { pimpl->dev_layer[il] = get_layer_buft_list(il); } // assign the output layer - pimpl->dev_output = get_layer_buft_list(n_layer); + pimpl->dev_output = get_layer_buft_list(n_layer_all); const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED; @@ -1303,14 +1303,14 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { throw std::runtime_error("model has expert layers but 
no expert layers are used"); } - layers.resize(n_layer); + layers.resize(n_layer_all); // call the per-model loading function load_arch_tensors(ml); // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2) // this avoids having to add scale loading to every architecture - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { auto & layer = layers[i]; // attention weight scales (per-tensor, shape {1}) @@ -1568,7 +1568,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, n_layer); + const int n_gpu = std::min(n_gpu_layers, n_layer_all); int n_repeating = n_gpu; if (n_repeating > 0) { @@ -1577,8 +1577,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); - const int max_backend_supported_layers = n_layer + 1; - const int max_offloadable_layers = n_layer + 1; + const int max_backend_supported_layers = n_layer_all + 1; + const int max_offloadable_layers = n_layer_all + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); } From 5a69c974392020e514c3b2b2910bb92f847cb4c9 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sat, 6 Jun 2026 09:11:35 +0200 Subject: [PATCH 34/71] vulkan: check coopmat2 features before reporting support (#24186) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index df410368a79..fc9bc8fe376 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -6349,6 +6349,15 @@ static void ggml_vk_print_gpu_info(size_t idx) { } #endif +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features 
{}; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + } +#endif + VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; if (coopmat2_decode_vector_support) { @@ -6380,6 +6389,19 @@ static void ggml_vk_print_gpu_info(size_t idx) { #endif && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture); +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + coopmat2_support = coopmat2_support && + coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + coopmat2_features.cooperativeMatrixBlockLoads; +#else + coopmat2_support = false; +#endif + coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; #if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) coopmat2_decode_vector_support = false; From f5c6ae18278b75712fc992587d9ea527d4ea2218 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 6 Jun 2026 11:06:51 +0200 Subject: [PATCH 35/71] mtmd, server: add "placeholder bitmap" for counting tokens , add */input_tokens API (#23913) * mtmd: add "placeholder bitmap" for counting tokens w/o preprocessing * fast path skip preproc for placeholder * fix build * correct the api * add server endpoint + tests * add object name * update docs * add proxy handling * fix build * fix audio input path * use 
is_placeholder in process_mtmd_prompt() * nits * nits (2) * docs: clarify chat/completions/input_tokens is not official * fix merge problem --- tools/mtmd/clip-impl.h | 147 +++++++- tools/mtmd/clip.cpp | 215 +++++------ tools/mtmd/clip.h | 20 +- tools/mtmd/models/conformer.cpp | 2 +- tools/mtmd/models/exaone4_5.cpp | 4 +- tools/mtmd/models/glm4v.cpp | 4 +- tools/mtmd/models/granite-speech.cpp | 2 +- tools/mtmd/models/kimik25.cpp | 4 +- tools/mtmd/models/mimovl.cpp | 4 +- tools/mtmd/models/qwen2vl.cpp | 4 +- tools/mtmd/models/qwen3vl.cpp | 4 +- tools/mtmd/models/whisper-enc.cpp | 2 +- tools/mtmd/mtmd-cli.cpp | 2 +- tools/mtmd/mtmd-helper.cpp | 11 +- tools/mtmd/mtmd-helper.h | 4 +- tools/mtmd/mtmd-image.cpp | 354 +++++++++--------- tools/mtmd/mtmd.cpp | 208 +++++++--- tools/mtmd/mtmd.h | 5 + tools/server/README.md | 30 ++ tools/server/server-common.cpp | 4 +- tools/server/server-common.h | 3 +- tools/server/server-context.cpp | 74 +++- tools/server/server-context.h | 3 + tools/server/server.cpp | 9 +- .../server/tests/unit/test_chat_completion.py | 16 + tools/server/tests/unit/test_vision_api.py | 19 + 26 files changed, 732 insertions(+), 422 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 393e085f71e..794cb4d2b27 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -4,6 +4,7 @@ #include "gguf.h" #include "clip.h" +#include #include #include #include @@ -429,10 +430,68 @@ static projector_type clip_projector_type_from_string(const std::string & str) { // RGB uint8 image struct clip_image_u8 { - int nx; - int ny; + clip_image_size get_size() const { + return { nx, ny }; + } + + void set_size(clip_image_size size, bool is_placeholder) { + nx = size.width; + ny = size.height; + if (is_placeholder) { + buf.clear(); + } else { + buf.resize((size_t) nx * (size_t) ny * 3); + } + } + + void cpy_buf(const std::vector & new_buf) { + buf = new_buf; + } + + const std::vector & get_ro_buf() const { + if (is_placeholder()) { + throw 
std::runtime_error("this clip_image_u8 is a placeholder"); + } + return buf; + } + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation + + bool is_placeholder() const { + return buf.empty(); + } + + std::array get_pixel(int x, int y) const { + if (is_placeholder()) { + // return a dummy value, so that legacy code can still process image without errors + return { 0, 0, 0 }; + } + int idx = (y * nx + x) * 3; + return { buf[idx], buf[idx + 1], buf[idx + 2] }; + } + + void set_pixel(int x, int y, const std::array & rgb) { + if (is_placeholder()) { + return; // no-op + } + int idx = (y * nx + x) * 3; + buf[idx] = rgb[0]; + buf[idx + 1] = rgb[1]; + buf[idx + 2] = rgb[2]; + } + + size_t n_pixels() const { + return (size_t) nx * (size_t) ny; + } + + size_t n_elements() const { + return n_pixels() * 3; + } + + private: std::vector buf; + int nx = 0; + int ny = 0; }; // For images, buf.size() == nx*ny*3 @@ -440,15 +499,87 @@ struct clip_image_u8 { // For audio, only one channel is used, buf.size() == nx*ny // nx will be n_frames and ny will be n_mel struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; - // marks the global view in e.g., DeepSeek-OCR Models bool add_viewsep = false; - // whether a learned newline token should be appended after the image (eg Granite4 Vision) + // whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision) bool add_newline = false; + + clip_image_size get_size() const { + return { nx_, ny_ }; + } + + int nx() const { return nx_; } + int ny() const { return ny_; } + + void set_size(clip_image_size size, bool is_placeholder, bool is_audio) { + nx_ = size.width; + ny_ = size.height; + if (is_placeholder) { + buf.clear(); + } else { + if (is_audio) { + buf.resize((size_t) nx_ * (size_t) ny_); + } else { + buf.resize((size_t) nx_ * (size_t) ny_ * 3); + } + } + } + + void cpy_buf(const std::vector & new_buf) { + buf = 
new_buf; + } + + void from_u8(const clip_image_u8 & img) { + auto size = img.get_size(); + nx_ = size.width; + ny_ = size.height; + if (img.is_placeholder()) { + buf.clear(); + return; // no-op + } + buf.resize(img.n_elements()); + const auto & u8_buf = img.get_ro_buf(); + for (size_t i = 0; i < img.n_elements(); ++i) { + buf[i] = (float) u8_buf[i] / 255.0f; + } + } + + size_t n_pixels() const { + return (size_t) nx_ * (size_t) ny_; + } + + size_t n_elements() const { + return n_pixels() * 3; + } + + void normalize(const float mean[3], const float std[3]) { + if (is_placeholder()) { + return; // no-op + } + for (size_t i = 0; i < n_pixels(); ++i) { + buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0]; + buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1]; + buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2]; + } + } + + const std::vector & get_ro_buf() const { + if (is_placeholder()) { + throw std::runtime_error("this clip_image_f32 is a placeholder"); + } + return buf; + } + + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern + + bool is_placeholder() const { + return buf.empty(); + } + + private: + std::vector buf; + int nx_ = 0; + int ny_ = 0; }; // diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index c12c910a1c8..6e54524da02 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s } // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + const auto ppm_size = img.get_size(); + file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n"; // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { + const auto & ppm_buf = img.get_ro_buf(); + for (size_t i = 0; i < ppm_buf.size(); i += 3) { // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); + 
file.write(reinterpret_cast(&ppm_buf[i]), 3); } file.close(); @@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& return; } - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + const auto bmp_size = img.get_size(); + int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; + int widthInBytes = bmp_size.width * bytesPerPixel; int paddingAmount = (4 - (widthInBytes % 4)) % 4; int stride = widthInBytes + paddingAmount; @@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& }; // Total file size - fileSize = 54 + (stride * img.ny); + fileSize = 54 + (stride * bmp_size.height); fileHeader[2] = (unsigned char)(fileSize); fileHeader[3] = (unsigned char)(fileSize >> 8); fileHeader[4] = (unsigned char)(fileSize >> 16); @@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& }; // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); + infoHeader[4] = (unsigned char)(bmp_size.width); + infoHeader[5] = (unsigned char)(bmp_size.width >> 8); + infoHeader[6] = (unsigned char)(bmp_size.width >> 16); + infoHeader[7] = (unsigned char)(bmp_size.width >> 24); + infoHeader[8] = (unsigned char)(bmp_size.height); + infoHeader[9] = (unsigned char)(bmp_size.height >> 8); + infoHeader[10] = (unsigned char)(bmp_size.height >> 16); + infoHeader[11] = (unsigned char)(bmp_size.height >> 24); // Write file headers file.write(reinterpret_cast(fileHeader), 
sizeof(fileHeader)); @@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& // Pixel data std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { + for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < bmp_size.width; ++x) { // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; + const auto px = img.get_pixel(x, y); unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] + px[2], // BMP stores pixels in BGR format + px[1], + px[0] }; file.write(reinterpret_cast(pixel), 3); } @@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& // debug function to convert f32 to u8 static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + dst.set_size(src.get_size(), false); + const auto & src_buf = src.get_ro_buf(); + std::vector dst_buf(src.n_elements()); + for (size_t i = 0; i < src.n_elements(); ++i) { + dst_buf[i] = static_cast(std::min(std::max(int(src_buf[i] * 255.0f), 0), 255)); } + dst.cpy_buf(dst_buf); } #endif @@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : proj_type(ctx->proj_type()), img(img), patch_size(hparams.patch_size), - n_patches_x(img.nx / patch_size), - n_patches_y(img.ny / patch_size), + n_patches_x(img.nx() / patch_size), + n_patches_y(img.ny() / patch_size), n_patches(n_patches_x * n_patches_y), n_embd(hparams.n_embd), n_head(hparams.n_head), @@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, 
int il) const { // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; + const int height = img.ny() / patch_size; + const int width = img.nx() / patch_size; const uint32_t mode = interpolation_mode; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); @@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() { } ggml_tensor * clip_graph::build_inp_raw(int channels) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); return inp_raw; @@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale GGML_ASSERT(scale_factor > 1); const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; + int width = img.nx() / patch_size; + int height = img.ny() / patch_size; // pad width and height to factor const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; @@ -2805,13 +2809,12 @@ struct clip_model_loader { clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { - img->nx = hparams.warmup_image_size; - img->ny = hparams.warmup_image_size; - LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny); + const int sz = hparams.warmup_image_size; + img->set_size({sz, sz}, false, false); + LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz); } else { - img->nx = hparams.warmup_audio_size; - img->ny = hparams.n_mel_bins; - LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx); + img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false); + LOG_INF("%s: warmup with audio size = 
%d\n", __func__, hparams.warmup_audio_size); } batch.entries.push_back(std::move(img)); warmup(ctx_clip, batch); @@ -3108,12 +3111,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() { return new clip_image_f32_batch(); } -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { - if (nx) *nx = img->nx; - if (ny) *ny = img->ny; - return img->buf.data(); -} - void clip_image_size_free(struct clip_image_size * load_image_size) { if (load_image_size == nullptr) { return; @@ -3134,7 +3131,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id LOG_ERR("%s: invalid index %d\n", __func__, idx); return 0; } - return batch->entries[idx]->nx; + return batch->entries[idx]->nx(); } size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { @@ -3142,7 +3139,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id LOG_ERR("%s: invalid index %d\n", __func__, idx); return 0; } - return batch->entries[idx]->ny; + return batch->entries[idx]->ny(); } clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { @@ -3153,13 +3150,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc return batch->entries[idx].get(); } -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { - img->nx = nx; - img->ny = ny; - img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), rgb_pixels, img->buf.size()); -} - void clip_free(clip_ctx * ctx) { if (ctx == nullptr) { return; @@ -3167,20 +3157,6 @@ void clip_free(clip_ctx * ctx) { delete ctx; } -// deprecated -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - const int32_t nx = ctx->model.hparams.image_size; - const int32_t ny = ctx->model.hparams.image_size; - return clip_embd_nbytes_by_img(ctx, nx, ny); -} - -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { - 
clip_image_f32 img; - img.nx = img_w; - img.ny = img_h; - return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->model.hparams.image_size; } @@ -3211,9 +3187,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: - return (img->nx / params.patch_size) / 2; + return (img->nx() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: - return img->nx / (params.patch_size * params.n_merge); + return img->nx() / (params.patch_size * params.n_merge); default: break; } @@ -3233,9 +3209,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: - return (img->ny / params.patch_size) / 2; + return (img->ny() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: - return img->ny / (params.patch_size * params.n_merge); + return img->ny() / (params.patch_size * params.n_merge); default: break; } @@ -3247,7 +3223,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // for models with fixed size image, the input image is already pre-processed and resized to square int patch_size = params.patch_size; - int n_patches = (img->nx / patch_size) * (img->ny / patch_size); + int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size); projector_type proj = ctx->proj_type(); @@ -3313,14 +3289,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_YOUTUVL: { // dynamic size (2 conv, so double patch size) - int x_patch = img->nx / (params.patch_size * 2); - int y_patch = img->ny / (params.patch_size * 2); + int x_patch = img->nx() / (params.patch_size * 2); + int y_patch = img->ny() / (params.patch_size * 2); n_patches = x_patch * y_patch; } break; case 
PROJECTOR_TYPE_STEP3VL: { - int x_patch = img->nx / (params.patch_size * params.n_merge); - int y_patch = img->ny / (params.patch_size * params.n_merge); + int x_patch = img->nx() / (params.patch_size * params.n_merge); + int y_patch = img->ny() / (params.patch_size * params.n_merge); n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_GEMMA3: @@ -3347,8 +3323,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; - int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; - int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size; n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_PADDLEOCR: @@ -3364,8 +3340,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int n_merge = ctx->model.hparams.n_merge; - int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? 
n_merge : 1); if (ctx->model.token_embd_img_break) { n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } else { @@ -3378,7 +3354,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_MERALION: case PROJECTOR_TYPE_MUSIC_FLAMINGO: { - n_patches = img->nx; + n_patches = img->nx(); const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; if (ctx->model.audio_has_stack_frames()) { @@ -3400,11 +3376,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk const int chunk_size = 100; const int tokens_per_chunk = 13; - n_patches = (img->nx / chunk_size) * tokens_per_chunk; + n_patches = (img->nx() / chunk_size) * tokens_per_chunk; } break; case PROJECTOR_TYPE_GLMA: { - n_patches = img->nx; + n_patches = img->nx(); // whisper downscales input token by half after conv1d n_patches /= 2; // reshape by merge_factor @@ -3431,8 +3407,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; - int ow = (img->nx / patch_size) / merge; - int oh = (img->ny / patch_size) / merge; + int ow = (img->nx() / patch_size) / merge; + int oh = (img->ny() / patch_size) / merge; n_patches = (ow + 1) * oh + 2; } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: @@ -3446,13 +3422,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_LFM2A: { - n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2; } break; case PROJECTOR_TYPE_GEMMA4A: { // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2 // O = floor((I - 1) / 2) + 1 - int n = img->nx; + int n = img->nx(); for (int i = 0; i < 2; i++) { n = (n - 1) / 2 + 1; } @@ -3460,13 +3436,13 @@ int 
clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_GEMMA4UA: { - n_patches = img->nx; // no downsampling: one token per raw waveform frame + n_patches = img->nx(); // no downsampling: one token per raw waveform frame } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { const int ws = ctx->model.hparams.audio_proj_window_size; const int ds = ctx->model.hparams.audio_proj_downsample_rate; - n_patches = ((img->nx + ws - 1) / ws) * (ws / ds); + n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds); } break; case PROJECTOR_TYPE_GRANITE4_VISION: { @@ -3475,7 +3451,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144. const int window_side = ctx->model.hparams.downsample_window_side; const int query_side = ctx->model.hparams.downsample_query_side; - const int side = img->nx / params.patch_size; + const int side = img->nx() / params.patch_size; const int n = side / window_side; n_patches = (query_side * n) * (query_side * n); if (img->add_newline) { @@ -3525,8 +3501,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const auto & model = ctx->model; const auto & hparams = model.hparams; - const int image_size_width = imgs.entries[0]->nx; - const int image_size_height = imgs.entries[0]->ny; + const int image_size_width = imgs.entries[0]->nx(); + const int image_size_height = imgs.entries[0]->ny(); const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); @@ -3546,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return inp; }; - auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector & values) { ggml_tensor * cur = get_inp_tensor(name); GGML_ASSERT(cur->type == GGML_TYPE_F32); 
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); @@ -3564,7 +3540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (!imgs.is_audio) { size_t nelem = 0; for (const auto & img : imgs.entries) { - nelem += img->nx * img->ny * 3; + nelem += img->nx() * img->ny() * 3; } std::vector inp_raw(nelem); @@ -3580,19 +3556,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // ──────┘ x B for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx; - const int ny = imgs.entries[i]->ny; + const int nx = imgs.entries[i]->nx(); + const int ny = imgs.entries[i]->ny(); const int n = nx * ny; for (int b = 0; b < batch_size; b++) { + const auto & buf = imgs.entries[b]->get_ro_buf(); float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { size_t base_src = 3*(y * nx + x); // idx of the first channel size_t base_dst = y * nx + x; // idx of the first channel - batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; - batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; - batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + batch_entry[ base_dst] = buf[base_src ]; + batch_entry[1*n + base_dst] = buf[base_src + 1]; + batch_entry[2*n + base_dst] = buf[base_src + 2]; } } } @@ -3602,12 +3579,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } else { // audio input GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; - const int n_step = mel_inp->nx; - const int n_mel = mel_inp->ny; - std::vector inp_raw(n_step * n_mel); - std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); - set_input_f32("inp_raw", inp_raw); + const auto & buf = mel_inp->get_ro_buf(); + const int n_step = mel_inp->nx(); + const int n_mel = mel_inp->ny(); + GGML_ASSERT((size_t)n_step * n_mel == buf.size()); + + set_input_f32("inp_raw", 
buf); } // set input per projector @@ -4218,7 +4197,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ASSERT(imgs.entries.size() == 1); const auto & img0 = imgs.entries.front(); // Compute n_pos matching SSCP output: two stride-2 convs - int n_pos = img0->nx; + int n_pos = img0->nx(); for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; } // Chunked local attention: blocked causal mask and RPE @@ -4324,7 +4303,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // reshapes as ggml_get_rows gathers. The names are set // by g4v_gather() in models/granite4-vision.cpp. const int patch_size = model.hparams.patch_size; - const int image_side = imgs.entries.front()->nx / patch_size; + const int image_side = imgs.entries.front()->nx() / patch_size; const int window_side = hparams.downsample_window_side; const int query_side = hparams.downsample_query_side; const int n = image_side / window_side; @@ -4570,19 +4549,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { - clip_image_f32 clip_img; - clip_img.buf.resize(h * w * 3); - for (int i = 0; i < h*w*3; i++) - { - clip_img.buf[i] = img[i]; - } - clip_img.nx = w; - clip_img.ny = h; - clip_image_encode(ctx, n_threads, &clip_img, vec); - return true; -} - // // API used internally with mtmd // @@ -4591,17 +4557,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) { return ctx->proj_type(); } -void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) { - clip_image_f32 * audio = new clip_image_f32; - audio->nx = n_frames; - audio->ny = n_mel; - audio->buf.resize(n_frames * n_mel); - std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); - - 
batch->entries.push_back(clip_image_f32_ptr(audio)); - batch->is_audio = true; -} - const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index a62c9d61877..ba5b6197701 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -17,6 +17,9 @@ struct clip_ctx; struct clip_image_size { int width; int height; + bool operator==(const clip_image_size & other) const { + return width == other.width && height == other.height; + } }; struct clip_image_f32; @@ -54,9 +57,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params void clip_free(struct clip_ctx * ctx); -size_t clip_embd_nbytes(const struct clip_ctx * ctx); -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); - int32_t clip_get_image_size (const struct clip_ctx * ctx); int32_t clip_get_patch_size (const struct clip_ctx * ctx); int32_t clip_get_hidden_size(const struct clip_ctx * ctx); @@ -79,9 +79,6 @@ struct clip_image_u8 * clip_image_u8_init (void); struct clip_image_f32 * clip_image_f32_init(void); struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava -// nx, ny are the output image dimensions -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); - void clip_image_size_free (struct clip_image_size * img_size); void clip_image_u8_free (struct clip_image_u8 * img); void clip_image_f32_free(struct clip_image_f32 * img); @@ -94,12 +91,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data -/** - * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. 
- * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes - */ -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); - bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); @@ -107,11 +98,6 @@ bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated // do NOT add new functions like this -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); - -// use by audio input -void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel); - bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/conformer.cpp b/tools/mtmd/models/conformer.cpp index f58c5048f59..5f2c7b97314 100644 --- a/tools/mtmd/models/conformer.cpp +++ b/tools/mtmd/models/conformer.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_conformer::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int n_pos = n_frames / 2; const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); diff --git a/tools/mtmd/models/exaone4_5.cpp b/tools/mtmd/models/exaone4_5.cpp index 7bfbaca996b..bd9e8c74886 100644 --- a/tools/mtmd/models/exaone4_5.cpp +++ b/tools/mtmd/models/exaone4_5.cpp @@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % 
(patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); { ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 623d2e384b6..0e1d596b41b 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() { ggml_set_name(positions, "positions"); ggml_set_input(positions); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp index 5e66f75d0a9..0bd4d75ac51 100644 --- a/tools/mtmd/models/granite-speech.cpp +++ b/tools/mtmd/models/granite-speech.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_granite_speech::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int context_size = hparams.audio_chunk_size; const int ctc_layer = n_layer / 2; const int conv_kernel = hparams.audio_conv_kernel_size; diff --git a/tools/mtmd/models/kimik25.cpp b/tools/mtmd/models/kimik25.cpp index cf9f27f63af..cb345f0fc62 100644 --- a/tools/mtmd/models/kimik25.cpp +++ b/tools/mtmd/models/kimik25.cpp @@ -7,8 +7,8 @@ // with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3). 
ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; + const int height = img.ny() / patch_size; + const int width = img.nx() / patch_size; const uint32_t mode = interpolation_mode; GGML_ASSERT(pos_embd); diff --git a/tools/mtmd/models/mimovl.cpp b/tools/mtmd/models/mimovl.cpp index 19db88f132a..6ff1124a02f 100644 --- a/tools/mtmd/models/mimovl.cpp +++ b/tools/mtmd/models/mimovl.cpp @@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() { patch_size, patch_size, 0, 0, 1, 1); inp = ggml_add(ctx0, inp, inp_1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b] inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp index ebf10757376..b196587373a 100644 --- a/tools/mtmd/models/qwen2vl.cpp +++ b/tools/mtmd/models/qwen2vl.cpp @@ -19,8 +19,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index fa1100dda8d..9968933ed6c 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_qwen3vl::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, 
model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp index 2a82ae50bf5..49d5dd5add3 100644 --- a/tools/mtmd/models/whisper-enc.cpp +++ b/tools/mtmd/models/whisper-enc.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_whisper_enc::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int n_pos = n_frames / 2; GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index d6e551618e8..bd7f9871c3c 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -166,7 +166,7 @@ struct mtmd_cli_context { } bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false)); if (!bmp.ptr) { return false; } diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 40940741637..94ad01511ed 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int } // namespace audio_helpers -mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) { if (audio_helpers::is_audio_file((const char *)buf, len)) { std::vector pcmf32; const int sample_rate = mtmd_get_audio_sample_rate(ctx); @@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne LOG_ERR("Unable to read 
WAV audio file from buffer\n"); return nullptr; } - return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); + return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data()); } // otherwise, we assume it's an image @@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne LOG_ERR("%s: failed to decode image bytes\n", __func__); return nullptr; } - result = mtmd_bitmap_init(nx, ny, data); + result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data); stbi_image_free(data); } return result; } -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) { std::vector buf; FILE * f = fopen(fname, "rb"); if (!f) { @@ -533,5 +533,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * return nullptr; } - return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); + return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder); } + diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 57da78a754f..7eecbb06723 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -29,7 +29,7 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da // it calls mtmd_helper_bitmap_init_from_buf() internally // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); // helper function to construct a mtmd_bitmap from a buffer containing a file // supported formats: @@ -38,7 +38,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con // note: audio files will be auto-detected based on magic bytes // returns nullptr 
on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index caf72d53621..c86a065c814 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -9,25 +9,12 @@ // void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - // TODO @ngxson : seems like this could be done more efficiently on cgraph - for (size_t i = 0; i < src.buf.size(); ++i) { - int c = i % 3; // rgb - dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; - } + dst.from_u8(src); + dst.normalize(mean, std); } void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(src.buf[i]); - } + dst.from_u8(src); } // set of tools to manipulate images @@ -40,13 +27,16 @@ struct img_tool { resize_algo algo, pad_style padding = PAD_CEIL, std::array pad_color = {0, 0, 0}) { - dst.nx = target_resolution.width; - dst.ny = target_resolution.height; - dst.buf.resize(3 * dst.nx * dst.ny); + dst.set_size(target_resolution, src.is_placeholder()); - if (dst.nx == src.nx && dst.ny == src.ny) { + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + if (dst.get_size() == src.get_size()) { // no resize needed, simple copy - dst.buf = 
src.buf; + dst.cpy_buf(src.get_ro_buf()); return; } @@ -68,17 +58,17 @@ struct img_tool { } else { // resize with padding clip_image_u8 resized_image; - float scale_w = static_cast(target_resolution.width) / src.nx; - float scale_h = static_cast(target_resolution.height) / src.ny; + float scale_w = static_cast(target_resolution.width) / src.get_size().width; + float scale_h = static_cast(target_resolution.height) / src.get_size().height; float scale = std::min(scale_w, scale_h); int new_width, new_height; if (padding == PAD_NEAREST) { - new_width = std::min(static_cast(std::round(src.nx * scale)), target_resolution.width); - new_height = std::min(static_cast(std::round(src.ny * scale)), target_resolution.height); + new_width = std::min(static_cast(std::round(src.get_size().width * scale)), target_resolution.width); + new_height = std::min(static_cast(std::round(src.get_size().height * scale)), target_resolution.height); } else { - new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); - new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + new_width = std::min(static_cast(std::ceil(src.get_size().width * scale)), target_resolution.width); + new_height = std::min(static_cast(std::ceil(src.get_size().height * scale)), target_resolution.height); } switch (algo) { @@ -112,18 +102,17 @@ struct img_tool { static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0); - GGML_ASSERT(x + w <= image.nx && y + h <= image.ny); - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); + GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height); + dst.set_size({w, h}, image.is_placeholder()); + + if (image.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + 
j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + dst.set_pixel(j, i, image.get_pixel(x + j, y + i)); } } } @@ -181,81 +170,101 @@ struct img_tool { // draw src image into dst image at offset (offset_x, offset_y) static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { - for (int y = 0; y < src.ny; ++y) { - for (int x = 0; x < src.nx; ++x) { + if (src.is_placeholder()) { + // no-op for placeholder image + return; + } + + const auto src_size = src.get_size(); + const auto dst_size = dst.get_size(); + for (int y = 0; y < src_size.height; ++y) { + for (int x = 0; x < src_size.width; ++x) { int dx = x + offset_x; int dy = y + offset_y; // skip pixels that would be out of bounds in the destination - if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) { continue; } - size_t dst_idx = 3 * (static_cast(dy) * dst.nx + static_cast(dx)); - size_t src_idx = 3 * (static_cast(y) * src.nx + static_cast(x)); - dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + dst.set_pixel(dx, dy, src.get_pixel(x, y)); } } } // fill the image with a solid color static void fill(clip_image_u8 & img, const std::array & color) { - for (size_t i = 0; i < img.buf.size(); i += 3) { - img.buf[i] = color[0]; - img.buf[i + 1] = color[1]; - img.buf[i + 2] = color[2]; + if (img.is_placeholder()) { + // no-op for placeholder image + return; + } + + const auto size = img.get_size(); + for (int y = 0; y < size.height; ++y) { + for (int x = 0; x < size.width; ++x) { + img.set_pixel(x, y, color); + } } } private: // Bilinear resize function static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { - if (src.nx == 0 || src.ny 
== 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; } + const auto src_size = src.get_size(); + if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; } if (target_width <= 0) target_width = 1; if (target_height <= 0) target_height = 1; - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false); - float x_ratio = target_width > 1 ? static_cast(src.nx - 1) / (target_width - 1) : 0.0f; - float y_ratio = target_height > 1 ? static_cast(src.ny - 1) / (target_height - 1) : 0.0f; + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + float x_ratio = target_width > 1 ? static_cast(src_size.width - 1) / (target_width - 1) : 0.0f; + float y_ratio = target_height > 1 ? static_cast(src_size.height - 1) / (target_height - 1) : 0.0f; for (int y = 0; y < target_height; ++y) { for (int x = 0; x < target_width; ++x) { float px = x * x_ratio; float py = y * y_ratio; - int x0 = std::min(static_cast(px), src.nx - 1); - int y0 = std::min(static_cast(py), src.ny - 1); - int x1 = std::min(x0 + 1, src.nx - 1); - int y1 = std::min(y0 + 1, src.ny - 1); + int x0 = std::min(static_cast(px), src_size.width - 1); + int y0 = std::min(static_cast(py), src_size.height - 1); + int x1 = std::min(x0 + 1, src_size.width - 1); + int y1 = std::min(y0 + 1, src_size.height - 1); float xf = px - x0; float yf = py - y0; + const auto p00 = src.get_pixel(x0, y0); + const auto p10 = src.get_pixel(x1, y0); + const auto p01 = src.get_pixel(x0, y1); + const auto p11 = src.get_pixel(x1, y1); + + std::array pixel; for (int c = 0; c < 3; ++c) { - float top = lerp(static_cast(src.buf[3 * (y0 * src.nx + x0) + c]), - static_cast(src.buf[3 * (y0 * src.nx + x1) + c]), - xf); - float bottom = lerp(static_cast(src.buf[3 * (y1 * src.nx + x0) + c]), - static_cast(src.buf[3 * (y1 * src.nx + x1) + c]), - xf); - dst.buf[3 * (y * target_width 
+ x) + c] = static_cast(lerp(top, bottom, yf)); + float top = lerp(static_cast(p00[c]), static_cast(p10[c]), xf); + float bottom = lerp(static_cast(p01[c]), static_cast(p11[c]), xf); + pixel[c] = static_cast(lerp(top, bottom, yf)); } + dst.set_pixel(x, y, pixel); } } } // Bicubic resize function // part of image will be cropped if the aspect ratio is different - static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { - const int nx = img.nx; - const int ny = img.ny; + static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const auto img_size = img.get_size(); + const int nx = img_size.width; + const int ny = img_size.height; + + dst.set_size({target_width, target_height}, false); - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + if (img.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } float Cc; float C[5] = {}; @@ -280,12 +289,13 @@ struct img_tool { dx = tx * j - x; dy = ty * i - y; + std::array pixel; for (k = 0; k < 3; k++) { for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - 
img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; @@ -303,13 +313,12 @@ struct img_tool { Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + pixel[k] = Cc2; } } + dst.set_pixel(j, i, pixel); } } - - return true; } // Bicubic resize function using Pillow's ImagingResample algorithm @@ -455,16 +464,17 @@ struct img_tool { }; // Horizontal resampling pass - // Resizes width from imIn.nx to imOut.nx, preserving height + // Resizes width from imIn to out_nx, preserving height auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int out_nx, int ksize, const std::vector & bounds, const std::vector & weights) { - imOut.ny = imIn.ny; - imOut.buf.resize(3 * imOut.nx * imOut.ny); + const int in_ny = imIn.get_size().height; + imOut.set_size({out_nx, in_ny}, false); // Process each row independently - for (int yy = 0; yy < imOut.ny; yy++) { + for (int yy = 0; yy < in_ny; yy++) { // For each output pixel in this row - for (int xx = 0; xx < imOut.nx; xx++) { + for (int xx = 0; xx < out_nx; xx++) { // Get the range of input pixels and filter coefficients int xmin = bounds[xx * 2 + 0]; // First input pixel index int xcnt = bounds[xx * 2 + 1]; // Number of input pixels @@ -476,36 +486,36 @@ struct img_tool { // Convolve: sum weighted input pixels for (int x = 0; x < xcnt; x++) { - int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel - ss2 += 
static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel + const auto src_px = imIn.get_pixel(x + xmin, yy); + ss0 += src_px[0] * weights[xx * ksize + x]; // R channel + ss1 += src_px[1] * weights[xx * ksize + x]; // G channel + ss2 += src_px[2] * weights[xx * ksize + x]; // B channel } // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS), + clip8(ss1 >> PRECISION_BITS), + clip8(ss2 >> PRECISION_BITS)}); } } }; // Vertical resampling pass - // Resizes height from imIn.ny to imOut.ny, preserving width + // Resizes height from imIn to out_ny, preserving width auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int out_ny, int ksize, const std::vector & bounds, const std::vector & weight) { - imOut.nx = imIn.nx; - imOut.buf.resize(3 * imOut.nx * imOut.ny); + const int in_nx = imIn.get_size().width; + imOut.set_size({in_nx, out_ny}, false); // For each output row - for (int yy = 0; yy < imOut.ny; yy++) { + for (int yy = 0; yy < out_ny; yy++) { // Get the range of input rows and filter coefficients int ymin = bounds[yy * 2 + 0]; // First input row index int ycnt = bounds[yy * 2 + 1]; // Number of input rows // Process each column in this output row - for (int xx = 0; xx < imOut.nx; xx++) { + for (int xx = 0; xx < in_nx; xx++) { // Initialize accumulators for RGB channels with rounding bias int32_t ss0 = 1 << (PRECISION_BITS - 1); int32_t ss1 = 1 << (PRECISION_BITS - 1); @@ -513,27 +523,23 @@ struct img_tool { // Convolve: sum weighted input pixels vertically for (int y = 0; y < ycnt; y++) { - int src_idx = ((y + ymin) * imIn.nx + xx) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel - ss1 += 
static_cast(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel + const auto src_px = imIn.get_pixel(xx, y + ymin); + ss0 += src_px[0] * weight[yy * ksize + y]; // R channel + ss1 += src_px[1] * weight[yy * ksize + y]; // G channel + ss2 += src_px[2] * weight[yy * ksize + y]; // B channel } // Convert back from fixed-point and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS), + clip8(ss1 >> PRECISION_BITS), + clip8(ss2 >> PRECISION_BITS)}); } } }; // Main resampling logic using separable two-pass approach - const int src_width = img.nx; - const int src_height = img.ny; - - dst.nx = target_width; - dst.ny = target_height; + const int src_width = img.get_size().width; + const int src_height = img.get_size().height; bool need_horizontal = (target_width != src_width); bool need_vertical = (target_height != src_height); @@ -555,18 +561,20 @@ struct img_tool { if (need_horizontal && need_vertical) { // Both horizontal and vertical clip_image_u8 temp; - temp.nx = target_width; - resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); - resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); + resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz); + resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert); } else if (need_horizontal) { // Only horizontal - resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); + resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz); } else if (need_vertical) { // Only vertical - resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); + resample_vertical(img, dst, 
target_height, ksize_vert, bounds_vert, weights_vert); } else { // No resizing needed - direct copy - dst.buf = img.buf; + dst.set_size(img.get_size(), img.is_placeholder()); + if (!img.is_placeholder()) { + dst.cpy_buf(img.get_ro_buf()); + } } return true; @@ -588,7 +596,7 @@ struct img_tool { // bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); std::vector imgs = slice_image(img, inst); @@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0); clip_image_u8 resized_image; - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( @@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(hparams.image_longest_edge > 0); clip_image_u8 resized_image; - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge const int cur_merge = hparams.n_merge == 0 ? 
1 : hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( @@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli // multiples of image_size (always rounding up) // // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( original_size, hparams.image_size, hparams.image_longest_edge); // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", @@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(!hparams.image_res_candidates.empty()); - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); std::vector imgs = slice_image(img, inst, false); @@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - const int64_t orig_area = static_cast(img.nx) * img.ny; + const int64_t orig_area = static_cast(img.n_pixels()); size_t mode_i = 0; int64_t min_diff = std::numeric_limits::max(); @@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, // emit 768x768 local tiles when the image is larger than a tile in either // dimension, then always a 1024x1024 global view. order: [tiles..., global]. 
- if (img.nx > tile_size || img.ny > tile_size) { - const float aspect_ratio = static_cast(img.nx) / img.ny; + const auto img_size = img.get_size(); + if (img_size.width > tile_size || img_size.height > tile_size) { + const float aspect_ratio = static_cast(img_size.width) / img_size.height; const auto target_ratios = get_target_ratios(); - const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height); // stretch onto the grid (no aspect preserve), then crop tiles row-major. clip_image_u8 refined; @@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32( int target_height, const float mean[3], const float std[3]) { - if (src.nx == target_width && src.ny == target_height) { + const auto src_size = src.get_size(); + if (src_size.width == target_width && src_size.height == target_height) { img_u8_to_f32(src, dst, mean, std); return; } - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false, false); + + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + const float scale_x = static_cast(src_size.width) / target_width; + const float scale_y = static_cast(src_size.height) / target_height; - const float scale_x = static_cast(src.nx) / target_width; - const float scale_y = static_cast(src.ny) / target_height; + std::vector local_buf(3 * target_width * target_height); for (int y = 0; y < target_height; ++y) { const float src_y = (static_cast(y) + 0.5f) * scale_y - 0.5f; const int y0_floor = static_cast(std::floor(src_y)); - const int y0 = std::max(0, std::min(y0_floor, src.ny - 1)); - const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1)); + const int y0 = std::max(0, std::min(y0_floor, src_size.height - 1)); + const int y1 
= std::max(0, std::min(y0_floor + 1, src_size.height - 1)); const float ly = src_y - y0_floor; for (int x = 0; x < target_width; ++x) { const float src_x = (static_cast(x) + 0.5f) * scale_x - 0.5f; const int x0_floor = static_cast(std::floor(src_x)); - const int x0 = std::max(0, std::min(x0_floor, src.nx - 1)); - const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1)); + const int x0 = std::max(0, std::min(x0_floor, src_size.width - 1)); + const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1)); const float lx = src_x - x0_floor; - const size_t idx00 = 3 * (y0 * src.nx + x0); - const size_t idx01 = 3 * (y0 * src.nx + x1); - const size_t idx10 = 3 * (y1 * src.nx + x0); - const size_t idx11 = 3 * (y1 * src.nx + x1); - const size_t idx_dst = 3 * (y * target_width + x); + const auto p00 = src.get_pixel(x0, y0); + const auto p01 = src.get_pixel(x1, y0); + const auto p10 = src.get_pixel(x0, y1); + const auto p11 = src.get_pixel(x1, y1); + const size_t idx_dst = 3 * (y * target_width + x); for (int c = 0; c < 3; ++c) { - const float v00 = (static_cast(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c]; - const float v01 = (static_cast(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c]; - const float v10 = (static_cast(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c]; - const float v11 = (static_cast(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c]; + const float v00 = (static_cast(p00[c]) / 255.0f - mean[c]) / std[c]; + const float v01 = (static_cast(p01[c]) / 255.0f - mean[c]) / std[c]; + const float v10 = (static_cast(p10[c]) / 255.0f - mean[c]) / std[c]; + const float v11 = (static_cast(p11[c]) / 255.0f - mean[c]) / std[c]; const float top = v00 + (v01 - v00) * lx; const float bot = v10 + (v11 - v10) * lx; - dst.buf[idx_dst + c] = top + (bot - top) * ly; + local_buf[idx_dst + c] = top + (bot - top) * ly; } } } + dst.cpy_buf(local_buf); } int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) { @@ -1341,26 +1357,26 @@ 
std::vector mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) { clip_image_u8 resized = img; - const float aspect_ratio = img.ny > 0 ? static_cast(img.nx) / img.ny : 1.0f; - if (std::min(img.nx, img.ny) < 32 && + const auto img_size = img.get_size(); + const float aspect_ratio = img_size.height > 0 ? static_cast(img_size.width) / img_size.height : 1.0f; + if (std::min(img_size.width, img_size.height) < 32 && (aspect_ratio > wide_aspect_ratio_limit || aspect_ratio < 1.0f / wide_aspect_ratio_limit)) { - const int square_size = std::max(img.nx, img.ny); + const int square_size = std::max(img_size.width, img_size.height); clip_image_u8 padded; - padded.nx = square_size; - padded.ny = square_size; - padded.buf.resize(3 * square_size * square_size); + padded.set_size({square_size, square_size}, false); img_tool::fill(padded, {0, 0, 0}); img_tool::composite(padded, img, 0, 0); resized = std::move(padded); } const int max_image_size = get_image_longest_edge(params); - if (std::max(resized.nx, resized.ny) > max_image_size) { - const float scale = static_cast(max_image_size) / std::max(resized.nx, resized.ny); + const auto resized_size = resized.get_size(); + if (std::max(resized_size.width, resized_size.height) > max_image_size) { + const float scale = static_cast(max_image_size) / std::max(resized_size.width, resized_size.height); const clip_image_size new_size = { - std::max(1, static_cast(std::floor(resized.nx * scale))), - std::max(1, static_cast(std::floor(resized.ny * scale))), + std::max(1, static_cast(std::floor(resized_size.width * scale))), + std::max(1, static_cast(std::floor(resized_size.height * scale))), }; clip_image_u8 scaled; img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE); @@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 clip_image_u8 
mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) { clip_image_u8 dst; - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h, 0); + dst.set_size({w, h}, false); + img_tool::fill(dst, {0, 0, 0}); + const auto img_size = image.get_size(); const int src_x0 = std::max(0, x); const int src_y0 = std::max(0, y); - const int src_x1 = std::min(image.nx, x + w); - const int src_y1 = std::min(image.ny, y + h); + const int src_x1 = std::min(img_size.width, x + w); + const int src_y1 = std::min(img_size.height, y + h); if (src_x0 >= src_x1 || src_y0 >= src_y1) { return dst; @@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli for (int yy = 0; yy < src_y1 - src_y0; ++yy) { for (int xx = 0; xx < src_x1 - src_x0; ++xx) { - const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx)); - const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx)); - dst.buf[dst_idx + 0] = image.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy)); } } @@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { clip_image_u8 prepared = prepare_image(img, hparams); - const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny}); + const auto instructions = build_slice_instructions(hparams, prepared.get_size()); clip_image_f32_ptr overview_f32(clip_image_f32_init()); img_u8_resize_bilinear_to_f32( @@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip } clip_image_u8 img_for_crop = prepared; - if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) { + const auto 
prepared_size = prepared.get_size(); + if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) { clip_image_u8 refined; img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE); img_for_crop = std::move(refined); @@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip hparams.image_max_pixels / (patch_size * patch_size) : 256; // Linear search for optimal scale to fit within max_num_patches + const auto img_size = img.get_size(); float scale = 1.0f; - int target_height = img.ny; - int target_width = img.nx; + int target_height = img_size.height; + int target_width = img_size.width; auto get_scaled_image_size = [align_size](float scale, int size) -> int { float scaled_size = size * scale; @@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip // Linear search with 0.02 step size while (scale > 0.0f) { - target_height = get_scaled_image_size(scale, img.ny); - target_width = get_scaled_image_size(scale, img.nx); + target_height = get_scaled_image_size(scale, img_size.height); + target_width = get_scaled_image_size(scale, img_size.width); int num_patches_h = target_height / patch_size; int num_patches_w = target_width / patch_size; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 260f307560a..e1f8e2a3359 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -26,12 +26,46 @@ // represents raw image data, layout is RGBRGBRGB... // length of data must be nx * ny * 3 +// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ... 
+// length of data must be nx * sizeof(float) struct mtmd_bitmap { - uint32_t nx; - uint32_t ny; - std::vector data; + uint32_t nx = 0; + uint32_t ny = 0; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking bool is_audio = false; // true if the bitmap is audio + + mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny) + : nx(nx), ny(ny) { + if (data) { + size_t data_size = (size_t)nx * ny * 3; + this->data.resize(data_size); + std::memcpy(this->data.data(), data, data_size); + } + } + + mtmd_bitmap(const unsigned char * data, uint32_t n_samples) + : nx(n_samples), ny(1), is_audio(true) { + if (data) { + size_t data_size = (size_t)nx * sizeof(float); + this->data.resize(data_size); + std::memcpy(this->data.data(), data, data_size); + } + } + + const std::vector & get_ro_buf() const { + return data; + } + + bool is_placeholder() const { + return data.empty(); + } + + size_t n_bytes() const { + return data.size(); + } + + private: + std::vector data; }; // position indexing for decoder model @@ -42,8 +76,8 @@ enum mtmd_pos_type { }; struct mtmd_image_tokens { - uint32_t nx; // number of tokens in x direction - uint32_t ny; // number of tokens in y direction + uint32_t nx = 0; // number of tokens in x direction + uint32_t ny = 0; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) uint32_t n_tokens() const { @@ -56,6 +90,16 @@ struct mtmd_image_tokens { clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + // true if one of entries in batch_f32 is a placeholder + bool is_placeholder() const { + for (const auto & entry : batch_f32.entries) { + if (entry->is_placeholder()) { + return true; + } + } + return false; + } + mtmd_image_tokens clone() { return 
mtmd_image_tokens{ nx, @@ -70,10 +114,20 @@ struct mtmd_image_tokens { using mtmd_image_tokens_ptr = std::unique_ptr; struct mtmd_audio_tokens { - uint32_t n_tokens; // number of tokens + uint32_t n_tokens = 0; // number of tokens clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + // true if one of entries in batch_f32 is a placeholder + bool is_placeholder() const { + for (const auto & entry : batch_f32.entries) { + if (entry->is_placeholder()) { + return true; + } + } + return false; + } + mtmd_audio_tokens clone() { return mtmd_audio_tokens{ n_tokens, @@ -795,16 +849,19 @@ struct mtmd_tokenizer { } // sanity check - GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0); - GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3); + if (bitmap->nx <= 0 || bitmap->ny <= 0) { + LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n", + __func__, bitmap->nx, bitmap->ny); + return 2; + } GGML_ASSERT(ctx->image_preproc != nullptr); // convert mtmd_bitmap to clip_image_u8 clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->nx = bitmap->nx; - img_u8->ny = bitmap->ny; - img_u8->buf.resize(bitmap->data.size()); - std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); + img_u8->set_size( + {(int)bitmap->nx, (int)bitmap->ny}, + bitmap->is_placeholder()); + img_u8->cpy_buf(bitmap->get_ro_buf()); // preprocess image clip_image_f32_batch batch_f32; @@ -949,7 +1006,7 @@ struct mtmd_tokenizer { return 2; } - if (bitmap->data.size() == 0) { + if (bitmap->nx == 0) { LOG_ERR("%s: error: empty audio data\n", __func__); return 2; } @@ -960,26 +1017,46 @@ struct mtmd_tokenizer { // sanity check GGML_ASSERT(ctx->audio_preproc != nullptr); - GGML_ASSERT(bitmap->data.size() > sizeof(float)); - GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0); // preprocess audio std::vector mel_spec_chunks; - const float * samples = (const float 
*)bitmap->data.data(); - size_t n_samples = bitmap->data.size() / sizeof(float); - bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks); - if (!ok) { - LOG_ERR("Unable to preprocess audio\n"); - return 2; + { + std::vector dummy; + const float * samples = nullptr; + size_t n_samples = 0; + if (bitmap->is_placeholder()) { + // TODO @ngxson : skip underlay processing if bitmap is placeholder + GGML_ASSERT(bitmap->ny == 1); + + dummy.resize(bitmap->nx); + samples = dummy.data(); + n_samples = dummy.size(); + } else { + const auto & buf = bitmap->get_ro_buf(); + GGML_ASSERT(buf.size() > sizeof(float)); + GGML_ASSERT(buf.size() % sizeof(float) == 0); + + samples = (const float *)buf.data(); + n_samples = buf.size() / sizeof(float); + } + bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks); + if (!ok) { + LOG_ERR("Unable to preprocess audio\n"); + return 2; + } } // consider each mel_spec as a separate audio chunk // TODO: maybe support batching, but this may come with memory cost for (auto & mel_spec : mel_spec_chunks) { + const bool is_placeholder = mel_spec.data.empty(); + clip_image_f32_ptr mel_f32(clip_image_f32_init()); - mel_f32->nx = mel_spec.n_len; - mel_f32->ny = mel_spec.n_mel; - mel_f32->buf = std::move(mel_spec.data); + mel_f32->set_size( + {mel_spec.n_len, mel_spec.n_mel}, + is_placeholder, /* is_audio */ true); + mel_f32->cpy_buf(mel_spec.data); + size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get()); clip_image_f32_batch batch_f32; @@ -1098,12 +1175,28 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: model does not support vision input\n", __func__); return 1; } + if (chunk->tokens_image == nullptr) { + LOG_ERR("%s: image tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_image->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } return mtmd_encode(ctx, chunk->tokens_image.get()); } 
else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); return 1; } + if (chunk->tokens_audio == nullptr) { + LOG_ERR("%s: audio tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_audio->is_placeholder()) { + LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); + return 1; + } int n_mmproj_embd = ctx->n_embd_text; ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( @@ -1141,6 +1234,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { + if (entries[i]->is_placeholder()) { + LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i); + return 1; + } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); ok = clip_image_encode( ctx_clip, @@ -1150,6 +1247,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { + if (image_tokens->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } ok = clip_image_batch_encode( ctx_clip, ctx->n_threads, @@ -1207,24 +1308,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) { mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data) { - mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = nx; - bitmap->ny = ny; - size_t data_size = (size_t)nx * ny * 3; - bitmap->data.resize(data_size); - std::memcpy(bitmap->data.data(), data, data_size); + mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny); return bitmap; } mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data) { - mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = n_samples; - bitmap->ny = 1; - bitmap->is_audio = 
true; - size_t data_size = n_samples * sizeof(float); - bitmap->data.resize(data_size); - std::memcpy(bitmap->data.data(), data, data_size); + mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, n_samples); + GGML_ASSERT(bitmap->is_audio); + if (!bitmap->is_placeholder()) { + GGML_ASSERT(bitmap->get_ro_buf().size() == n_samples * sizeof(float)); + } return bitmap; } @@ -1237,11 +1331,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) { } const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) { - return bitmap->data.data(); + return bitmap->get_ro_buf().data(); } size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { - return bitmap->data.size(); + return bitmap->get_ro_buf().size(); } bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { @@ -1535,14 +1629,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector img_buf; + img_buf.reserve(img_sz * img_sz); for (const auto & row : image) { - inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end()); + img_buf.insert(img_buf.end(), row.begin(), row.end()); } - LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny); + clip_image_f32 inp_image; + inp_image.set_size({img_sz, img_sz}, false, false); + inp_image.cpy_buf(img_buf); + LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz); mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image); } @@ -1552,16 +1648,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector & inpu return; } int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins; - clip_image_f32 inp_audio; - inp_audio.nx = input.size(); - inp_audio.ny = n_mel; - inp_audio.buf.resize(input.size() * n_mel); - for (size_t i = 0; i < input.size(); i++) { + const int audio_nx = (int)input.size(); + std::vector audio_buf(audio_nx * n_mel); + for (int i = 0; i < audio_nx; i++) { for (int j = 0; j < n_mel; j++) { - inp_audio.buf[j * inp_audio.nx + i] = input[i]; + 
audio_buf[j * audio_nx + i] = input[i]; } } - LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny); + clip_image_f32 inp_audio; + inp_audio.set_size({audio_nx, n_mel}, false, true); + inp_audio.cpy_buf(audio_buf); + LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel); mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio); } @@ -1571,9 +1668,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector return; } clip_image_u8 img_u8; - img_u8.nx = nx; - img_u8.ny = ny; - img_u8.buf = rgb_values; + img_u8.set_size({nx, ny}, false); + img_u8.cpy_buf(rgb_values); clip_image_f32_batch batch_f32; GGML_ASSERT(ctx->image_preproc != nullptr); bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32); @@ -1583,7 +1679,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector } LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size()); for (size_t i = 0; i < batch_f32.entries.size(); i++) { - LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny); + LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny()); // TODO: better way to dump entry content? 
} } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 5d518df799e..b3154c8d55d 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -136,6 +136,11 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) +// +// if data == nullptr: +// the bitmap is considered "empty", and will be treated as a placeholder for counting tokens +// you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens +// note: passing a placeholder bitmap to mtmd_encode() will return an error MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); diff --git a/tools/server/README.md b/tools/server/README.md index 3e14f5e6a20..bf056dc60b1 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1447,6 +1447,36 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` +### POST `/v1/responses/input_tokens`: Token Counting + +Similar to [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count). + +Example response: + +```json +{ + "object": "response.input_tokens", + "input_tokens": 11 +} +``` + +### POST `/v1/chat/completions/input_tokens`: Token Counting + +Similar to [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count), but accepts a chat completion body as input. + +Note: This is not an official OAI endpoint, but is added for completeness and convenience. 
+ +Example response: + +```json +{ + "object": "response.input_tokens", + "input_tokens": 11 +} +``` + +## Anthropic-compatible API Endpoints + ### POST `/v1/messages`: Anthropic-compatible Messages API Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps. diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 4c3f16a0a3d..dfd286d24e2 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -713,10 +713,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) { return std::to_string(hash); } -server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) { +server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder) { mtmd::bitmaps bitmaps; for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder)); if (!bmp.ptr) { throw std::runtime_error("Failed to load image or audio file"); } diff --git a/tools/server/server-common.h b/tools/server/server-common.h index c28558d8b7b..51b16131782 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -258,7 +258,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, size_t validate_utf8(const std::string& text); // process mtmd prompt, return the server_tokens containing both text tokens and media chunks -server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files); +// if is_placeholder is true, the media chunk will be treated as placeholder for counting tokens; the output tokens are not usable for actual inference (e.g. 
for submitting a task to server_queue) +server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder = false); /** * break the input "prompt" object into multiple prompt if needed, then tokenize them diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ab0d5944763..5d546d09c22 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -4333,6 +4333,10 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_CHAT); }; + this->post_chat_completions_tok = [this](const server_http_req & req) { + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT); + }; + this->post_control = [this](const server_http_req & req) { auto res = create_response(); const json body = json::parse(req.body); @@ -4388,6 +4392,10 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_RESP); }; + this->post_responses_tok_oai = [this](const server_http_req & req) { + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP); + }; + this->post_transcriptions_oai = [this](const server_http_req & req) { auto res = create_response(); @@ -4435,20 +4443,7 @@ void server_routes::init_routes() { }; this->post_anthropic_count_tokens = [this](const server_http_req & req) { - auto res = create_response(); - std::vector files; - json body = server_chat_convert_anthropic_to_oai(json::parse(req.body)); - SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); - SRV_DBG("converted request: %s\n", body.dump().c_str()); - json body_parsed = oaicompat_chat_params_parse( - body, - meta->chat_params, - files); - - json prompt = body_parsed.at("prompt"); - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true); - res->ok({{"input_tokens", static_cast(tokens.size())}}); - return res; + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, 
TASK_RESPONSE_TYPE_ANTHROPIC); }; // same with handle_chat_completions, but without inference part @@ -4928,3 +4923,54 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons res->ok(root); return res; } + +std::unique_ptr server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) { + auto res = create_response(); + std::vector files; + json body = json::parse(req.body); + bool is_oai = false; + + switch (res_type) { + case TASK_RESPONSE_TYPE_OAI_CHAT: + { + is_oai = true; + } break; + case TASK_RESPONSE_TYPE_OAI_RESP: + { + is_oai = true; + body = server_chat_convert_responses_to_chatcmpl(body); + } break; + case TASK_RESPONSE_TYPE_ANTHROPIC: + { + body = server_chat_convert_anthropic_to_oai(body); + } break; + default: + res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json body_parsed = oaicompat_chat_params_parse( + body, + meta->chat_params, + files); + json prompt = body_parsed.at("prompt"); + // SRV_DBG("prompt = %s\n", prompt.dump().c_str()); + + // TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places + size_t n_tokens; + if (mctx != nullptr) { + if (!prompt.is_string()) { + throw std::runtime_error("for mtmd, input prompt must be a string."); + } + n_tokens = process_mtmd_prompt(mctx, prompt.get(), files, true).size(); + } else { + n_tokens = tokenize_mixed(vocab, prompt, true, true).size(); + } + + json response = {{"input_tokens", static_cast(n_tokens)}}; + if (is_oai) { + response["object"] = "response.input_tokens"; + } + res->ok(response); + return res; +} diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 73caff54a46..72a1f40e014 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -110,8 +110,10 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t 
post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_chat_completions_tok; server_http_context::handler_t post_control; server_http_context::handler_t post_responses_oai; + server_http_context::handler_t post_responses_tok_oai; server_http_context::handler_t post_transcriptions_oai; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; @@ -139,6 +141,7 @@ struct server_routes { std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot); std::unique_ptr handle_slots_erase(const server_http_req &, int id_slot); std::unique_ptr handle_embeddings_impl(const server_http_req & req, task_response_type res_type); + std::unique_ptr handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type); // using unique_ptr to allow late initialization of const std::unique_ptr meta; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 769e80a802f..a6ea749d0c3 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -161,6 +161,8 @@ int llama_server(int argc, char ** argv) { routes.post_tokenize = models_routes->proxy_post; routes.post_detokenize = models_routes->proxy_post; routes.post_apply_template = models_routes->proxy_post; + routes.post_chat_completions_tok = models_routes->proxy_post; + routes.post_responses_tok_oai = models_routes->proxy_post; routes.get_lora_adapters = models_routes->proxy_get; routes.post_lora_adapters = models_routes->proxy_post; routes.get_slots = models_routes->proxy_get; @@ -192,7 +194,6 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API - 
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting ctx_http.post("/infill", ex_wrapper(routes.post_infill)); ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); @@ -204,6 +205,12 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); + // token counting + ctx_http.post("/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok)); + ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok)); + ctx_http.post("/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai)); + ctx_http.post("/v1/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai)); + ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting // LoRA adapters hotswap ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index f80e46133c7..fe55dc5ab17 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices(): for choice in res.body["choices"]: assert "assistant" == choice["message"]["role"] assert choice["finish_reason"] == "length" + + +def test_chat_completions_token_count(): + global server + server.start() + # make sure cache can be reused across multiple choices and multiple requests + # ref: https://github.com/ggml-org/llama.cpp/pull/18663 + for _ in range(2): + res = server.make_request("POST", 
"/chat/completions/input_tokens", data={ + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + }) + assert res.status_code == 200 + assert res.body["input_tokens"] > 5 diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py index fb77084c89b..d74cc3a43ed 100644 --- a/tools/server/tests/unit/test_vision_api.py +++ b/tools/server/tests/unit/test_vision_api.py @@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content): assert res.status_code != 200 +def test_vision_chat_completion_token_count(): + global server + server.start() + res = server.make_request("POST", "/chat/completions/input_tokens", data={ + "temperature": 0.0, + "top_k": 1, + "messages": [ + {"role": "user", "content": [ + {"type": "text", "text": "What is this:"}, + {"type": "image_url", "image_url": { + "url": get_img_url("IMG_URL_0"), + }}, + ]}, + ], + }) + assert res.status_code == 200 + assert res.body["input_tokens"] > 10 + + @pytest.mark.parametrize( "prompt, image_data, success, re_content", [ From 588f0dc2ce844f469797b5870e7876ddac654f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 6 Jun 2026 11:24:27 +0200 Subject: [PATCH 36/71] completion : fix format specifier in LOG_INF (#24213) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- tools/completion/completion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 6d2dcb56b2f..2a957963f81 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -989,7 +989,7 @@ int llama_completion(int argc, char ** argv) { LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); 
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG_INF("saved final session to %s, n_tokens = %ld\n", path_session.data(), session_tokens.size()); + LOG_INF("saved final session to %s, n_tokens = %zu\n", path_session.data(), session_tokens.size()); } From 6b80c74f285390368b3c99c5e750f19e9b096e98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 6 Jun 2026 12:16:16 +0200 Subject: [PATCH 37/71] completion : remove useless statics (#24226) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- tools/completion/completion.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 2a957963f81..6747558fc54 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -33,12 +33,8 @@ #endif static llama_context ** g_ctx; -static llama_model ** g_model; static common_sampler ** g_smpl; static common_params * g_params; -static std::vector * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; @@ -136,7 +132,6 @@ int llama_completion(int argc, char ** argv) { llama_context * ctx = nullptr; common_sampler * smpl = nullptr; - g_model = &model; g_ctx = &ctx; g_smpl = &smpl; @@ -549,9 +544,9 @@ int llama_completion(int argc, char ** argv) { int n_consumed = 0; int n_session_consumed = 0; - std::vector input_tokens; g_input_tokens = &input_tokens; - std::vector output_tokens; g_output_tokens = &output_tokens; - std::ostringstream output_ss; g_output_ss = &output_ss; + std::vector input_tokens; + std::vector output_tokens; + std::ostringstream output_ss; std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode // the first thing we will do is to output the 
prompt, so set color accordingly From 31e82494c0a3913c919c1027fa70500fbf4c07dd Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 6 Jun 2026 21:17:25 +0200 Subject: [PATCH 38/71] mtmd: support "frame merge" for qwen-vl-based models (#21858) * feat: add video support for Qwen3.5 * various clean up * revise the design * fix llava-uhd case * nits * nits 2 --------- Co-authored-by: andrewmd5 <1297077+andrewmd5@users.noreply.github.com> --- tools/mtmd/clip-graph.h | 3 + tools/mtmd/clip-impl.h | 19 +++-- tools/mtmd/clip.cpp | 43 +++++++---- tools/mtmd/clip.h | 8 ++ tools/mtmd/models/models.h | 5 +- tools/mtmd/models/qwen2vl.cpp | 38 +++++++-- tools/mtmd/models/qwen3vl.cpp | 11 +-- tools/mtmd/mtmd-image.cpp | 2 +- tools/mtmd/mtmd.cpp | 140 ++++++++++++++++++++++++++-------- tools/mtmd/mtmd.h | 2 + 10 files changed, 197 insertions(+), 74 deletions(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 1d9f6a136a9..7d10586217b 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -37,6 +37,9 @@ struct clip_graph { float kq_scale; // TODO: maybe move this to hparams const clip_flash_attn_type flash_attn_type; + // TODO [QWEN_VIDEO]: improve this in the future + int n_batch = 1; + ggml_context_ptr ctx0_ptr; ggml_context * ctx0; ggml_cgraph * gf; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 794cb4d2b27..b104f373618 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -480,10 +480,6 @@ struct clip_image_u8 { buf[idx + 2] = rgb[2]; } - size_t n_pixels() const { - return (size_t) nx * (size_t) ny; - } - size_t n_elements() const { return n_pixels() * 3; } @@ -492,10 +488,16 @@ struct clip_image_u8 { std::vector buf; int nx = 0; int ny = 0; + + size_t n_pixels() const { + return (size_t) nx * (size_t) ny; + } }; // For images, buf.size() == nx*ny*3 // Memory layout: RGBRGBRGB... +// For seq, buf.size() == nx*ny*3*nt +// Memory layout: RGBRGB...RGBRGB... 
(nt times) // For audio, only one channel is used, buf.size() == nx*ny // nx will be n_frames and ny will be n_mel struct clip_image_f32 { @@ -544,10 +546,6 @@ struct clip_image_f32 { } } - size_t n_pixels() const { - return (size_t) nx_ * (size_t) ny_; - } - size_t n_elements() const { return n_pixels() * 3; } @@ -580,6 +578,10 @@ struct clip_image_f32 { std::vector buf; int nx_ = 0; int ny_ = 0; + + size_t n_pixels() const { + return (size_t) nx_ * (size_t) ny_; + } }; // @@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, .. va_end(args); } +#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 6e54524da02..bd33f430625 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() { } ggml_tensor * clip_graph::build_inp_raw(int channels) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels); + ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); return inp_raw; @@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { - GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); - const clip_image_f32 & img = *imgs.entries[0]; std::unique_ptr builder; @@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 GGML_ABORT("missing cgraph builder"); } + // TODO [QWEN_VIDEO]: improve this in the future + builder->n_batch = 
imgs.entries.size(); + return builder->build(); } @@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; - int batch_size = imgs.entries.size(); + int n_batch_cur = imgs.entries.size(); + + // maximum supported batch size, usually == 2 for qwen-vl-based models + int n_batch_max = clip_model_n_batch_max(ctx); // TODO @ngxson : implement batch size > 1 as a loop // we don't need true batching support because the cgraph will gonna be big anyway - if (batch_size != 1) { - return false; // only support batch size of 1 + if (n_batch_cur > n_batch_max) { + return false; } // if buffers are not allocated, we need to do a warmup run to allocate them @@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // └─────┘ │ // ──────┘ x B - for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx(); - const int ny = imgs.entries[i]->ny(); - const int n = nx * ny; + // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models + // All entries must have the same spatial size (enforced by can_batch_with() during merging) + { + const int nx = imgs.entries[0]->nx(); + const int ny = imgs.entries[0]->ny(); + const int n = nx * ny; - for (int b = 0; b < batch_size; b++) { + for (int b = 0; b < n_batch_cur; b++) { const auto & buf = imgs.entries[b]->get_ro_buf(); float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - size_t base_src = 3*(y * nx + x); // idx of the first channel - size_t base_dst = y * nx + x; // idx of the first channel + size_t base_src = 3*(y * nx + x); + size_t base_dst = y * nx + x; batch_entry[ base_dst] = buf[base_src ]; batch_entry[1*n + base_dst] = buf[base_src + 1]; 
batch_entry[2*n + base_dst] = buf[base_src + 2]; @@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } +int clip_model_n_batch_max(const struct clip_ctx * ctx) { + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + return 2; + default: + return 1; + } +} + // // API used internally with mtmd // diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index ba5b6197701..18c7a1d1a7c 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -20,6 +20,12 @@ struct clip_image_size { bool operator==(const clip_image_size & other) const { return width == other.width && height == other.height; } + bool operator!=(const clip_image_size & other) const { + return !(*this == other); + } + int area() const { + return width * height; + } }; struct clip_image_f32; @@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx); bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); +int clip_model_n_batch_max(const struct clip_ctx * ctx); + std::map clip_get_mem_usage(const struct clip_ctx * ctx); struct clip_cap { diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index d1865103bcb..12082a5280a 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph { struct clip_graph_qwen2vl : clip_graph { clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + ggml_tensor * build_inp_with_temporal_merge(); }; -struct clip_graph_qwen3vl : clip_graph { - clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} +struct clip_graph_qwen3vl : clip_graph_qwen2vl { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {} ggml_cgraph * build() override; }; diff 
--git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp index b196587373a..2220c2692a1 100644 --- a/tools/mtmd/models/qwen2vl.cpp +++ b/tools/mtmd/models/qwen2vl.cpp @@ -1,5 +1,34 @@ #include "models.h" +ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() { + ggml_tensor * inp_raw = build_inp_raw(); + + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); + + const size_t nb1 = ggml_row_size(inp_raw->type, img.nx()); + const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny()); + + if (n_batch == 1) { + // still image input + return ggml_add(ctx0, + ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1), + ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1)); + } else if (n_batch == 2) { + // 2 frames input (video input) + ggml_tensor * inp_0 = ggml_view_3d(ctx0, inp_raw, + img.nx(), img.ny(), 3, nb1, nb2, 0); + ggml_tensor * inp_1 = ggml_view_3d(ctx0, inp_raw, + img.nx(), img.ny(), 3, nb1, nb2, + nb2 * 3); // move to the second frame + return ggml_add(ctx0, + ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1), + ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1)); + } else { + GGML_ASSERT(false && "n_batch > 2 is not supported"); + } +} + ggml_cgraph * clip_graph_qwen2vl::build() { GGML_ASSERT(model.patch_bias == nullptr); GGML_ASSERT(model.class_embedding == nullptr); @@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() { int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx() % (patch_size * 2) == 0); - GGML_ASSERT(img.ny() % (patch_size * 2) == 0); + ggml_tensor * inp = build_inp_with_temporal_merge(); // second conv dimension { - auto inp_1 
= ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] inp = ggml_cont_4d( ctx0, inp, diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 9968933ed6c..261e77a198a 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() { int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + ggml_tensor * inp = build_inp_with_temporal_merge(); - GGML_ASSERT(img.nx() % (patch_size * 2) == 0); - GGML_ASSERT(img.ny() % (patch_size * 2) == 0); - - // second conv dimension + // spatial merge { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] inp = ggml_cont_4d( ctx0, inp, diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index c86a065c814..bedf44e07cf 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - const int64_t orig_area = static_cast(img.n_pixels()); + const int64_t orig_area = static_cast(img.get_size().area()); size_t mode_i = 0; int64_t min_diff = std::numeric_limits::max(); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index e1f8e2a3359..c93fb1e0a4a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -24,10 +24,11 @@ #include #include -// represents raw image data, layout 
is RGBRGBRGB... -// length of data must be nx * ny * 3 +// for still image data, layout is RGBRGBRGB... +// length of data must be nx * ny * 3 bytes +// // for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ... -// length of data must be nx * sizeof(float) +// length of data must be nx * sizeof(float) bytes struct mtmd_bitmap { uint32_t nx = 0; uint32_t ny = 0; @@ -35,7 +36,7 @@ struct mtmd_bitmap { bool is_audio = false; // true if the bitmap is audio mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny) - : nx(nx), ny(ny) { + : nx(nx), ny(ny), is_audio(false) { if (data) { size_t data_size = (size_t)nx * ny * 3; this->data.resize(data_size); @@ -64,6 +65,11 @@ struct mtmd_bitmap { return data.size(); } + bool can_batch_with(const mtmd_bitmap & other) const { + // [QWEN_VIDEO] can batch if both are images with same size + return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny; + } + private: std::vector data; }; @@ -750,16 +756,55 @@ struct mtmd_tokenizer { cur.entries.clear(); std::vector parts = split_text(input_text, ctx->media_marker); size_t i_bm = 0; // index of the current bitmap + + // [QWEN_VIDEO] handle frame merging for models that support it (i.e. 
qwen-vl) + int n_merge_frames = 1; + if (ctx->ctx_v) { + n_merge_frames = clip_model_n_batch_max(ctx->ctx_v); + GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more"); + } + + std::vector> merged_bitmaps; + if (n_merge_frames > 1) { + size_t i_bm_scan = 0; + for (size_t i = 0; i < parts.size(); ++i) { + if (parts[i] != ctx->media_marker) { + continue; + } + if (i + 1 < parts.size() + && parts[i + 1] == ctx->media_marker + && i_bm_scan + 1 < bitmaps.size()) { + const mtmd_bitmap * bm_a = bitmaps[i_bm_scan]; + const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1]; + if (bm_a->can_batch_with(*bm_b)) { + LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1); + merged_bitmaps.push_back({bm_a, bm_b}); + parts.erase(parts.begin() + i + 1); // remove the second marker + i_bm_scan += 2; + continue; + } + } + LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan); + merged_bitmaps.push_back({bitmaps[i_bm_scan]}); + ++i_bm_scan; + } + } else { + for (size_t i = 0; i < bitmaps.size(); ++i) { + merged_bitmaps.push_back({bitmaps[i]}); + } + } + + i_bm = 0; for (auto & part : parts) { if (part == ctx->media_marker) { // this is a marker, we should add the next bitmap - if (i_bm >= bitmaps.size()) { + if (i_bm >= merged_bitmaps.size()) { LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n", - __func__, bitmaps.size(), parts.size() - 1); + __func__, merged_bitmaps.size(), parts.size() - 1); return 1; } - const mtmd_bitmap * bitmap = bitmaps[i_bm++]; - int32_t res = add_media(bitmap); + auto & bmps = merged_bitmaps[i_bm++]; + int32_t res = add_media(bmps); if (res != 0) { return res; } @@ -794,9 +839,9 @@ struct mtmd_tokenizer { } } - if (i_bm != bitmaps.size()) { + if (i_bm != merged_bitmaps.size()) { LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n", - __func__, bitmaps.size(), 
parts.size() - 1); + __func__, merged_bitmaps.size(), parts.size() - 1); return 1; } @@ -835,8 +880,10 @@ struct mtmd_tokenizer { } } - int32_t add_media(const mtmd_bitmap * bitmap) { - if (!bitmap->is_audio) { + int32_t add_media(std::vector & bitmaps) { + GGML_ASSERT(!bitmaps.empty()); + + if (!bitmaps[0]->is_audio) { // handle image if (!ctx->ctx_v) { @@ -848,27 +895,44 @@ struct mtmd_tokenizer { add_text(ctx->img_beg, true); // add image begin token } - // sanity check - if (bitmap->nx <= 0 || bitmap->ny <= 0) { - LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n", - __func__, bitmap->nx, bitmap->ny); - return 2; - } - GGML_ASSERT(ctx->image_preproc != nullptr); - - // convert mtmd_bitmap to clip_image_u8 - clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->set_size( - {(int)bitmap->nx, (int)bitmap->ny}, - bitmap->is_placeholder()); - img_u8->cpy_buf(bitmap->get_ro_buf()); + // TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. 
by changing the preprocessor interface always take single input) - // preprocess image clip_image_f32_batch batch_f32; - bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32); - if (!ok) { - LOG_ERR("Unable to preprocess image\n"); - return 2; + + for (const auto * bmp : bitmaps) { + // sanity check + GGML_ASSERT(!bmp->is_audio); + GGML_ASSERT(ctx->image_preproc != nullptr); + if (bmp->nx <= 0 || bmp->ny <= 0) { + LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n", + __func__, bmp->nx, bmp->ny); + return 2; + } + + // convert mtmd_bitmap to clip_image_u8 + clip_image_u8_ptr img_u8(clip_image_u8_init()); + img_u8->set_size( + {(int)bmp->nx, (int)bmp->ny}, + bmp->is_placeholder()); + img_u8->cpy_buf(bmp->get_ro_buf()); + + // preprocess image + clip_image_f32_batch tmp_batch; + bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch); + if (!ok) { + LOG_ERR("Unable to preprocess image\n"); + return 2; + } + + // move entries and grid dimensions to the "global" batch_f32 + for (auto & entry : tmp_batch.entries) { + batch_f32.entries.emplace_back(std::move(entry)); + } + + // for llava-uhd style, we need to handle grid too + // we don't care about overwriting these values for now because llama-uhd doesn't support batching anyway + batch_f32.grid_x = tmp_batch.grid_x; + batch_f32.grid_y = tmp_batch.grid_y; } // Annotate llava-next style tiles so clip_n_output_tokens accounts @@ -896,11 +960,14 @@ struct mtmd_tokenizer { || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid) ) { + // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now + GGML_ASSERT(bitmaps.size() == 1); + const int n_col = batch_f32.grid_x; const int n_row = batch_f32.grid_y; // split batch into chunks of single images // NOTE: batch_f32 will be invalidated after this call - auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id); + auto chunks = 
split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id); GGML_ASSERT(chunks.size() > 0); auto ov_chunk = std::move(chunks.front()); @@ -954,6 +1021,10 @@ struct mtmd_tokenizer { size_t n_tokens = 0; for (const auto & e : batch_f32.entries) { n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); + if (clip_model_n_batch_max(ctx->ctx_v) == 2) { + // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image + break; + } } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); @@ -976,7 +1047,7 @@ struct mtmd_tokenizer { GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens()); } image_tokens->batch_f32 = std::move(batch_f32); - image_tokens->id = bitmap->id; // optional + image_tokens->id = bitmaps[0]->id; // optional LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); @@ -1001,6 +1072,9 @@ struct mtmd_tokenizer { } else { // handle audio + GGML_ASSERT(bitmaps.size() == 1); // no batching support for now + auto & bitmap = bitmaps[0]; + if (!ctx->ctx_a) { LOG_ERR("%s: error: model does not support audio input\n", __func__); return 2; diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index b3154c8d55d..128fb18261b 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -133,6 +133,8 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); // if bitmap is image: // length of data must be nx * ny * 3 // the data is in RGBRGBRGB... format +// note: some video-capable models (i.e. 
qwen-vl) can merge consecutive bitmaps +// into one chunk, mtmd_tokenize() will automatically handle this // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) From 98d5e8ba8a2642710c9871d05ac1033a3328b884 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sat, 6 Jun 2026 22:39:21 +0200 Subject: [PATCH 39/71] common/chat : fix LFM2/LFM2.5 reasoning round-trip and leak (#24234) * common/chat : fix LFM2 reasoning round-trip and stray leak * Gate by reasoning format and whether the template supports --- common/chat.cpp | 19 +- models/templates/LFM2.5-8B-A1B.jinja | 115 ++++++++++ tests/test-chat.cpp | 308 ++++++++++++--------------- 3 files changed, 263 insertions(+), 179 deletions(-) create mode 100644 models/templates/LFM2.5-8B-A1B.jinja diff --git a/common/chat.cpp b/common/chat.cpp index b8f248dab4e..24e58ab0640 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1625,8 +1625,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat const std::string THINK_END = ""; const std::string GEN_PROMPT = "<|im_start|>assistant\n"; - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); + // Copy reasoning to the "thinking" field the template expects + auto adjusted_messages = json::array(); + for (auto msg : inputs.messages) { + if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) { + msg["thinking"] = msg.at("reasoning_content"); + } + adjusted_messages.push_back(msg); + } + + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END }; @@ -1639,7 +1648,9 @@ static 
common_chat_params common_chat_params_init_lfm2(const common_chat_templat data.thinking_end_tag = THINK_END; auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); - auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; + // Gate by reasoning format and whether the template supports + auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE && + tmpl.source().find(THINK_START) != std::string::npos; auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; if (inputs.has_continuation()) { @@ -1658,7 +1669,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat auto end = p.end(); auto reasoning = p.eps(); - if (extract_reasoning && inputs.enable_thinking) { + if (extract_reasoning) { reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END); } diff --git a/models/templates/LFM2.5-8B-A1B.jinja b/models/templates/LFM2.5-8B-A1B.jinja new file mode 100644 index 00000000000..8bca4a545e9 --- /dev/null +++ b/models/templates/LFM2.5-8B-A1B.jinja @@ -0,0 +1,115 @@ +{{- bos_token -}} +{%- set preserve_thinking = preserve_thinking | default(false) -%} + +{%- macro format_arg_value(arg_value) -%} + {%- if arg_value is string -%} + {{- "'" + arg_value + "'" -}} + {%- elif arg_value is mapping -%} + {{- arg_value | tojson -}} + {%- else -%} + {{- arg_value | string -}} + {%- endif -%} +{%- endmacro -%} + +{%- macro parse_content(content) -%} + {%- if content is string -%} + {{- content -}} + {%- else -%} + {%- set _ns = namespace(result="") -%} + {%- for item in content -%} + {%- if item["type"] == "image" -%} + {%- set _ns.result = _ns.result + "" -%} + {%- elif item["type"] == "text" -%} + {%- set _ns.result = _ns.result + item["text"] -%} + {%- else -%} + {%- set _ns.result = _ns.result + item | tojson -%} + {%- endif -%} + {%- endfor -%} + {{- _ns.result -}} + {%- endif -%} +{%- endmacro -%} + +{%- macro 
render_tool_calls(tool_calls) -%} + {%- set tool_calls_ns = namespace(tool_calls=[]) -%} + {%- for tool_call in tool_calls -%} + {%- set func_name = tool_call["function"]["name"] -%} + {%- set func_args = tool_call["function"]["arguments"] -%} + {%- set args_ns = namespace(arg_strings=[]) -%} + {%- for arg_name, arg_value in func_args.items() -%} + {%- set args_ns.arg_strings = args_ns.arg_strings + [arg_name + "=" + format_arg_value(arg_value)] -%} + {%- endfor -%} + {%- set tool_calls_ns.tool_calls = tool_calls_ns.tool_calls + [func_name + "(" + (args_ns.arg_strings | join(", ")) + ")"] -%} + {%- endfor -%} + {{- "<|tool_call_start|>[" + (tool_calls_ns.tool_calls | join(", ")) + "]<|tool_call_end|>" -}} +{%- endmacro -%} + +{%- set ns = namespace(system_prompt="", last_user_index=-1) -%} +{%- if messages[0]["role"] == "system" -%} + {%- if messages[0].get("content") -%} + {%- set ns.system_prompt = parse_content(messages[0]["content"]) -%} + {%- endif -%} + {%- set messages = messages[1:] -%} +{%- endif -%} +{%- if tools -%} + {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%} + {%- for tool in tools -%} + {%- if tool is not string -%} + {%- set tool = tool | tojson -%} + {%- endif -%} + {%- set ns.system_prompt = ns.system_prompt + tool -%} + {%- if not loop.last -%} + {%- set ns.system_prompt = ns.system_prompt + ", " -%} + {%- endif -%} + {%- endfor -%} + {%- set ns.system_prompt = ns.system_prompt + "]" -%} +{%- endif -%} +{%- if ns.system_prompt -%} + {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}} +{%- endif -%} +{%- for message in messages -%} + {%- if message["role"] == "user" -%} + {%- set ns.last_user_index = loop.index0 -%} + {%- endif -%} +{%- endfor -%} +{%- for message in messages -%} + {{- "<|im_start|>" + message.role + "\n" -}} + {%- if message.role == "assistant" -%} + {%- generation -%} + {%- if message.thinking is defined and (preserve_thinking or loop.index0 > 
ns.last_user_index) -%} + {{- "" + message.thinking + "" -}} + {%- endif -%} + {%- set _cfm_tag = "CONTINUE_FINAL_MESSAGE_TAG " -%} + {%- set _has_cfm = false -%} + {%- if message.content is defined -%} + {%- set content = parse_content(message.content) -%} + {%- if not (preserve_thinking or loop.index0 > ns.last_user_index) -%} + {%- if "" in content -%} + {%- set content = content.split("")[-1] | trim -%} + {%- endif -%} + {%- endif -%} + {%- if message.tool_calls is defined and content.endswith(_cfm_tag) -%} + {%- set _has_cfm = true -%} + {%- set _trunc_len = (content | length) - (_cfm_tag | length) -%} + {{- content[:_trunc_len] -}} + {%- else -%} + {{- content -}} + {%- endif -%} + {%- endif -%} + {%- if message.tool_calls is defined -%} + {{- render_tool_calls(message.tool_calls) -}} + {%- endif -%} + {%- if _has_cfm -%} + {{- _cfm_tag -}} + {%- endif -%} + {{- "<|im_end|>\n" -}} + {%- endgeneration -%} + {%- else %} + {%- if message.get("content") -%} + {{- parse_content(message["content"]) -}} + {%- endif -%} + {{- "<|im_end|>\n" -}} + {%- endif %} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- "<|im_start|>assistant\n" -}} +{%- endif -%} \ No newline at end of file diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 3107045b4fc..c1be9eb5a99 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1825,6 +1825,104 @@ static void test_convert_responses_to_chatcmpl() { } } +// Shared LFM2 parser cases - all variants use one output format and parser +static void test_lfm2_parser(const std::string & template_path, bool detailed_debug) { + auto tst = peg_tester(template_path, detailed_debug); + + // Basic content only + tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); + + // Single tool call without reasoning + tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") + .tools({ special_function_tool }) + .expect(message_assist_call) + .run(); + + // Tool call with string argument + 
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>") + .tools({ get_time_tool }) + .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) + .run(); + + // Python literals become JSON + tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>") + .tools({ toggle_tool }) + .expect(message_with_tool_calls("toggle", R"({"enabled": true})")) + .run(); + + tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>") + .tools({ nullable_tool }) + .expect(message_with_tool_calls("set_nullable", R"({"value": null})")) + .run(); + + // Nested Python literal + tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>") + .tools({ config_tool }) + .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})")) + .run(); + + // JSON literals are accepted too + tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>") + .tools({ config_tool }) + .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})")) + .run(); + + // Dotted function name with structured args + tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], " + "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>") + .tools({ calendar_create_event_tool }) + .expect(message_with_tool_calls( + "Calendar.create_event", + R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})")) + .run(); + + // Markdown links stay content + tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).") + .tools({ get_time_tool }) + .expect(simple_assist_msg("Use this format: [link text](url). 
Example: [Wikipedia](https://www.wikipedia.org).")) + .run(); + + // Python tool with multiline code in string + tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>") + .tools({ python_tool }) + .expect_tool_calls({ + { "python", R"#({"code": "def hello():\\n print('hey')"})#", "" } + }) + .run(); + + // Content before tool call (no reasoning) + tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>") + .tools({ get_time_tool }) + .expect(message_with_reasoning_content_and_multiple_tool_calls( + "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } } + )) + .run(); + + // Multiple tool calls (parallel) + tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>") + .parallel_tool_calls(true) + .tools({ special_function_tool, special_function_tool_with_optional_param }) + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", {} }, + { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, + }) + .run(); + + // Partial tool call (streaming) + tst.test("<|tool_call_start|>[special_function(arg1=") + .tools({ special_function_tool }) + .is_partial(true) + .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) + .run(); + + // Tool call with empty arguments + tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>") + .tools({ empty_args_tool }) + .expect(simple_assist_msg("", "", "empty_args", "{}")) + .run(); + +} + static void test_template_output_peg_parsers(bool detailed_debug) { LOG_DBG("%s\n", __func__); @@ -4038,49 +4136,30 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); } - // LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|> - { - auto tst = peg_tester("models/templates/LFM2-8B-A1B.jinja", detailed_debug); - - // Basic content only - tst.test("Hello, world!\nWhat's 
up?").expect(message_assist).run(); + for (const char * tmpl : { + "models/templates/LFM2-8B-A1B.jinja", + "models/templates/LFM2.5-Instruct.jinja", + "models/templates/LFM2.5-8B-A1B.jinja", + }) { + test_lfm2_parser(tmpl, detailed_debug); + } - // Single tool call without reasoning - tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .tools({ special_function_tool }) - .expect(message_assist_call) - .run(); + // Thinking cases only apply to LFM2.5-8B-A1B, the one LFM2 template that emits + { + auto tst = peg_tester("models/templates/LFM2.5-8B-A1B.jinja", detailed_debug); - // Tool call with string argument - tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>") - .tools({ get_time_tool }) - .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) - .run(); + // Reasoning is parsed independent of enable_thinking - // Tool call with reasoning (enable_thinking=true) + // Tool call with reasoning tst.test("I'm\nthinking<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) .expect(message_assist_call_thoughts) .run(); - // Multiple tool calls (parallel) - tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>") - .parallel_tool_calls(true) - .tools({ - special_function_tool, special_function_tool_with_optional_param - }) - .expect_tool_calls({ - { "special_function", R"({"arg1": 1})", {} }, - { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, - }) - .run(); - // Tool call with reasoning and content tst.test("I need to call a function" "Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>") - .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ get_time_tool }) .expect(message_with_reasoning_content_and_multiple_tool_calls( @@ -4088,32 +4167,9 @@ static void 
test_template_output_peg_parsers(bool detailed_debug) { )) .run(); - // Python tool with multiline code in string - tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>") - .tools({ python_tool }) - .expect_tool_calls({ - { "python", R"#({"code": "def hello():\\n print('hey')"})#", "" } - }) - .run(); - - // Partial tool call (streaming) - tst.test("<|tool_call_start|>[special_function(arg1=") - .tools({ special_function_tool }) - .is_partial(true) - .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) - .run(); - - // Tool call with empty arguments - tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>") - .tools({ empty_args_tool }) - .expect(simple_assist_msg("", "", "empty_args", "{}")) - .run(); - - // fake tool call marker in reasoning - tst.test( - "Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm" - "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .enable_thinking(true) + // Fake tool call marker inside reasoning is not parsed as a call + tst.test("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm" + "<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) .expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm") @@ -4122,127 +4178,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); - // Continuation tests - tst.test("world!\nWhat's up?") - .reasoning_format(COMMON_REASONING_FORMAT_AUTO) - .enable_thinking(true) - .messages({ message_user, message_assist_prefill_content }) - .add_generation_prompt(false) - .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) - .expect_reasoning("I'm thinking") - .expect_content("Hello, world!\nWhat's up?") - .run(); - - tst.test(" thinkingHello, world!\nWhat's up?") + // enable_thinking=false 
still captures emitted reasoning + tst.test("I'm\nthinkingHello, world!\nWhat's up?") + .enable_thinking(false) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) - .enable_thinking(true) - .messages({ message_user, message_assist_prefill_reasoning }) - .add_generation_prompt(false) - .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) - .expect_reasoning("I'm thinking") - .expect_content("Hello, world!\nWhat's up?") - .run(); - } - - // LFM2.5 tests - format <|tool_call_start|>[name(args)]<|tool_call_end|> - { - auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug); - - // Basic content only - tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); - - // Single tool call without reasoning - tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .tools({ special_function_tool }) - .expect(message_assist_call) - .run(); - - // Tool call with string argument - tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>") - .tools({ get_time_tool }) - .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) - .run(); - - // Python literals become JSON. - tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>") - .tools({ toggle_tool }) - .expect(message_with_tool_calls("toggle", R"({"enabled": true})")) - .run(); - - tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>") - .tools({ nullable_tool }) - .expect(message_with_tool_calls("set_nullable", R"({"value": null})")) - .run(); - - // Nested Python literal. - tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>") - .tools({ config_tool }) - .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})")) - .run(); - - // JSON literals are accepted too. 
- tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>") - .tools({ config_tool }) - .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})")) - .run(); - - // Dotted function name with structured args. - tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], " - "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>") - .tools({ calendar_create_event_tool }) - .expect(message_with_tool_calls( - "Calendar.create_event", - R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})")) - .run(); - - // Markdown links stay content. - tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).") - .tools({ get_time_tool }) - .expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")) + .expect(message_assist_thoughts) .run(); - // Tool call with reasoning (enable_thinking=true) tst.test("I'm\nthinking<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>") - .enable_thinking(true) + .enable_thinking(false) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ special_function_tool }) .expect(message_assist_call_thoughts) .run(); - // Multiple tool calls (parallel) - tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>") - .parallel_tool_calls(true) - .tools({ - special_function_tool, special_function_tool_with_optional_param - }) - .expect_tool_calls({ - { "special_function", R"({"arg1": 1})", {} }, - { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, - }) - .run(); - - // Tool call with content before tool call - tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>") - .tools({ get_time_tool }) - 
.expect(message_with_reasoning_content_and_multiple_tool_calls( - "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } } - )) - .run(); - - // Partial tool call (streaming) - tst.test("<|tool_call_start|>[special_function(arg1=") - .tools({ special_function_tool }) - .is_partial(true) - .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) - .run(); - - // Tool call with empty arguments - tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>") - .tools({ empty_args_tool }) - .expect(simple_assist_msg("", "", "empty_args", "{}")) - .run(); - - // Continuation tests + // Continuation: prefill content tst.test("world!\nWhat's up?") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .enable_thinking(true) @@ -4253,6 +4203,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_content("Hello, world!\nWhat's up?") .run(); + // Continuation: prefill reasoning tst.test(" thinkingHello, world!\nWhat's up?") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .enable_thinking(true) @@ -5478,18 +5429,25 @@ static void test_template_generation_prompt() { check(tmpls, continuation_reasoning(), "<|im_assistant|>assistant<|im_middle|>I'm"); } - { - auto tmpls = read_templates("models/templates/LFM2-8B-A1B.jinja"); + for (const char * tmpl : { + "models/templates/LFM2-8B-A1B.jinja", + "models/templates/LFM2.5-Instruct.jinja", + "models/templates/LFM2.5-8B-A1B.jinja", + }) { + auto tmpls = read_templates(tmpl); check(tmpls, basic(), "<|im_start|>assistant\n"); check(tmpls, continuation_content(), "<|im_start|>assistant\nI'm thinkingHello, "); check(tmpls, continuation_reasoning(), "<|im_start|>assistant\nI'm"); } { - auto tmpls = read_templates("models/templates/LFM2.5-Instruct.jinja"); - check(tmpls, basic(), "<|im_start|>assistant\n"); - check(tmpls, continuation_content(), "<|im_start|>assistant\nI'm thinkingHello, "); - check(tmpls, continuation_reasoning(), "<|im_start|>assistant\nI'm"); + // 8B-A1B renders prior-turn 
reasoning via the "thinking" field + auto tmpls = read_templates("models/templates/LFM2.5-8B-A1B.jinja"); + common_chat_templates_inputs inputs; + inputs.messages = { message_user, message_assist_call_thoughts, tool_msg }; + inputs.add_generation_prompt = true; + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_contains(params.prompt, "I'm\nthinking"); } { From 3f7c79d7b5bb4c0e5af3b9359078079441216e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 7 Jun 2026 08:31:58 +0200 Subject: [PATCH 40/71] docker : bump cuda13 to 13.3.0 (#24228) --- .github/workflows/docker.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6f1f2721e45..8195a55ff28 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -82,8 +82,8 @@ jobs: { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" }, { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, - { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, - { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, + { "tag": "cuda13", "dockerfile": 
".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, + { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }, From f71af352a52b8efe824c7a698d0632afa4794c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 7 Jun 2026 08:43:05 +0200 Subject: [PATCH 41/71] convert : fix Gemma4 with no audio encoder (#24242) --- conversion/gemma.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/conversion/gemma.py b/conversion/gemma.py index 379876629fb..1258428b046 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -812,10 +812,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) # audio params - assert self.hparams_audio is not None - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6)) + if self.has_audio_encoder: + assert self.hparams_audio is not None + 
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6)) def is_audio_tensor(self, name: str) -> bool: return "audio_tower" in name or "embed_audio" in name From 465b1f0e75c590426cff3ca998bcd25297071a5b Mon Sep 17 00:00:00 2001 From: konradmb Date: Sun, 7 Jun 2026 11:18:44 +0200 Subject: [PATCH 42/71] arg: Skip mmproj download when user supplied mmproj (#24239) --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1ffaf704858..a859aac4fe2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -444,7 +444,7 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) opts.offline = params.offline; opts.skip_download = params.skip_download; opts.download_mtp = spec_type_draft_mtp; - opts.download_mmproj = !params.no_mmproj; + opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty(); // sub-models (draft, mmproj, vocoder) are explicitly specified by the user, // so we should not auto-discover mtp/mmproj siblings for them From 94246d1b9d981d8ab5f33eb2b890bcf3f65f27a6 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 7 Jun 2026 13:45:35 +0200 Subject: [PATCH 43/71] chore(sync): adapt DFlash to hparams.n_layer() method post-#24060 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/models/dflash.cpp had three direct uses of `hparams.n_layer`. The upstream hparams refactor (ggml-org/llama.cpp#24060) turned that into a method `n_layer()` (effective count, excludes nextn layers). DFlash drafter has no nextn layers, so `n_layer()` and the raw field `n_layer_all` are numerically equal — picked `n_layer()` to match the new accessor convention. Behavior-preserving. 
--- src/models/dflash.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/dflash.cpp b/src/models/dflash.cpp index bee180e1c15..5d14884cfbc 100644 --- a/src/models/dflash.cpp +++ b/src/models/dflash.cpp @@ -238,13 +238,13 @@ void llama_model_dflash::load_arch_hparams(llama_model_loader & ml) { // to receive the BOOL/INT array. Filled with 0 by default so unset slots are dense. std::array pattern{}; if (ml.get_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern, false)) { - const uint32_t n = std::min(pattern.size(), hparams.n_layer); + const uint32_t n = std::min(pattern.size(), hparams.n_layer()); for (uint32_t il = 0; il < n; ++il) { hparams.is_swa_impl[il] = pattern[il] != 0 ? 1u : 0u; } } else { // No per-layer pattern: assume all layers are SWA. - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_swa_impl[il] = 1u; } } @@ -272,7 +272,7 @@ void llama_model_dflash::load_arch_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); // Layers for draft generation - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { auto & layer = layers[il]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), {n_embd}, 0); From 9f003edc7b85944c46958cdfeb0efb947c4b3a39 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 19 May 2026 20:18:00 +0800 Subject: [PATCH 44/71] llama: Gemma 4 MTP --- common/speculative.cpp | 76 +++++------ conversion/__init__.py | 1 + conversion/gemma.py | 10 ++ gguf-py/gguf/constants.py | 24 ++++ gguf-py/gguf/tensor_mapping.py | 8 ++ src/llama-arch.cpp | 5 + src/llama-arch.h | 3 + src/llama-context.cpp | 84 ++++++++++-- src/llama-context.h | 8 ++ src/llama-ext.h | 6 + src/llama-graph.cpp | 71 ++++++++++ src/llama-graph.h | 56 ++++++++ src/llama-kv-cache.cpp | 4 + src/llama-kv-cache.h | 5 + src/llama-model.cpp | 7 + 
src/llama-model.h | 4 + src/models/gemma4.cpp | 224 +++++++++++++++++++++++++++++++- src/models/models.h | 13 ++ tools/server/server-context.cpp | 10 ++ 19 files changed, 571 insertions(+), 48 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 3f25c0eb57d..f452ad3ca68 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -419,6 +419,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { int32_t n_embd = 0; + bool kv_shared_with_target = false; + // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. // The last h-row of one process() call needs the first token of the NEXT // call to pair with, so it's stashed here until that next call fires. @@ -445,7 +447,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set"); - n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft)); + GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) && + "MTP input row width must match the target h_nextn width"); LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); @@ -490,6 +494,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); + llama_set_mtp_source(ctx_dft, ctx_tgt); + + kv_shared_with_target = llama_model_n_layer_kv(llama_get_model(ctx_dft)) == 0; pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); @@ -527,9 +534,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { if (N <= 0) { return; } + 
auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); - if (pos_max < N - 1) { + if (pos_max < N - 1 && !kv_shared_with_target) { LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). " @@ -572,48 +580,42 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { const size_t row_bytes = (size_t) n_embd * sizeof(float); - common_batch_clear(batch); + // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode + if (!kv_shared_with_target) { + common_batch_clear(batch); - for (int k = 0; k < n_tokens; ++k) { - common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); - } + for (int k = 0; k < n_tokens; ++k) { + common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); + } - // shift the tgt embeddings to the right by one position - // assumes that the tokens in the batch are sequential for each sequence - // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] - // ^--- this is a problem - // TODO:this is generally true, but would be nice to assert it - { - const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); - std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + // shift the tgt embeddings to the right by one position + // assumes that the tokens in the batch are sequential for each sequence + // i.e. 
we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] + // ^--- this is a problem + // TODO:this is generally true, but would be nice to assert it + { + const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); + std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + } - //{ - // // string with seq_ids in the batch - // std::stringstream ss; - // for (int i = 0; i < n_tokens; ++i) { - // ss << batch_in.seq_id[i][0] << ","; - // } - // LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str()); - //} - } + // fill the pending embeddings from a previous run + auto set_h = [&](int idx, const float * h_row) { + std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); + }; - // fill the pending embeddings from a previous run - auto set_h = [&](int idx, const float * h_row) { - std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); - }; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } - for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { - if (i_batch_beg[seq_id] < 0) { - continue; + set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); } - set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); - } - - const int32_t rc = llama_decode(ctx_dft, batch); - if (rc != 0) { - LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); - return false; + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); + return false; + } } for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { diff --git a/conversion/__init__.py b/conversion/__init__.py index c670798fc2b..87d2f80550d 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -75,6 +75,7 @@ "Gemma3TextModel": "gemma", "Gemma3nForCausalLM": "gemma", "Gemma3nForConditionalGeneration": 
"gemma", + "Gemma4AssistantForCausalLM": "gemma", "Gemma4ForConditionalGeneration": "gemma", "Gemma4ForCausalLM": "gemma", "Gemma4UnifiedForConditionalGeneration": "gemma", diff --git a/conversion/gemma.py b/conversion/gemma.py index 1258428b046..f72d5081840 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -785,6 +785,16 @@ def set_gguf_parameters(self): self.gguf_writer.add_suppress_tokens(suppress_tokens) +@ModelBase.register("Gemma4AssistantForCausalLM") +class Gemma4AssistantModel(Gemma4Model): + model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"]) + self.gguf_writer.add_nextn_predict_layers(self.block_count) + + @ModelBase.register("Gemma4ForConditionalGeneration") class Gemma4VisionAudioModel(MmprojModel): has_audio_encoder = True diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 814980ce508..b48bc0bcb8f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -440,6 +440,7 @@ class MODEL_ARCH(IntEnum): GEMMA3 = auto() GEMMA3N = auto() GEMMA4 = auto() + GEMMA4_ASSISTANT = auto() GEMMA_EMBEDDING = auto() STARCODER2 = auto() RWKV6 = auto() @@ -897,6 +898,8 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp + NEXTN_PRE_PROJ = auto() + NEXTN_POST_PROJ = auto() NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() NEXTN_ENORM = auto() @@ -986,6 +989,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.GEMMA4: "gemma4", + MODEL_ARCH.GEMMA4_ASSISTANT: "gemma4-assistant", MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", @@ -1471,6 +1475,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down", MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm", # NextN/MTP + 
MODEL_TENSOR.NEXTN_PRE_PROJ: "nextn.pre_projection", + MODEL_TENSOR.NEXTN_POST_PROJ: "nextn.post_projection", MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", @@ -2577,6 +2583,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.PER_LAYER_PROJ_NORM, MODEL_TENSOR.PER_LAYER_POST_NORM, ], + MODEL_ARCH.GEMMA4_ASSISTANT: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.NEXTN_PRE_PROJ, + MODEL_TENSOR.NEXTN_POST_PROJ, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.LAYER_OUT_SCALE, + ], MODEL_ARCH.GEMMA_EMBEDDING: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 3e63b216505..34feade0783 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2367,6 +2367,14 @@ class TensorNameMap: ), # NextN/MTP tensors + MODEL_TENSOR.NEXTN_PRE_PROJ: ( + "pre_projection", + ), + + MODEL_TENSOR.NEXTN_POST_PROJ: ( + "post_projection", + ), + MODEL_TENSOR.NEXTN_EH_PROJ: ( "model.layers.{bid}.eh_proj", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 63fc9559bad..55daa600a92 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -57,6 +57,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA3, "gemma3" }, { LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_GEMMA4, "gemma4" }, + { LLM_ARCH_GEMMA4_ASSISTANT, "gemma4-assistant" }, { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, @@ -458,6 +459,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, { 
LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, + { LLM_TENSOR_NEXTN_PRE_PROJ, "nextn.pre_projection" }, + { LLM_TENSOR_NEXTN_POST_PROJ, "nextn.post_projection" }, { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, { LLM_TENSOR_DFLASH_FC, "dflash_fc" }, { LLM_TENSOR_DFLASH_HIDDEN_NORM, "dflash_hidden_norm" }, @@ -774,6 +777,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PRE_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_POST_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. 
diff --git a/src/llama-arch.h b/src/llama-arch.h index 7f42af697e3..e38e5c150e2 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -61,6 +61,7 @@ enum llm_arch { LLM_ARCH_GEMMA3, LLM_ARCH_GEMMA3N, LLM_ARCH_GEMMA4, + LLM_ARCH_GEMMA4_ASSISTANT, LLM_ARCH_GEMMA_EMBEDDING, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, @@ -564,6 +565,8 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, + LLM_TENSOR_NEXTN_PRE_PROJ, + LLM_TENSOR_NEXTN_POST_PROJ, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4098c719f98..9faa3674445 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -30,6 +30,21 @@ static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) { throw std::runtime_error("Unsupported ctx type"); } +static uint32_t ctx_type_to_embd_inp(const llama_hparams & hparams, llama_context_type ctx_type) { + switch (ctx_type) { + case LLAMA_CONTEXT_TYPE_DEFAULT: return hparams.n_embd_inp(); + case LLAMA_CONTEXT_TYPE_MTP : return hparams.n_embd_out(); + } + throw std::runtime_error("Unsupported ctx type"); +} + +namespace { +struct src_mctx_reset_on_exit { + llama_memory_context_ptr * slot; + ~src_mctx_reset_on_exit() { if (slot) slot->reset(); } +}; +} + llama_context::llama_context( const llama_model & model, llama_context_params params) : @@ -398,7 +413,11 @@ llama_context::llama_context( } } - sched_reserve(); + // MTP draft contexts can't reserve until the source context is wired + // via llama_set_mtp_source — defer to the first decode. + if (cparams.ctx_type != LLAMA_CONTEXT_TYPE_MTP) { + sched_reserve(); + } if (!cparams.flash_attn) { if (ggml_is_quantized(params.type_v)) { @@ -472,6 +491,23 @@ void llama_context::sched_reserve() { } } + // When called from decode(), src_mctx_for_decode is already populated and + // we must not drop it on exit (process_ubatch still needs it). 
Snapshot + // only when sched_reserve runs standalone (e.g. lazy first-decode reserve + // when set_mtp_source flipped sched_need_reserve). + const bool owns_src_snapshot = src_ctx && !src_mctx_for_decode; + if (owns_src_snapshot) { + auto * src_memory = src_ctx->get_memory(); + if (!src_memory) { + throw std::runtime_error("MTP source context has no memory module"); + } + src_mctx_for_decode = src_memory->init_full(); + if (!src_mctx_for_decode) { + throw std::runtime_error("failed to initialize MTP source memory snapshot"); + } + } + src_mctx_reset_on_exit reserve_src_drop{owns_src_snapshot ? &src_mctx_for_decode : nullptr}; + // avoid reserving graphs with zero outputs - assume one output per sequence const int n_outputs = n_seqs; @@ -930,7 +966,7 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) { throw std::runtime_error("no nextn embeddings"); } - const uint32_t n_embd = model.hparams.n_embd; + const uint32_t n_embd = model.hparams.n_embd_out(); if (!cparams.embeddings_nextn_masked) { // unmasked: nextn rows are stored densely, indexed by raw token position. 
@@ -1139,6 +1175,17 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) { cparams.embeddings_nextn_masked = masked; } +void llama_context::set_mtp_source(llama_context * src) { + if (src_ctx == src) { + return; + } + src_ctx = src; + src_mctx_for_decode.reset(); + // worst-case compute buffers were reserved without knowing about the source + // memory; force a re-reserve so the next decode sees src views + sched_need_reserve = true; +} + void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1510,7 +1557,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + const int64_t n_embd = ctx_type_to_embd_inp(hparams, cparams.ctx_type); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1645,7 +1692,7 @@ int llama_context::encode(const llama_batch & batch_inp) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd = hparams.n_embd_out(); GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_nextn.size); ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn.data, 0, n_tokens*n_embd*sizeof(float)); } @@ -1820,7 +1867,7 @@ int llama_context::decode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; const int64_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd_inp(); + const int64_t n_embd = ctx_type_to_embd_inp(hparams, cparams.ctx_type); // when computing embeddings, all tokens are output const bool output_all = cparams.embeddings; @@ -1902,6 +1949,20 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + src_mctx_reset_on_exit decode_src_drop{&src_mctx_for_decode}; + if (src_ctx) { + auto * src_memory = src_ctx->get_memory(); + if (!src_memory) 
{ + LLAMA_LOG_ERROR("%s: MTP source context has no memory module\n", __func__); + return -2; + } + src_mctx_for_decode = src_memory->init_full(); + if (!src_mctx_for_decode) { + LLAMA_LOG_ERROR("%s: failed to snapshot MTP source memory\n", __func__); + return -2; + } + } + sched_reserve(); bool did_optimize = false; @@ -2116,7 +2177,7 @@ int llama_context::decode(const llama_batch & batch_inp) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd = hparams.n_embd_out(); float * embd_nextn_out = embd_nextn.data + offset*n_embd; GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_nextn.size); @@ -2209,7 +2270,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; @@ -2228,12 +2288,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; - embd_nextn.size = has_embd_nextn ? n_embd*n_outputs_max : 0; + embd_nextn.size = has_embd_nextn ? n_embd_out*n_outputs_max : 0; if (has_embd_nextn && !cparams.embeddings_nextn_masked) { // unmasked: nextn row exists for every token in the batch, not just // those flagged via batch.logits[i] -> size by token count instead. - embd_nextn.size = (size_t) n_embd * n_batch; + embd_nextn.size = (size_t) n_embd_out * n_batch; } // Allocate backend sampling output buffers if there are backend samplers configured. @@ -2496,6 +2556,8 @@ llm_graph_params llama_context::graph_params( /*.cvec =*/ cvec.get(), /*.loras =*/ loras.get(), /*.mctx =*/ mctx, + /*.src_mctx =*/ src_mctx_for_decode.get(), + /*.src_model =*/ src_ctx ? 
&src_ctx->get_model() : nullptr, /*.cross =*/ &cross, /*.dflash =*/ &dflash, /*.samplers =*/ sampling.samplers, @@ -3802,6 +3864,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } +void llama_set_mtp_source(llama_context * ctx, llama_context * src) { + ctx->set_mtp_source(src); +} + float * llama_get_embeddings_nextn(llama_context * ctx) { ctx->synchronize(); diff --git a/src/llama-context.h b/src/llama-context.h index 92ba4081e89..d9cdc18b396 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -6,6 +6,7 @@ #include "llama-graph.h" #include "llama-adapter.h" #include "llama-impl.h" +#include "llama-memory.h" #include "ggml-cpp.h" #include "ggml-opt.h" @@ -111,6 +112,7 @@ struct llama_context { void set_embeddings (bool value); void set_embeddings_nextn(bool value, bool masked); + void set_mtp_source(llama_context * src); void set_causal_attn(bool value); void set_warmup(bool value); @@ -287,6 +289,12 @@ struct llama_context { std::unique_ptr memory; + // external KV source used by MTP draft contexts. src_ctx is the target + // context whose memory we read; src_mctx_for_decode is a per-decode + // snapshot held for the duration of one decode/sched_reserve call. 
+ llama_context * src_ctx = nullptr; + llama_memory_context_ptr src_mctx_for_decode; + // decode output (2-dimensional array: [n_outputs][n_vocab]) buffer_view logits = {nullptr, 0}; diff --git a/src/llama-ext.h b/src/llama-ext.h index 7ad6125fad3..25473f5601d 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -85,6 +85,11 @@ using llama_memory_breakdown = std::mapget_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + src_mctx->get_swa() ->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); +} + +bool llm_graph_input_attn_src_kv_iswa::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.src_mctx); + + this->src_mctx = mctx; + + bool res = true; + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + return res; +} + void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cross_kq_mask); @@ -1145,6 +1161,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cvec (params.cvec), loras (params.loras), mctx (params.mctx), + src_mctx (params.src_mctx), + src_model (params.src_model), cross (params.cross), dflash (params.dflash), samplers (params.samplers), @@ -2790,6 +2808,59 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } +llm_graph_input_attn_src_kv_iswa * llm_graph_context::build_attn_inp_src_kv_iswa() const { + GGML_ASSERT(src_mctx && "MTP draft graph requires src_mctx (set via llama_set_mtp_source)"); + + const auto * src_iswa = static_cast(src_mctx); + + auto inp = std::make_unique(hparams, cparams, src_iswa); + + inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, src_iswa->get_base(), ubatch, cparams); + inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; + + inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, src_iswa->get_swa(), ubatch, cparams); + inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; + + return (llm_graph_input_attn_src_kv_iswa *) res->add_input(std::move(inp)); +} + +ggml_tensor * llm_graph_context::build_attn( + llm_graph_input_attn_src_kv_iswa * inp, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * wo_s, + ggml_tensor * q_cur, + ggml_tensor * kq_b, + ggml_tensor * sinks, + ggml_tensor * v_mla, + float kq_scale, + int il_assist, + int il_src) const { + const bool is_swa = hparams.is_swa(il_assist); + + const auto * src_iswa = inp->src_mctx; + const auto * src_cur = is_swa ? src_iswa->get_swa() : src_iswa->get_base(); + + const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); + + ggml_build_forward_expand(gf, q_cur); + + ggml_tensor * q = q_cur; + ggml_tensor * k = src_cur->get_k(ctx0, il_src); + ggml_tensor * v = src_cur->get_v(ctx0, il_src); + + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il_assist); + cb(cur, "kqv_out", il_assist); + + if (wo) { + cur = build_lora_mm(wo, cur, wo_s); + } + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + return cur; +} + ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, ggml_tensor * wo, diff --git a/src/llama-graph.h b/src/llama-graph.h index 21bb80a9564..c393f94ca64 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -494,6 +494,37 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { const llama_kv_cache_iswa_context * mctx; }; +// mask-only input for attention against an external (read-only) ISWA KV cache. +// used by MTP draft graphs that attend to the target's KV without owning any. 
+class llm_graph_input_attn_src_kv_iswa : public llm_graph_input_i { +public: + llm_graph_input_attn_src_kv_iswa( + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_iswa_context * src_mctx) : + hparams(hparams), + cparams(cparams), + src_mctx(src_mctx) { + } + ~llm_graph_input_attn_src_kv_iswa() = default; + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } + ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } + + ggml_tensor * self_kq_mask = nullptr; + ggml_tensor * self_kq_mask_cnv = nullptr; + ggml_tensor * self_kq_mask_swa = nullptr; + ggml_tensor * self_kq_mask_swa_cnv = nullptr; + + const llama_hparams hparams; + const llama_cparams cparams; + + const llama_kv_cache_iswa_context * src_mctx; +}; + class llm_graph_input_attn_cross : public llm_graph_input_i { public: llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {} @@ -636,6 +667,11 @@ struct llm_graph_params { const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; const llama_memory_context_i * mctx; + // per-decode snapshot of an external memory module the graph reads from + // (never writes) — e.g. ctx_dft reading target KV during MTP draft. + // nullptr for a main decode. Rebound inside reuse-aware input classes. 
+ const llama_memory_context_i * src_mctx; + const llama_model * src_model; const llama_cross * cross; const llama_dflash * dflash = nullptr; @@ -854,6 +890,8 @@ struct llm_graph_context { const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; const llama_memory_context_i * mctx; + const llama_memory_context_i * src_mctx; + const llama_model * src_model; const llama_cross * cross; const llama_dflash * dflash = nullptr; @@ -1084,6 +1122,24 @@ struct llm_graph_context { float kq_scale, int il) const; + llm_graph_input_attn_src_kv_iswa * build_attn_inp_src_kv_iswa() const; + + // Q-only attention against an external ISWA KV cache (no K/V projections, + // no writes). il_assist labels the attention block in the local graph for + // logging; il_src indexes the source K/V layer to attend to. + ggml_tensor * build_attn( + llm_graph_input_attn_src_kv_iswa * inp, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * wo_s, + ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] + ggml_tensor * kq_b, + ggml_tensor * sinks, // [n_head_q] + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] + float kq_scale, + int il_assist, + int il_src) const; + llm_graph_input_attn_cross * build_attn_inp_cross() const; ggml_tensor * build_attn( diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index ccf7c2ccf92..487a96d7958 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -2478,6 +2478,10 @@ uint32_t llama_kv_cache_context::get_n_kv() const { return n_kv; } +llama_pos llama_kv_cache_context::seq_pos_max(llama_seq_id seq_id) const { + return kv->seq_pos_max(seq_id); +} + ggml_type llama_kv_cache_context::type_k() const { return kv->type_k(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 649269af6dd..99f50101956 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -354,6 +354,11 @@ class llama_kv_cache_context : public llama_memory_context_i { uint32_t get_n_kv() const; + // last position 
recorded in the cache for this sequence; -1 if absent. + // exposed for cross-context KV consumers (e.g. MTP draft) that need to + // anchor the source position without owning a memory module of their own. + llama_pos seq_pos_max(llama_seq_id seq_id) const; + ggml_type type_k() const; ggml_type type_v() const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f91d341f2d4..810b8ca2ffb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -139,6 +139,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_gemma3n(params); case LLM_ARCH_GEMMA4: return new llama_model_gemma4(params); + case LLM_ARCH_GEMMA4_ASSISTANT: + return new llama_model_gemma4_assistant(params); case LLM_ARCH_GEMMA_EMBEDDING: return new llama_model_gemma_embedding(params); case LLM_ARCH_STARCODER2: @@ -2427,6 +2429,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: case LLM_ARCH_GEMMA4: + case LLM_ARCH_GEMMA4_ASSISTANT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: @@ -2622,6 +2625,10 @@ int32_t llama_model_n_devices(const struct llama_model * model) { return (int32_t)model->devices.size(); } +int32_t llama_model_n_layer_kv(const struct llama_model * model) { + return (int32_t) model->hparams.n_layer_kv(); +} + ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) { if (i < 0 || i >= (int)model->devices.size()) { return nullptr; diff --git a/src/llama-model.h b/src/llama-model.h index d0b52f668fa..d8c3d0e66b9 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -552,6 +552,10 @@ struct llama_model { struct ggml_tensor * output_s = nullptr; struct ggml_tensor * output_in_s = nullptr; + // NextN/MTP model-level projections + struct ggml_tensor * nextn_pre_proj = nullptr; + struct ggml_tensor * nextn_post_proj = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b 
= nullptr; diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index d255dffb573..b4cc945e918 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -135,6 +135,214 @@ std::unique_ptr llama_model_gemma4::build_arch_graph(const ll return std::make_unique(*this, params); } +void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + + uint32_t n_kv_shared_layers = 0; + ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); + + hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); + + if (hparams.n_layer == 4) { + type = LLM_TYPE_31B; + } +} + +void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + if (n_embd_head_k != n_embd_head_v) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k == n_embd_head_v"); + } + if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa"); + } + if (hparams.n_embd_out() == n_embd) { + throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size"); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 
TENSOR_DUPLICATED); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + + const int64_t n_embd_backbone = hparams.n_embd_out(); + nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight"), { 2*n_embd_backbone, n_embd }, 0); + nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0); + + int rope_freqs_flag = 0; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t n_head = hparams.n_head(i); + const int64_t n_embd_head = hparams.n_embd_head_k(i); + const int64_t n_ff = hparams.n_ff(i); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0); + + if (!hparams.is_swa(i)) { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag); + rope_freqs_flag = TENSOR_DUPLICATED; + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0); + } +} + +std::unique_ptr llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, 
params); +} + +llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + GGML_ASSERT(src_mctx && "Gemma 4 assistant graph requires an MTP source (llama_set_mtp_source)"); + GGML_ASSERT(src_model && "Gemma 4 assistant graph requires a source model"); + GGML_ASSERT(src_model->tok_embd && "source model missing tok_embd"); + + const auto & src_hparams = src_model->hparams; + + // By convention the MTP draft reads from the trunk's final SWA and full layers. + const int32_t src_layer_full = (int32_t) src_hparams.n_layer - 1; + const int32_t src_layer_swa = (int32_t) src_hparams.n_layer - 2; + GGML_ASSERT(!src_hparams.is_swa(src_layer_full) && "trunk's last layer must be full attention"); + GGML_ASSERT( src_hparams.is_swa(src_layer_swa) && "trunk's penultimate layer must be SWA"); + + const int64_t n_embd_backbone = hparams.n_embd_out(); + + ggml_tensor * inp_tokens; + ggml_tensor * inp_h; + { + auto inp = std::make_unique(n_embd_backbone); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + inp_tokens = inp->tokens; + res->t_inp_tokens = inp->tokens; + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens); + cb(inp->embd, "inp_h", -1); + ggml_set_input(inp->embd); + inp_h = inp->embd; + res->t_inp_embd = inp->embd; + + res->add_input(std::move(inp)); + } + + ggml_tensor * x = ggml_get_rows(ctx0, src_model->tok_embd, inp_tokens); + x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); + cb(x, "inp_embd_target", -1); + + ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); + cb(xh, "inp_xh", -1); + + ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_pre_proj, xh); + cb(cur, "pre_proj", -1); + + auto * inp_attn = build_attn_inp_src_kv_iswa(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * inpL = cur; + + 
for (int il = 0; il < n_layer; ++il) { + const bool is_swa = hparams.is_swa(il); + const int32_t il_src = is_swa ? src_layer_swa : src_layer_full; + + const int64_t n_embd_head = hparams.n_embd_head_k(il); + const int64_t n_head = hparams.n_head(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + const int n_rot_l = hparams.n_rot(il); + + ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur_norm, "attn_norm", il); + + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, + freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, + Qcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il, il_src); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL); + cb(attn_out, "attn_out", il); + + cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, model.layers[il].ffn_post_norm, 
nullptr, LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = ggml_mul(ctx0, cur, model.layers[il].out_scale); + cb(cur, "out_scaled", il); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + ggml_tensor * logits = build_lora_mm(model.output, cur); + cb(logits, "result_output", -1); + res->t_logits = logits; + + ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_post_proj, cur); + cb(h_next, "h_nextn", -1); + res->t_h_nextn = h_next; + + ggml_build_forward_expand(gf, logits); + ggml_build_forward_expand(gf, h_next); +} + // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, int idx) { GGML_ASSERT(idx < (int) x->ne[2]); @@ -301,7 +509,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para } // TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing - if (il == n_layer - 1 && inp_out_ids) { + // keep all rows when extracting unmasked nextn embeddings (MTP target needs the hidden state for every token) + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -416,7 +625,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_per_layer, n_tokens] // TODO @ngxson : improve this - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids); } @@ -459,6 +668,17 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para model.output_norm, nullptr, LLM_NORM_RMS, -1); + // 
Expose the post-output-norm hidden state (the LM-head input feature) so that + // MTP draft contexts can read it via llama_get_embeddings_nextn_ith() as the + // recurrent h input. This matches the reference (transformers/vLLM/SGLang), + // which feeds the drafter the target's post-final-norm hidden state. + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + cb(cur, "result_norm", -1); res->t_embd = cur; diff --git a/src/models/models.h b/src/models/models.h index 03b770ab38c..4afcd918889 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -840,6 +840,19 @@ struct llama_model_gemma4 : public llama_model_base { }; +struct llama_model_gemma4_assistant : public llama_model_base { + llama_model_gemma4_assistant(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_gemma_embedding : public llama_model_base { llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index e8c120234ad..27f6ad8e874 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -10,6 +10,7 @@ #include "common.h" #include "fit.h" #include "llama.h" +#include "../../src/llama-ext.h" // staging API: llama_set_mtp_source #include "log.h" #include "sampling.h" #include "speculative.h" @@ -969,6 +970,11 @@ struct server_context_impl { llama_set_dflash(ctx_tgt, model_dft.get()); } + if (spec_mtp) { + // MTP draft must 
know its target before the first decode + llama_set_mtp_source(ctx_dft.get(), ctx_tgt); + } + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); params_base.speculative.draft.ctx_tgt = ctx_tgt; @@ -991,6 +997,10 @@ struct server_context_impl { return false; } + // wire the source before any decode (the seq-rm probe below + // triggers sched_reserve which needs src for Gemma4-style MTP) + llama_set_mtp_source(ctx_dft.get(), ctx_tgt); + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); params_base.speculative.draft.ctx_tgt = ctx_tgt; From af56714cfa60113afb54dde3d0374486bf7c24c7 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 19 May 2026 22:17:09 +0800 Subject: [PATCH 45/71] fix multi-seq --- src/llama-graph.cpp | 23 +++++++++++++++++++++++ src/models/gemma4.cpp | 1 - 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 96a5a4d56a3..049b91b01b5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2849,6 +2849,29 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = src_cur->get_k(ctx0, il_src); ggml_tensor * v = src_cur->get_v(ctx0, il_src); + // build_attn_mha splits q across k->ne[3] (the trunk's stream count). When the + // trunk runs kv_unified=false the assistant's ubatch only references a subset + // of streams (one per active draft seq); q->ne[2] is not divisible by the full + // n_stream and the view collapses tokens. Slice k/v down to exactly the streams + // referenced by this ubatch. Requires those streams to form a contiguous range. 
+ if (k->ne[3] > 1 && (uint32_t) k->ne[3] != ubatch.n_seqs_unq) { + GGML_ASSERT(ubatch.n_seqs_unq > 0 && ubatch.seq_id_unq); + llama_seq_id min_s = ubatch.seq_id_unq[0]; + llama_seq_id max_s = ubatch.seq_id_unq[0]; + for (uint32_t s = 1; s < ubatch.n_seqs_unq; ++s) { + min_s = std::min(min_s, ubatch.seq_id_unq[s]); + max_s = std::max(max_s, ubatch.seq_id_unq[s]); + } + GGML_ASSERT((uint32_t)(max_s - min_s + 1) == ubatch.n_seqs_unq && + "MTP src-kv attn requires the active draft seq_ids to be contiguous"); + GGML_ASSERT((int64_t) max_s < k->ne[3] && "MTP assistant seq_id beyond trunk stream count"); + + k = ggml_view_4d(ctx0, k, k->ne[0], k->ne[1], k->ne[2], (int64_t) ubatch.n_seqs_unq, + k->nb[1], k->nb[2], k->nb[3], (size_t) min_s * k->nb[3]); + v = ggml_view_4d(ctx0, v, v->ne[0], v->ne[1], v->ne[2], (int64_t) ubatch.n_seqs_unq, + v->nb[1], v->nb[2], v->nb[3], (size_t) min_s * v->nb[3]); + } + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il_assist); cb(cur, "kqv_out", il_assist); diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index b4cc945e918..938ba6bbab6 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -329,7 +329,6 @@ llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res->t_embd = cur; ggml_tensor * logits = build_lora_mm(model.output, cur); cb(logits, "result_output", -1); From 6289feb8ad96c603177601c3dc69bdec72ab0eaf Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 20 May 2026 23:41:33 +0800 Subject: [PATCH 46/71] add assert that draft + shared kv should be on same device --- src/llama-context.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9faa3674445..c9d827ac7e9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -43,6 +43,55 @@ struct 
src_mctx_reset_on_exit { llama_memory_context_ptr * slot; ~src_mctx_reset_on_exit() { if (slot) slot->reset(); } }; + +static void llama_assert_gemma4_mtp_source_placement( + const llama_context * ctx, + const llama_context * src) { + if (!ctx || !src) { + return; + } + + const auto & model_dft = ctx->get_model(); + const auto & model_tgt = src->get_model(); + + if (model_dft.arch != LLM_ARCH_GEMMA4_ASSISTANT || model_tgt.arch != LLM_ARCH_GEMMA4) { + return; + } + + if (model_tgt.split_mode() == LLAMA_SPLIT_MODE_TENSOR) { + return; + } + + const auto & hparams_dft = model_dft.hparams; + const auto & hparams_tgt = model_tgt.hparams; + + const int32_t il_tgt_full = (int32_t) hparams_tgt.n_layer - 1; + const int32_t il_tgt_swa = (int32_t) hparams_tgt.n_layer - 2; + + ggml_backend_dev_t dev_cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!dev_cpu) { + throw std::runtime_error("Gemma 4 assistant MTP placement check failed: no CPU backend found"); + } + + const bool kv_offload = src->get_cparams().offload_kqv; + + for (uint32_t il_dft = 0; il_dft < hparams_dft.n_layer; ++il_dft) { + const int32_t il_tgt = hparams_dft.is_swa(il_dft) ? il_tgt_swa : il_tgt_full; + + ggml_backend_dev_t dev_dft = model_dft.dev_layer(il_dft); + ggml_backend_dev_t dev_kv = kv_offload ? 
model_tgt.dev_layer(il_tgt) : dev_cpu; + + if (dev_dft != dev_kv) { + throw std::runtime_error(format( + "Gemma 4 assistant MTP placement mismatch: draft layer %d is on %s, " + "but shared target KV layer %d is on %s", + (int) il_dft, + ggml_backend_dev_name(dev_dft), + (int) il_tgt, + ggml_backend_dev_name(dev_kv))); + } + } +} } llama_context::llama_context( @@ -1179,6 +1228,7 @@ void llama_context::set_mtp_source(llama_context * src) { if (src_ctx == src) { return; } + llama_assert_gemma4_mtp_source_placement(this, src); src_ctx = src; src_mctx_for_decode.reset(); // worst-case compute buffers were reserved without knowing about the source From 1cf8220502bd67a0706dadd0b687237d4d62fda4 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 22 May 2026 00:17:02 +0800 Subject: [PATCH 47/71] add Q rot when cache is quantized --- src/llama-graph.cpp | 29 +++++++++++++++++++++++++++++ src/llama-graph.h | 5 +++++ tools/server/server-context.cpp | 17 ++++++++++++----- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 049b91b01b5..062f277acaa 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -734,6 +734,19 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { void llm_graph_input_attn_src_kv_iswa::set_input(const llama_ubatch * ubatch) { src_mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); src_mctx->get_swa() ->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + + if (self_k_rot) { + src_mctx->get_base()->set_input_k_rot(self_k_rot); + } + if (self_v_rot) { + src_mctx->get_base()->set_input_v_rot(self_v_rot); + } + if (self_k_rot_swa) { + src_mctx->get_swa()->set_input_k_rot(self_k_rot_swa); + } + if (self_v_rot_swa) { + src_mctx->get_swa()->set_input_v_rot(self_v_rot_swa); + } } bool llm_graph_input_attn_src_kv_iswa::can_reuse(const llm_graph_params & params) { @@ -2821,6 +2834,11 @@ llm_graph_input_attn_src_kv_iswa * 
llm_graph_context::build_attn_inp_src_kv_iswa inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, src_iswa->get_swa(), ubatch, cparams); inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; + inp->self_k_rot = src_iswa->get_base()->build_input_k_rot(ctx0); + inp->self_v_rot = src_iswa->get_base()->build_input_v_rot(ctx0); + inp->self_k_rot_swa = src_iswa->get_swa()->build_input_k_rot(ctx0); + inp->self_v_rot_swa = src_iswa->get_swa()->build_input_v_rot(ctx0); + return (llm_graph_input_attn_src_kv_iswa *) res->add_input(std::move(inp)); } @@ -2843,6 +2861,13 @@ ggml_tensor * llm_graph_context::build_attn( const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); + auto * k_rot = is_swa ? inp->self_k_rot_swa : inp->self_k_rot; + auto * v_rot = is_swa ? inp->self_v_rot_swa : inp->self_v_rot; + + if (k_rot) { + q_cur = ggml_mul_mat_aux(ctx0, q_cur, k_rot); + } + ggml_build_forward_expand(gf, q_cur); ggml_tensor * q = q_cur; @@ -2875,6 +2900,10 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il_assist); cb(cur, "kqv_out", il_assist); + if (v_rot) { + cur = ggml_mul_mat_aux(ctx0, cur, v_rot); + } + if (wo) { cur = build_lora_mm(wo, cur, wo_s); } diff --git a/src/llama-graph.h b/src/llama-graph.h index c393f94ca64..0a4deb699a4 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -519,6 +519,11 @@ class llm_graph_input_attn_src_kv_iswa : public llm_graph_input_i { ggml_tensor * self_kq_mask_swa = nullptr; ggml_tensor * self_kq_mask_swa_cnv = nullptr; + ggml_tensor * self_k_rot = nullptr; + ggml_tensor * self_v_rot = nullptr; + ggml_tensor * self_k_rot_swa = nullptr; + ggml_tensor * self_v_rot_swa = nullptr; + const llama_hparams hparams; const llama_cparams cparams; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 27f6ad8e874..f49480d8e52 100644 --- 
a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -925,13 +925,23 @@ struct server_context_impl { SRV_INF("loading draft model '%s'\n", params_spec.mparams.path.c_str()); + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + auto params_dft = params_base; params_dft.devices = params_spec.devices; params_dft.model = params_spec.mparams; params_dft.n_gpu_layers = params_spec.n_gpu_layers; - params_dft.cache_type_k = params_spec.cache_type_k; - params_dft.cache_type_v = params_spec.cache_type_v; + // TODO: find a better way to expose that the cache is shared + if (spec_mtp) { + params_dft.cache_type_k = params_base.cache_type_k; + params_dft.cache_type_v = params_base.cache_type_v; + } else { + params_dft.cache_type_k = params_spec.cache_type_k; + params_dft.cache_type_v = params_spec.cache_type_v; + } if (params_spec.cpuparams.n_threads > 0) { params_dft.cpuparams.n_threads = params_spec.cpuparams.n_threads; @@ -950,9 +960,6 @@ struct server_context_impl { auto cparams = common_context_params_to_llama(params_dft); - const bool spec_mtp = std::find(params_base.speculative.types.begin(), - params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); if (spec_mtp) { cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } From 5fa9213833324776757ad1ef61ef34f87b4189e5 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 28 May 2026 12:53:08 +0800 Subject: [PATCH 48/71] add temp hack to not use fit with gemma4, rm later --- tools/server/server-context.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f49480d8e52..ee2f08a096e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -11,6 +11,7 @@ #include "fit.h" #include "llama.h" 
#include "../../src/llama-ext.h" // staging API: llama_set_mtp_source +#include "ggml-cpp.h" #include "log.h" #include "sampling.h" #include "speculative.h" @@ -846,11 +847,27 @@ struct server_context_impl { } cparams_dft.n_rs_seq = 0; + bool skip_measure = false; + //TODO: remove this + if (spec_mtp && has_draft) { + struct gguf_init_params meta_params = { + /* .no_alloc = */ true, + /* .ctx = */ nullptr, + }; + gguf_context_ptr meta(gguf_init_from_file(params_dft.model.path.c_str(), meta_params)); + + if (std::string(gguf_get_val_str(meta.get(), gguf_find_key(meta.get(), "general.architecture"))) == "gemma4-assistant") { + skip_measure = true; + SRV_WRN("[spec] skipping --fit memory measurement for Gemma 4 assistant draft model '%s'\n", + params_dft.model.path.c_str()); + } + } + std::vector devs; uint32_t hp_ngl = 0; uint32_t hp_nct = 0; uint32_t hp_nex = 0; - try { + if (!skip_measure) try { auto dmd = common_get_device_memory_data( params_dft.model.path.c_str(), &mparams_dft, &cparams_dft, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); From 5edc87fa6fd613120df4755599cb0498ee3639c0 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 28 May 2026 13:41:39 +0800 Subject: [PATCH 49/71] add exception in test-llama-archs --- tests/test-llama-archs.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 90004e37906..22e6dcc2e10 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -392,7 +392,7 @@ static bool arch_supported(const llm_arch arch) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { return false; // FIXME CUDA backend crashes. 
} - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { return false; // FIXME @ngxson } if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) { @@ -450,7 +450,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } for (bool moe : {false, true}) { @@ -553,7 +553,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } From 571a9ddca53c8e056b55aed31cfcf0edf6a6ab05 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 28 May 2026 14:12:23 +0800 Subject: [PATCH 50/71] move assistant to separate file --- src/models/gemma4-assistant.cpp | 208 ++++++++++++++++++++++++++++++++ src/models/gemma4.cpp | 207 ------------------------------- 2 files changed, 208 insertions(+), 207 deletions(-) create mode 100644 src/models/gemma4-assistant.cpp diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp new file mode 100644 index 00000000000..1447058b4be --- /dev/null +++ b/src/models/gemma4-assistant.cpp @@ -0,0 +1,208 @@ +#include "models.h" + +void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + + uint32_t n_kv_shared_layers = 0; + ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); + + 
hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); + + if (hparams.n_layer == 4) { + type = LLM_TYPE_31B; + } +} + +void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + if (n_embd_head_k != n_embd_head_v) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k == n_embd_head_v"); + } + if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa"); + } + if (hparams.n_embd_out() == n_embd) { + throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size"); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + + const int64_t n_embd_backbone = hparams.n_embd_out(); + nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight"), { 2*n_embd_backbone, n_embd }, 0); + nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0); + + int rope_freqs_flag = 0; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t n_head = hparams.n_head(i); + const int64_t n_embd_head = hparams.n_embd_head_k(i); + const int64_t n_ff = hparams.n_ff(i); + + layer.attn_norm 
= create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0); + + if (!hparams.is_swa(i)) { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag); + rope_freqs_flag = TENSOR_DUPLICATED; + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0); + } +} + +std::unique_ptr llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + GGML_ASSERT(src_mctx && "Gemma 4 assistant graph requires an MTP source (llama_set_mtp_source)"); + GGML_ASSERT(src_model && "Gemma 4 assistant graph requires a source model"); + GGML_ASSERT(src_model->tok_embd && "source model missing tok_embd"); + + const auto & src_hparams = src_model->hparams; + + // By convention the MTP draft reads from the trunk's final SWA and full layers. 
+ const int32_t src_layer_full = (int32_t) src_hparams.n_layer - 1; + const int32_t src_layer_swa = (int32_t) src_hparams.n_layer - 2; + GGML_ASSERT(!src_hparams.is_swa(src_layer_full) && "trunk's last layer must be full attention"); + GGML_ASSERT( src_hparams.is_swa(src_layer_swa) && "trunk's penultimate layer must be SWA"); + + const int64_t n_embd_backbone = hparams.n_embd_out(); + + ggml_tensor * inp_tokens; + ggml_tensor * inp_h; + { + auto inp = std::make_unique(n_embd_backbone); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + inp_tokens = inp->tokens; + res->t_inp_tokens = inp->tokens; + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens); + cb(inp->embd, "inp_h", -1); + ggml_set_input(inp->embd); + inp_h = inp->embd; + res->t_inp_embd = inp->embd; + + res->add_input(std::move(inp)); + } + + ggml_tensor * x = ggml_get_rows(ctx0, src_model->tok_embd, inp_tokens); + x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); + cb(x, "inp_embd_target", -1); + + ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); + cb(xh, "inp_xh", -1); + + ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_pre_proj, xh); + cb(cur, "pre_proj", -1); + + auto * inp_attn = build_attn_inp_src_kv_iswa(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + const bool is_swa = hparams.is_swa(il); + const int32_t il_src = is_swa ? 
src_layer_swa : src_layer_full; + + const int64_t n_embd_head = hparams.n_embd_head_k(il); + const int64_t n_head = hparams.n_head(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + const int n_rot_l = hparams.n_rot(il); + + ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur_norm, "attn_norm", il); + + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, + freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, + Qcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il, il_src); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL); + cb(attn_out, "attn_out", il); + + cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = 
ggml_mul(ctx0, cur, model.layers[il].out_scale); + cb(cur, "out_scaled", il); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + ggml_tensor * logits = build_lora_mm(model.output, cur); + cb(logits, "result_output", -1); + res->t_logits = logits; + + ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_post_proj, cur); + cb(h_next, "h_nextn", -1); + res->t_h_nextn = h_next; + + ggml_build_forward_expand(gf, logits); + ggml_build_forward_expand(gf, h_next); +} diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 938ba6bbab6..1b1443b84b7 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -135,213 +135,6 @@ std::unique_ptr llama_model_gemma4::build_arch_graph(const ll return std::make_unique(*this, params); } -void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { - hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); - - uint32_t n_kv_shared_layers = 0; - ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); - - hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; - hparams.f_attention_scale = 1.0f; - - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); - ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); - - if (hparams.n_layer == 4) { - type = LLM_TYPE_31B; - } -} - -void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { - LLAMA_LOAD_LOCALS; - - if (n_embd_head_k != n_embd_head_v) { - throw std::runtime_error("Gemma 4 assistant 
requires n_embd_head_k == n_embd_head_v"); - } - if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { - throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa"); - } - if (hparams.n_embd_out() == n_embd) { - throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size"); - } - - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); - - const int64_t n_embd_backbone = hparams.n_embd_out(); - nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight"), { 2*n_embd_backbone, n_embd }, 0); - nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0); - - int rope_freqs_flag = 0; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - const int64_t n_head = hparams.n_head(i); - const int64_t n_embd_head = hparams.n_embd_head_k(i); - const int64_t n_ff = hparams.n_ff(i); - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); - - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0); - - if (!hparams.is_swa(i)) { - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag); - rope_freqs_flag = TENSOR_DUPLICATED; - } - - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", 
i), { n_embd }, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); - layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0); - } -} - -std::unique_ptr llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const { - return std::make_unique(*this, params); -} - -llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : - llm_graph_context(params) { - GGML_ASSERT(src_mctx && "Gemma 4 assistant graph requires an MTP source (llama_set_mtp_source)"); - GGML_ASSERT(src_model && "Gemma 4 assistant graph requires a source model"); - GGML_ASSERT(src_model->tok_embd && "source model missing tok_embd"); - - const auto & src_hparams = src_model->hparams; - - // By convention the MTP draft reads from the trunk's final SWA and full layers. 
- const int32_t src_layer_full = (int32_t) src_hparams.n_layer - 1; - const int32_t src_layer_swa = (int32_t) src_hparams.n_layer - 2; - GGML_ASSERT(!src_hparams.is_swa(src_layer_full) && "trunk's last layer must be full attention"); - GGML_ASSERT( src_hparams.is_swa(src_layer_swa) && "trunk's penultimate layer must be SWA"); - - const int64_t n_embd_backbone = hparams.n_embd_out(); - - ggml_tensor * inp_tokens; - ggml_tensor * inp_h; - { - auto inp = std::make_unique(n_embd_backbone); - - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - cb(inp->tokens, "inp_tokens", -1); - ggml_set_input(inp->tokens); - inp_tokens = inp->tokens; - res->t_inp_tokens = inp->tokens; - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens); - cb(inp->embd, "inp_h", -1); - ggml_set_input(inp->embd); - inp_h = inp->embd; - res->t_inp_embd = inp->embd; - - res->add_input(std::move(inp)); - } - - ggml_tensor * x = ggml_get_rows(ctx0, src_model->tok_embd, inp_tokens); - x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); - cb(x, "inp_embd_target", -1); - - ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); - cb(xh, "inp_xh", -1); - - ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_pre_proj, xh); - cb(cur, "pre_proj", -1); - - auto * inp_attn = build_attn_inp_src_kv_iswa(); - ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = build_inp_out_ids(); - - ggml_tensor * inpL = cur; - - for (int il = 0; il < n_layer; ++il) { - const bool is_swa = hparams.is_swa(il); - const int32_t il_src = is_swa ? 
src_layer_swa : src_layer_full; - - const int64_t n_embd_head = hparams.n_embd_head_k(il); - const int64_t n_head = hparams.n_head(il); - - const float freq_base_l = model.get_rope_freq_base(cparams, il); - const float freq_scale_l = model.get_rope_freq_scale(cparams, il); - const int n_rot_l = hparams.n_rot(il); - - ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur_norm, "attn_norm", il); - - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs; - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, - freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur_pos", il); - - cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, - Qcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il, il_src); - - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "attn_post_norm", il); - - ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL); - cb(attn_out, "attn_out", il); - - cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, nullptr, nullptr, - model.layers[il].ffn_gate, nullptr, nullptr, - model.layers[il].ffn_down, nullptr, nullptr, - nullptr, - LLM_FFN_GELU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); - cb(cur, "ffn_post_norm", il); - - cur = ggml_add(ctx0, cur, attn_out); - - cur = 
ggml_mul(ctx0, cur, model.layers[il].out_scale); - cb(cur, "out_scaled", il); - - inpL = cur; - } - cur = inpL; - - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - ggml_tensor * logits = build_lora_mm(model.output, cur); - cb(logits, "result_output", -1); - res->t_logits = logits; - - ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_post_proj, cur); - cb(h_next, "h_nextn", -1); - res->t_h_nextn = h_next; - - ggml_build_forward_expand(gf, logits); - ggml_build_forward_expand(gf, h_next); -} - // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, int idx) { GGML_ASSERT(idx < (int) x->ne[2]); From b30096576411a61abe1808e223105604d4b8a70f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 5 Jun 2026 14:59:44 +0800 Subject: [PATCH 51/71] add unified assistant --- conversion/__init__.py | 1 + conversion/gemma.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conversion/__init__.py b/conversion/__init__.py index 87d2f80550d..18162976f45 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -79,6 +79,7 @@ "Gemma4ForConditionalGeneration": "gemma", "Gemma4ForCausalLM": "gemma", "Gemma4UnifiedForConditionalGeneration": "gemma", + "Gemma4UnifiedAssistantForCausalLM": "gemma", "GemmaForCausalLM": "gemma", "Glm4ForCausalLM": "glm", "Glm4MoeForCausalLM": "glm", diff --git a/conversion/gemma.py b/conversion/gemma.py index f72d5081840..d8cf8be575c 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -785,7 +785,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_suppress_tokens(suppress_tokens) -@ModelBase.register("Gemma4AssistantForCausalLM") +@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM") class Gemma4AssistantModel(Gemma4Model): model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT From bcaf30d8c29192e36ebfc8364e9b0dcffa6520aa Mon Sep 17 
00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Jun 2026 14:38:41 +0300 Subject: [PATCH 52/71] cont : adjust to hparams changes --- common/speculative.cpp | 4 ++-- src/llama-context.cpp | 17 ++++++++--------- src/llama-ext.h | 5 ----- src/llama-model.cpp | 4 ---- src/models/gemma4-assistant.cpp | 12 +++++------- 5 files changed, 15 insertions(+), 27 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index f452ad3ca68..7c234209a58 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -492,12 +492,12 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { } } + kv_shared_with_target = llama_get_memory(ctx_dft) == nullptr; + llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); llama_set_mtp_source(ctx_dft, ctx_tgt); - kv_shared_with_target = llama_model_n_layer_kv(llama_get_model(ctx_dft)) == 0; - pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); i_batch_beg.assign(n_seq, -1); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c9d827ac7e9..eed1f11b6e0 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -65,8 +65,8 @@ static void llama_assert_gemma4_mtp_source_placement( const auto & hparams_dft = model_dft.hparams; const auto & hparams_tgt = model_tgt.hparams; - const int32_t il_tgt_full = (int32_t) hparams_tgt.n_layer - 1; - const int32_t il_tgt_swa = (int32_t) hparams_tgt.n_layer - 2; + const int32_t il_tgt_full = (int32_t) hparams_tgt.n_layer() - 1; + const int32_t il_tgt_swa = (int32_t) hparams_tgt.n_layer() - 2; ggml_backend_dev_t dev_cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (!dev_cpu) { @@ -75,7 +75,7 @@ static void llama_assert_gemma4_mtp_source_placement( const bool kv_offload = src->get_cparams().offload_kqv; - for (uint32_t il_dft = 0; il_dft < hparams_dft.n_layer; ++il_dft) { + for (uint32_t il_dft = 0; il_dft < hparams_dft.n_layer(); ++il_dft) { const int32_t il_tgt = 
hparams_dft.is_swa(il_dft) ? il_tgt_swa : il_tgt_full; ggml_backend_dev_t dev_dft = model_dft.dev_layer(il_dft); @@ -3769,12 +3769,11 @@ llama_context * llama_init_from_model( model->hparams.pooling_type, params.pooling_type); } - if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && - model->hparams.n_layer_nextn == 0) { - LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); - return nullptr; - } - + //if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && + // model->hparams.n_layer_nextn == 0) { + // LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); + // return nullptr; + //} try { auto * ctx = new llama_context(*model, params); diff --git a/src/llama-ext.h b/src/llama-ext.h index 25473f5601d..92f8bfffa49 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -85,11 +85,6 @@ using llama_memory_breakdown = std::mapdevices.size(); } -int32_t llama_model_n_layer_kv(const struct llama_model * model) { - return (int32_t) model->hparams.n_layer_kv(); -} - ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) { if (i < 0 || i >= (int)model->devices.size()) { return nullptr; diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index 1447058b4be..10f69fa3d84 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -2,22 +2,20 @@ void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); uint32_t n_kv_shared_layers = 0; ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); - hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; - hparams.f_attention_scale = 1.0f; + hparams.f_attention_scale = 1.0f; - 
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); - if (hparams.n_layer == 4) { + if (hparams.n_layer() == 4) { type = LLM_TYPE_31B; } } @@ -88,8 +86,8 @@ llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ const auto & src_hparams = src_model->hparams; // By convention the MTP draft reads from the trunk's final SWA and full layers. - const int32_t src_layer_full = (int32_t) src_hparams.n_layer - 1; - const int32_t src_layer_swa = (int32_t) src_hparams.n_layer - 2; + const int32_t src_layer_full = (int32_t) src_hparams.n_layer() - 1; + const int32_t src_layer_swa = (int32_t) src_hparams.n_layer() - 2; GGML_ASSERT(!src_hparams.is_swa(src_layer_full) && "trunk's last layer must be full attention"); GGML_ASSERT( src_hparams.is_swa(src_layer_swa) && "trunk's penultimate layer must be SWA"); From 57a2246340f0c27c9a887b43f388091359f60ebf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Jun 2026 14:39:03 +0300 Subject: [PATCH 53/71] cont : avoid computations on the CPU --- src/llama-arch.cpp | 4 ++-- src/llama-arch.h | 4 ++-- src/models/gemma4-assistant.cpp | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 55daa600a92..7a704c971db 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -777,8 +777,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - 
{LLM_TENSOR_NEXTN_PRE_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_NEXTN_POST_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PRE_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_POST_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. diff --git a/src/llama-arch.h b/src/llama-arch.h index e38e5c150e2..98027044ae5 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -565,8 +565,8 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, - LLM_TENSOR_NEXTN_PRE_PROJ, - LLM_TENSOR_NEXTN_POST_PROJ, + LLM_TENSOR_NEXTN_PRE_PROJ, // TODO: rename to PROJ_PRE + LLM_TENSOR_NEXTN_POST_PROJ, // TODO: rename to PROJ_POST LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index 10f69fa3d84..8c274e0cbd3 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -39,7 +39,6 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); const int64_t n_embd_backbone = hparams.n_embd_out(); - nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight"), { 2*n_embd_backbone, n_embd }, 0); nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0); int rope_freqs_flag = 0; @@ -51,6 +50,10 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { const int64_t n_embd_head = hparams.n_embd_head_k(i); const int64_t n_ff = hparams.n_ff(i); + if (i == 0) { + nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight", i), { 2*n_embd_backbone, n_embd }, 
0); + } + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); From 93aa400e2fc49e3872603e1fabca948fac15edd6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Jun 2026 10:48:36 +0300 Subject: [PATCH 54/71] cont : clean-up --- common/speculative.cpp | 21 +++-- include/llama.h | 2 + src/llama-context.cpp | 110 ++++++---------------- src/llama-context.h | 9 +- src/llama-cparams.h | 2 + src/llama-ext.h | 3 +- src/llama-graph.cpp | 156 ++++--------------------------- src/llama-graph.h | 62 +----------- src/llama-hparams.cpp | 4 + src/llama-hparams.h | 4 + src/llama-kv-cache-dsa.cpp | 4 +- src/llama-kv-cache-iswa.cpp | 18 +++- src/llama-kv-cache-iswa.h | 4 +- src/llama-kv-cache.cpp | 83 +++++++++++++++- src/llama-kv-cache.h | 9 +- src/llama-memory-hybrid-iswa.cpp | 2 + src/llama-memory-hybrid.cpp | 2 + src/llama-memory.h | 4 + src/llama-model.cpp | 84 ++++++++++++----- src/llama-model.h | 1 + src/models/gemma4-assistant.cpp | 43 ++++----- tools/server/server-context.cpp | 19 ++-- 22 files changed, 280 insertions(+), 366 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 7c234209a58..b99291a7972 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -3,13 +3,14 @@ #include "common.h" #include "ggml.h" #include "llama.h" -#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" #include "ngram-mod.h" #include "sampling.h" +#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) + #include #include #include @@ -419,7 +420,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { int32_t 
n_embd = 0; - bool kv_shared_with_target = false; + bool is_mem_shared = false; // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. // The last h-row of one process() call needs the first token of the NEXT @@ -492,11 +493,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { } } - kv_shared_with_target = llama_get_memory(ctx_dft) == nullptr; - llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); - llama_set_mtp_source(ctx_dft, ctx_tgt); + + is_mem_shared = llama_get_ctx_src(ctx_dft) == ctx_tgt; pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); @@ -537,7 +537,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); - if (pos_max < N - 1 && !kv_shared_with_target) { + + if (pos_max < N - 1 && !is_mem_shared) { LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). 
" @@ -581,7 +582,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { const size_t row_bytes = (size_t) n_embd * sizeof(float); // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode - if (!kv_shared_with_target) { + if (!is_mem_shared) { common_batch_clear(batch); for (int k = 0; k < n_tokens; ++k) { @@ -724,7 +725,11 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { continue; } - common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + if (is_mem_shared) { + common_batch_add(batch, id, dp.n_past, { seq_id }, true); + } else { + common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + } std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); } diff --git a/include/llama.h b/include/llama.h index a7e5679c0ce..609de510a55 100644 --- a/include/llama.h +++ b/include/llama.h @@ -394,6 +394,8 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + + struct llama_context * ctx_src; }; struct llama_model_tensor_override { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index eed1f11b6e0..4bdf2159abf 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -30,20 +30,6 @@ static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) { throw std::runtime_error("Unsupported ctx type"); } -static uint32_t ctx_type_to_embd_inp(const llama_hparams & hparams, llama_context_type ctx_type) { - switch (ctx_type) { - case LLAMA_CONTEXT_TYPE_DEFAULT: return hparams.n_embd_inp(); - case LLAMA_CONTEXT_TYPE_MTP : return hparams.n_embd_out(); - } - throw std::runtime_error("Unsupported ctx type"); -} - -namespace { -struct src_mctx_reset_on_exit { - llama_memory_context_ptr * slot; - ~src_mctx_reset_on_exit() { if (slot) slot->reset(); } -}; - static void llama_assert_gemma4_mtp_source_placement( const llama_context * ctx, const 
llama_context * src) { @@ -92,7 +78,6 @@ static void llama_assert_gemma4_mtp_source_placement( } } } -} llama_context::llama_context( const llama_model & model, @@ -133,9 +118,10 @@ llama_context::llama_context( cparams.embeddings_nextn_masked = false; cparams.offload_kqv = params.offload_kqv; cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; cparams.warmup = false; + cparams.ctx_type = params.ctx_type; + cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -148,7 +134,10 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - cparams.ctx_type = params.ctx_type; + // TODO: more generic + if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + cparams.ctx_src = params.ctx_src; + } // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later @@ -367,7 +356,8 @@ llama_context::llama_context( /*.type_k =*/ params.type_k, /*.type_v =*/ params.type_v, /*.swa_full =*/ params.swa_full, - /*.ctx_type= */ cparams.ctx_type, + /*.ctx_type =*/ cparams.ctx_type, + /*.mem_src =*/ params.ctx_src ? params.ctx_src->memory.get() : nullptr, }; memory.reset(model.create_memory(params_mem, cparams)); @@ -462,11 +452,7 @@ llama_context::llama_context( } } - // MTP draft contexts can't reserve until the source context is wired - // via llama_set_mtp_source — defer to the first decode. 
- if (cparams.ctx_type != LLAMA_CONTEXT_TYPE_MTP) { - sched_reserve(); - } + sched_reserve(); if (!cparams.flash_attn) { if (ggml_is_quantized(params.type_v)) { @@ -540,23 +526,6 @@ void llama_context::sched_reserve() { } } - // When called from decode(), src_mctx_for_decode is already populated and - // we must not drop it on exit (process_ubatch still needs it). Snapshot - // only when sched_reserve runs standalone (e.g. lazy first-decode reserve - // when set_mtp_source flipped sched_need_reserve). - const bool owns_src_snapshot = src_ctx && !src_mctx_for_decode; - if (owns_src_snapshot) { - auto * src_memory = src_ctx->get_memory(); - if (!src_memory) { - throw std::runtime_error("MTP source context has no memory module"); - } - src_mctx_for_decode = src_memory->init_full(); - if (!src_mctx_for_decode) { - throw std::runtime_error("failed to initialize MTP source memory snapshot"); - } - } - src_mctx_reset_on_exit reserve_src_drop{owns_src_snapshot ? &src_mctx_for_decode : nullptr}; - // avoid reserving graphs with zero outputs - assume one output per sequence const int n_outputs = n_seqs; @@ -1224,18 +1193,6 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) { cparams.embeddings_nextn_masked = masked; } -void llama_context::set_mtp_source(llama_context * src) { - if (src_ctx == src) { - return; - } - llama_assert_gemma4_mtp_source_placement(this, src); - src_ctx = src; - src_mctx_for_decode.reset(); - // worst-case compute buffers were reserved without knowing about the source - // memory; force a re-reserve so the next decode sees src views - sched_need_reserve = true; -} - void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1607,7 +1564,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = ctx_type_to_embd_inp(hparams, cparams.ctx_type); + const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_vocab = 
model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1917,7 +1874,7 @@ int llama_context::decode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; const int64_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = ctx_type_to_embd_inp(hparams, cparams.ctx_type); + const int64_t n_embd = hparams.n_embd_inp(); // when computing embeddings, all tokens are output const bool output_all = cparams.embeddings; @@ -1999,20 +1956,6 @@ int llama_context::decode(const llama_batch & batch_inp) { } } - src_mctx_reset_on_exit decode_src_drop{&src_mctx_for_decode}; - if (src_ctx) { - auto * src_memory = src_ctx->get_memory(); - if (!src_memory) { - LLAMA_LOG_ERROR("%s: MTP source context has no memory module\n", __func__); - return -2; - } - src_mctx_for_decode = src_memory->init_full(); - if (!src_mctx_for_decode) { - LLAMA_LOG_ERROR("%s: failed to snapshot MTP source memory\n", __func__); - return -2; - } - } - sched_reserve(); bool did_optimize = false; @@ -2606,8 +2549,6 @@ llm_graph_params llama_context::graph_params( /*.cvec =*/ cvec.get(), /*.loras =*/ loras.get(), /*.mctx =*/ mctx, - /*.src_mctx =*/ src_mctx_for_decode.get(), - /*.src_model =*/ src_ctx ? 
&src_ctx->get_model() : nullptr, /*.cross =*/ &cross, /*.dflash =*/ &dflash, /*.samplers =*/ sampling.samplers, @@ -3696,6 +3637,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.ctx_src =*/ nullptr, }; return result; @@ -3769,11 +3711,11 @@ llama_context * llama_init_from_model( model->hparams.pooling_type, params.pooling_type); } - //if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && - // model->hparams.n_layer_nextn == 0) { - // LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); - // return nullptr; - //} + if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && + model->hparams.n_layer_nextn == 0) { + LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); + return nullptr; + } try { auto * ctx = new llama_context(*model, params); @@ -3913,8 +3855,12 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } -void llama_set_mtp_source(llama_context * ctx, llama_context * src) { - ctx->set_mtp_source(src); +llama_memory_t llama_get_memory(const struct llama_context * ctx) { + if (!ctx) { + return nullptr; + } + + return ctx->get_memory(); } float * llama_get_embeddings_nextn(llama_context * ctx) { @@ -3980,7 +3926,7 @@ struct ggml_cgraph * llama_graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs) { - auto * memory = ctx->get_memory(); + auto memory = ctx->get_memory(); llama_memory_context_ptr mctx; if (memory) { mctx = memory->init_full(); @@ -4020,10 +3966,6 @@ int32_t llama_set_adapter_cvec( // memory // -llama_memory_t llama_get_memory(const struct llama_context * ctx) { - return ctx->get_memory(); -} - void llama_memory_clear(llama_memory_t mem, bool data) { if (!mem) { return; @@ -4334,3 +4276,7 @@ void llama_opt_epoch( llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) { 
return ctx->memory_breakdown(); } + +llama_context * llama_get_ctx_src(struct llama_context * ctx) { + return ctx->get_cparams().ctx_src; +} diff --git a/src/llama-context.h b/src/llama-context.h index d9cdc18b396..a48e8fcca82 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -112,7 +112,6 @@ struct llama_context { void set_embeddings (bool value); void set_embeddings_nextn(bool value, bool masked); - void set_mtp_source(llama_context * src); void set_causal_attn(bool value); void set_warmup(bool value); @@ -287,13 +286,7 @@ struct llama_context { bool dflash_decoder_ctx = false; - std::unique_ptr memory; - - // external KV source used by MTP draft contexts. src_ctx is the target - // context whose memory we read; src_mctx_for_decode is a per-decode - // snapshot held for the duration of one decode/sched_reserve call. - llama_context * src_ctx = nullptr; - llama_memory_context_ptr src_mctx_for_decode; + llama_memory_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) buffer_view logits = {nullptr, 0}; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 1cba534edaf..2e0cab5a86c 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -50,4 +50,6 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + llama_context * ctx_src; }; diff --git a/src/llama-ext.h b/src/llama-ext.h index 92f8bfffa49..da95cdb9cea 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -93,7 +93,6 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c // If masked == true, output the embeddings only for the tokens with batch.logits != 0 // If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked); -LLAMA_API void llama_set_mtp_source(struct llama_context * ctx, struct llama_context * src); // mirrors: // LLAMA_API float * 
llama_get_embeddings(struct llama_context * ctx); @@ -101,3 +100,5 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); + +LLAMA_API llama_context * llama_get_ctx_src(struct llama_context * ctx); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 062f277acaa..f7868a5c9d6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -508,7 +508,7 @@ static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break; }; - LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); + LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__); LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__); @@ -676,18 +676,18 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { if (self_k_idxs && self_k_idxs->buffer) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); - - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_k_rot) { 
mctx->get_base()->set_input_k_rot(self_k_rot); } @@ -716,47 +716,18 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { if (self_k_idxs && self_k_idxs->buffer) { res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); - } - - return res; -} - -void llm_graph_input_attn_src_kv_iswa::set_input(const llama_ubatch * ubatch) { - src_mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); - src_mctx->get_swa() ->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); - - if (self_k_rot) { - src_mctx->get_base()->set_input_k_rot(self_k_rot); - } - if (self_v_rot) { - src_mctx->get_base()->set_input_v_rot(self_v_rot); - } - if (self_k_rot_swa) { - src_mctx->get_swa()->set_input_k_rot(self_k_rot_swa); - } - if (self_v_rot_swa) { - src_mctx->get_swa()->set_input_v_rot(self_v_rot_swa); } -} - -bool llm_graph_input_attn_src_kv_iswa::can_reuse(const llm_graph_params & params) { - const auto * mctx = static_cast(params.src_mctx); - this->src_mctx = mctx; + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); - bool res = true; - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), 
params.ubatch, params.cparams); return res; } @@ -896,18 +867,18 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); - - attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); } + attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch); attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); - - attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); } + attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); + if (inp_attn->self_k_rot) { attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot); } @@ -950,18 +921,18 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: 
need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; @@ -1146,6 +1117,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : ubatch (params.ubatch), n_embd (hparams.n_embd), n_layer (hparams.n_layer()), + n_layer_nextn (hparams.n_layer_nextn), n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head (hparams.n_head()), @@ -1174,8 +1146,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cvec (params.cvec), loras (params.loras), mctx (params.mctx), - src_mctx (params.src_mctx), - src_model (params.src_model), cross (params.cross), dflash (params.dflash), samplers (params.samplers), @@ -2821,98 +2791,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } -llm_graph_input_attn_src_kv_iswa * llm_graph_context::build_attn_inp_src_kv_iswa() const { - GGML_ASSERT(src_mctx && "MTP draft graph requires src_mctx (set via llama_set_mtp_source)"); - - const auto * src_iswa = static_cast(src_mctx); - - auto inp = std::make_unique(hparams, cparams, src_iswa); - - inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, src_iswa->get_base(), ubatch, cparams); - inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; - - inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, src_iswa->get_swa(), ubatch, cparams); - inp->self_kq_mask_swa_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; - - inp->self_k_rot = src_iswa->get_base()->build_input_k_rot(ctx0); - inp->self_v_rot = src_iswa->get_base()->build_input_v_rot(ctx0); - inp->self_k_rot_swa = src_iswa->get_swa()->build_input_k_rot(ctx0); - inp->self_v_rot_swa = src_iswa->get_swa()->build_input_v_rot(ctx0); - - return (llm_graph_input_attn_src_kv_iswa *) res->add_input(std::move(inp)); -} - -ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_src_kv_iswa * inp, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * wo_s, - ggml_tensor * q_cur, - ggml_tensor * kq_b, - ggml_tensor * sinks, - ggml_tensor * v_mla, - float kq_scale, - int il_assist, - int il_src) const { - const bool is_swa = hparams.is_swa(il_assist); - - const auto * src_iswa = inp->src_mctx; - const auto * src_cur = is_swa ? src_iswa->get_swa() : src_iswa->get_base(); - - const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); - - auto * k_rot = is_swa ? inp->self_k_rot_swa : inp->self_k_rot; - auto * v_rot = is_swa ? inp->self_v_rot_swa : inp->self_v_rot; - - if (k_rot) { - q_cur = ggml_mul_mat_aux(ctx0, q_cur, k_rot); - } - - ggml_build_forward_expand(gf, q_cur); - - ggml_tensor * q = q_cur; - ggml_tensor * k = src_cur->get_k(ctx0, il_src); - ggml_tensor * v = src_cur->get_v(ctx0, il_src); - - // build_attn_mha splits q across k->ne[3] (the trunk's stream count). When the - // trunk runs kv_unified=false the assistant's ubatch only references a subset - // of streams (one per active draft seq); q->ne[2] is not divisible by the full - // n_stream and the view collapses tokens. Slice k/v down to exactly the streams - // referenced by this ubatch. Requires those streams to form a contiguous range. 
- if (k->ne[3] > 1 && (uint32_t) k->ne[3] != ubatch.n_seqs_unq) { - GGML_ASSERT(ubatch.n_seqs_unq > 0 && ubatch.seq_id_unq); - llama_seq_id min_s = ubatch.seq_id_unq[0]; - llama_seq_id max_s = ubatch.seq_id_unq[0]; - for (uint32_t s = 1; s < ubatch.n_seqs_unq; ++s) { - min_s = std::min(min_s, ubatch.seq_id_unq[s]); - max_s = std::max(max_s, ubatch.seq_id_unq[s]); - } - GGML_ASSERT((uint32_t)(max_s - min_s + 1) == ubatch.n_seqs_unq && - "MTP src-kv attn requires the active draft seq_ids to be contiguous"); - GGML_ASSERT((int64_t) max_s < k->ne[3] && "MTP assistant seq_id beyond trunk stream count"); - - k = ggml_view_4d(ctx0, k, k->ne[0], k->ne[1], k->ne[2], (int64_t) ubatch.n_seqs_unq, - k->nb[1], k->nb[2], k->nb[3], (size_t) min_s * k->nb[3]); - v = ggml_view_4d(ctx0, v, v->ne[0], v->ne[1], v->ne[2], (int64_t) ubatch.n_seqs_unq, - v->nb[1], v->nb[2], v->nb[3], (size_t) min_s * v->nb[3]); - } - - ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il_assist); - cb(cur, "kqv_out", il_assist); - - if (v_rot) { - cur = ggml_mul_mat_aux(ctx0, cur, v_rot); - } - - if (wo) { - cur = build_lora_mm(wo, cur, wo_s); - } - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - return cur; -} - ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, ggml_tensor * wo, diff --git a/src/llama-graph.h b/src/llama-graph.h index 0a4deb699a4..fa77a47321d 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -494,42 +494,6 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { const llama_kv_cache_iswa_context * mctx; }; -// mask-only input for attention against an external (read-only) ISWA KV cache. -// used by MTP draft graphs that attend to the target's KV without owning any. 
-class llm_graph_input_attn_src_kv_iswa : public llm_graph_input_i { -public: - llm_graph_input_attn_src_kv_iswa( - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache_iswa_context * src_mctx) : - hparams(hparams), - cparams(cparams), - src_mctx(src_mctx) { - } - ~llm_graph_input_attn_src_kv_iswa() = default; - - void set_input(const llama_ubatch * ubatch) override; - bool can_reuse(const llm_graph_params & params) override; - - ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } - ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } - - ggml_tensor * self_kq_mask = nullptr; - ggml_tensor * self_kq_mask_cnv = nullptr; - ggml_tensor * self_kq_mask_swa = nullptr; - ggml_tensor * self_kq_mask_swa_cnv = nullptr; - - ggml_tensor * self_k_rot = nullptr; - ggml_tensor * self_v_rot = nullptr; - ggml_tensor * self_k_rot_swa = nullptr; - ggml_tensor * self_v_rot_swa = nullptr; - - const llama_hparams hparams; - const llama_cparams cparams; - - const llama_kv_cache_iswa_context * src_mctx; -}; - class llm_graph_input_attn_cross : public llm_graph_input_i { public: llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {} @@ -672,11 +636,6 @@ struct llm_graph_params { const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; const llama_memory_context_i * mctx; - // per-decode snapshot of an external memory module the graph reads from - // (never writes) — e.g. ctx_dft reading target KV during MTP draft. - // nullptr for a main decode. Rebound inside reuse-aware input classes. 
- const llama_memory_context_i * src_mctx; - const llama_model * src_model; const llama_cross * cross; const llama_dflash * dflash = nullptr; @@ -861,6 +820,7 @@ struct llm_graph_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_layer_nextn; const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; @@ -895,8 +855,6 @@ struct llm_graph_context { const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; const llama_memory_context_i * mctx; - const llama_memory_context_i * src_mctx; - const llama_model * src_model; const llama_cross * cross; const llama_dflash * dflash = nullptr; @@ -1127,24 +1085,6 @@ struct llm_graph_context { float kq_scale, int il) const; - llm_graph_input_attn_src_kv_iswa * build_attn_inp_src_kv_iswa() const; - - // Q-only attention against an external ISWA KV cache (no K/V projections, - // no writes). il_assist labels the attention block in the local graph for - // logging; il_src indexes the source K/V layer to attend to. 
- ggml_tensor * build_attn( - llm_graph_input_attn_src_kv_iswa * inp, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * wo_s, - ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * kq_b, - ggml_tensor * sinks, // [n_head_q] - ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] - float kq_scale, - int il_assist, - int il_src) const; - llm_graph_input_attn_cross * build_attn_inp_cross() const; ggml_tensor * build_attn( diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index e1e49d1cc1f..2bf57687382 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -91,6 +91,10 @@ uint32_t llama_hparams::n_rot(uint32_t il) const { } uint32_t llama_hparams::n_embd_inp() const { + if (n_embd_inp_impl > 0) { + return n_embd_inp_impl; + } + uint32_t n_embd_inp = n_embd; if (n_deepstack_layers > 0) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 7305e4d1563..53cc2d1938d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -185,6 +185,9 @@ struct llama_hparams { // for Classifiers uint32_t n_cls_out = 1; + // input embedding dimension (0 = use n_embd) + uint32_t n_embd_inp_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -224,6 +227,7 @@ struct llama_hparams { // complex mapping. If using deepstack_mapping_arr, also make sure to set // n_deepstack_layers to the number of unique deepstack layers so that // n_embd_imp is accurate (see granite.cpp). 
+ // TODO: can be expressed via the `new n_embd_inp_impl` and remove this param uint32_t n_deepstack_layers = 0; // DFlash draft model diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp index e44004b5586..916ab653756 100644 --- a/src/llama-kv-cache-dsa.cpp +++ b/src/llama-kv-cache-dsa.cpp @@ -32,7 +32,7 @@ llama_kv_cache_dsa::llama_kv_cache_dsa( kv_mla = std::make_unique( model, model.hparams, type_k, type_v, v_trans, offload, unified, kv_size, n_seq_max, n_pad, - n_swa, swa_type, filter, reuse); + n_swa, swa_type, nullptr, filter, reuse, nullptr); // we use llama_kv_cache for caching indexer keys // by hand-tweaking some hparams we fool it to create @@ -49,7 +49,7 @@ llama_kv_cache_dsa::llama_kv_cache_dsa( kv_lid = std::make_unique( model, hparams_lid, type_k, type_v, v_trans, offload, unified, kv_size, n_seq_max, n_pad, - n_swa, swa_type, filter, reuse); + n_swa, swa_type, nullptr, filter, reuse, nullptr); } void llama_kv_cache_dsa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 9b9f1790363..54694d4a7ea 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -23,8 +23,10 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_src, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) { + const layer_reuse_cb & reuse, + const layer_share_cb & share) : hparams(model.hparams), unified(unified) { // chain filters const layer_filter_cb filter_base = [&](int32_t il) { @@ -59,17 +61,27 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); + llama_memory_t mem_src_base = nullptr; + if (mem_src) { + mem_src_base = static_cast(mem_src)->get_base(); + } + + llama_memory_t mem_src_swa = nullptr; + if (mem_src) { + mem_src_swa = static_cast(mem_src)->get_swa(); + } + kv_base = std::make_unique( 
model, hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse); + 0, LLAMA_SWA_TYPE_NONE, mem_src_base, filter_base, reuse, share); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type, filter_swa, reuse); + hparams.n_swa, hparams.swa_type, mem_src_swa, filter_swa, reuse, share); } void llama_kv_cache_iswa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h index 70ab22f0d60..0206dd27e6e 100644 --- a/src/llama-kv-cache-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -25,8 +25,10 @@ class llama_kv_cache_iswa : public llama_memory_i { uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_src, const layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache_iswa() = default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 487a96d7958..585908301d9 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -90,8 +90,10 @@ llama_kv_cache::llama_kv_cache( uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_src, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : + const layer_reuse_cb & reuse, + const layer_share_cb & share) : model(model), hparams(hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 
1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { @@ -160,6 +162,8 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); + other = static_cast(mem_src); + for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); @@ -171,6 +175,24 @@ llama_kv_cache::llama_kv_cache( continue; } + if (share && other) { + const int32_t il_share = share(il); + + if (il_share >= 0) { + const auto & layer_share = other->layers[other->map_layer_ids[il_share]]; + + LLAMA_LOG_WARN("%s: layer %3d: sharing with layer %d. k = %p, v = %p\n", __func__, il, il_share, + layer_share.k->data, layer_share.v->data); + + map_layer_ids[il] = layers.size(); + + layers.push_back(layer_share); + layers.back().il = il; + + continue; + } + } + if (n_embd_head_k_all == 0) { n_embd_head_k_all = (int32_t) hparams.n_embd_head_k(il); } else if (n_embd_head_k_all > 0 && n_embd_head_k_all != (int32_t) hparams.n_embd_head_k(il)) { @@ -347,6 +369,11 @@ void llama_kv_cache::clear(bool data) { } bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return true; + } + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { @@ -410,6 +437,11 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { } void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); @@ -497,6 +529,11 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll } void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + 
return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -519,6 +556,11 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { } void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1"); @@ -564,6 +606,11 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll } void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1"); @@ -598,6 +645,11 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in } llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_min(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -606,6 +658,11 @@ llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { } llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_max(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -746,6 +803,11 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vectorget_sched(); @@ -1021,6 +1083,12 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, } void 
llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + v_cells = other->v_cells; + return; + } + // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -1831,6 +1899,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + GGML_ASSERT(!other); + auto * ctx = res->get_ctx(); auto * gf = res->get_gf(); @@ -1876,6 +1947,11 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co } void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); io.write(&n_stream, sizeof(n_stream)); @@ -1941,6 +2017,11 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla } void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 99f50101956..8ad2412149c 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -98,7 +98,7 @@ class llama_kv_cache : public llama_memory_i { // likely through `struct llama_memory_params` llama_kv_cache( const llama_model & model, - const llama_hparams & hparams, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -109,8 +109,10 @@ class llama_kv_cache : public llama_memory_i { uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + 
llama_memory_t mem_src, const layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache() = default; @@ -264,6 +266,9 @@ class llama_kv_cache : public llama_memory_i { // note: this is not part of the KV state and it's only used to speed-up the find_slot() method std::vector v_heads; + // TODO: temporary until we refactor to be able to share the same cells between 2 kv caches [TAG_KV_CACHE_SHARE_CELLS] + llama_kv_cache * other; + std::vector v_cells; // maps from a sequence id to a stream id diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a242079b406..c7d4bcd413e 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -43,9 +43,11 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_seq_max, n_ubatch, n_pad, + nullptr, filter_attn == nullptr ? [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 66ec3fd6d55..f2d49cbce54 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -44,9 +44,11 @@ llama_memory_hybrid::llama_memory_hybrid( n_pad, n_swa, swa_type, + nullptr, filter_attn == nullptr ? 
[&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( diff --git a/src/llama-memory.h b/src/llama-memory.h index 4ad1612e45b..e3025ec7895 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -23,6 +23,8 @@ struct llama_memory_params { bool swa_full; llama_context_type ctx_type; + + llama_memory_t mem_src; }; enum llama_memory_status { @@ -76,6 +78,8 @@ struct llama_memory_i { // return negative value to indicate that the layer il should not reuse memory using layer_reuse_cb = std::function; + using layer_share_cb = std::function; + virtual ~llama_memory_i() = default; // split the input batch into a set of ubatches and verify that they can fit into the cache diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f443527549e..28112fab1c8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1721,19 +1721,21 @@ void llama_model::print_info() const { if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_embd_out = %u\n", __func__, hparams.n_embd_out()); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); - LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer()).c_str()); - LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_layer_al = %u\n", __func__, hparams.n_layer_all); + LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, 
hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); - LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer()).c_str()); - LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str()); - LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -1741,7 +1743,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); - LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return 
hparams.n_ff(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); @@ -1768,7 +1770,7 @@ void llama_model::print_info() const { [](const auto & entry) { return entry >= 0; })) { LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, - hparams.n_layer()).c_str()); + hparams.n_layer_all).c_str()); } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { @@ -2117,8 +2119,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_recr */ std::move(filter_recr)); } } else { - llama_memory_i::layer_reuse_cb reuse = nullptr; llama_kv_cache::layer_filter_cb filter = nullptr; + llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_share_cb share = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { reuse = [&](uint32_t il) { @@ -2147,20 +2150,53 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); - res = new llama_kv_cache_iswa( - *this, - params.type_k, - params.type_v, - !cparams.flash_attn, - cparams.offload_kqv, - params.swa_full, - cparams.kv_unified, - cparams.n_ctx_seq, - cparams.n_seq_max, - cparams.n_ubatch, - 1, - filter, - reuse); + if (arch == LLM_ARCH_GEMMA4_ASSISTANT) { + llama_memory_t mem_src = llama_get_memory(cparams.ctx_src); + + share = [&](int32_t il) { + const llama_model * model_src = llama_get_model(cparams.ctx_src); + + if (hparams.is_swa(il)) { + return llama_model_n_layer(model_src) - 2; + } + + return llama_model_n_layer(model_src) - 1; + }; + + res = new llama_kv_cache_iswa( + *this, + params.type_k, + 
params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + mem_src, + filter, + reuse, + share); + } else { + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + nullptr, + filter, + reuse, + share); + } } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -2177,7 +2213,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, + nullptr, filter, + nullptr, nullptr); } } diff --git a/src/llama-model.h b/src/llama-model.h index d8c3d0e66b9..82fee8f761b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -710,6 +710,7 @@ const char * llm_type_name(llm_type type); #define LLAMA_LOAD_LOCALS \ const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \ const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \ + const int n_layer_nextn = hparams.n_layer_nextn; GGML_UNUSED(n_layer_nextn); \ const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \ const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \ const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \ diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index 8c274e0cbd3..92c19fac82e 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -1,6 +1,8 @@ #include "models.h" void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { + hparams.n_embd_inp_impl = hparams.n_embd_out(); + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); @@ -9,15 +11,14 @@ void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { hparams.f_attention_scale = 
1.0f; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn <= hparams.n_layer_all && "n_layer_nextn must be <= n_layer_impl"); + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); - - if (hparams.n_layer() == 4) { - type = LLM_TYPE_31B; - } } void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { @@ -38,12 +39,12 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); - const int64_t n_embd_backbone = hparams.n_embd_out(); + const int64_t n_embd_backbone = hparams.n_embd_inp(); nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0); int rope_freqs_flag = 0; - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_nextn; ++i) { auto & layer = layers[i]; const int64_t n_head = hparams.n_head(i); @@ -82,19 +83,7 @@ std::unique_ptr llama_model_gemma4_assistant::build_arch_grap llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(src_mctx && "Gemma 4 assistant graph requires an MTP source (llama_set_mtp_source)"); - GGML_ASSERT(src_model && "Gemma 4 assistant graph requires a source model"); - GGML_ASSERT(src_model->tok_embd && "source model missing tok_embd"); - - const auto & src_hparams = src_model->hparams; - - // By convention the MTP draft reads from the trunk's final SWA and full layers. 
- const int32_t src_layer_full = (int32_t) src_hparams.n_layer() - 1; - const int32_t src_layer_swa = (int32_t) src_hparams.n_layer() - 2; - GGML_ASSERT(!src_hparams.is_swa(src_layer_full) && "trunk's last layer must be full attention"); - GGML_ASSERT( src_hparams.is_swa(src_layer_swa) && "trunk's penultimate layer must be SWA"); - - const int64_t n_embd_backbone = hparams.n_embd_out(); + const int64_t n_embd_backbone = hparams.n_embd_inp(); ggml_tensor * inp_tokens; ggml_tensor * inp_h; @@ -116,7 +105,10 @@ llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ res->add_input(std::move(inp)); } - ggml_tensor * x = ggml_get_rows(ctx0, src_model->tok_embd, inp_tokens); + GGML_ASSERT(cparams.ctx_src != nullptr); + const auto * model_src = llama_get_model(cparams.ctx_src); + + ggml_tensor * x = ggml_get_rows(ctx0, model_src->tok_embd, inp_tokens); x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); cb(x, "inp_embd_target", -1); @@ -126,15 +118,14 @@ llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_pre_proj, xh); cb(cur, "pre_proj", -1); - auto * inp_attn = build_attn_inp_src_kv_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inpL = cur; - for (int il = 0; il < n_layer; ++il) { - const bool is_swa = hparams.is_swa(il); - const int32_t il_src = is_swa ? 
src_layer_swa : src_layer_full; + for (int il = 0; il < n_layer_nextn; ++il) { + const bool is_swa = hparams.is_swa(il); const int64_t n_embd_head = hparams.n_embd_head_k(il); const int64_t n_head = hparams.n_head(il); @@ -157,9 +148,9 @@ llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ cb(Qcur, "Qcur_pos", il); cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, - Qcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il, il_src); + Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer_nextn - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ee2f08a096e..14c4180c0ed 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1,4 +1,3 @@ - #include "server-context.h" #include "server-chat.h" #include "server-common.h" @@ -10,14 +9,14 @@ #include "common.h" #include "fit.h" #include "llama.h" -#include "../../src/llama-ext.h" // staging API: llama_set_mtp_source -#include "ggml-cpp.h" #include "log.h" #include "sampling.h" #include "speculative.h" #include "mtmd.h" #include "mtmd-helper.h" +#include "ggml-cpp.h" + #include #include #include @@ -988,6 +987,8 @@ struct server_context_impl { // note: for small models maybe we can set this to the maximum possible draft from all speculative types // the extra memory for small models is likely negligible? 
cparams.n_rs_seq = 0; + cparams.ctx_src = ctx_tgt; + ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); if (params_base.speculative.dflash) { @@ -1001,6 +1002,7 @@ struct server_context_impl { ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), @@ -1014,6 +1016,7 @@ struct server_context_impl { cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; cparams_mtp.n_rs_seq = 0; cparams_mtp.n_outputs_max = params_base.n_parallel; + cparams_mtp.ctx_src = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { @@ -1021,12 +1024,6 @@ struct server_context_impl { return false; } - // wire the source before any decode (the seq-rm probe below - // triggers sched_reserve which needs src for Gemma4-style MTP) - llama_set_mtp_source(ctx_dft.get(), ctx_tgt); - - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); - params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); } @@ -1114,6 +1111,10 @@ struct server_context_impl { } } + if (ctx_dft) { + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + } + if (spec) { SRV_INF("%s", "speculative decoding context initialized\n"); } else { From 89f00b724adfca7b3163eb5793f0d053c7e78b06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Jun 2026 17:54:50 +0300 Subject: [PATCH 55/71] cont : clean-up --- src/llama-context.cpp | 2 +- src/llama-kv-cache.cpp | 4 ---- src/llama-kv-cache.h | 5 ----- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4bdf2159abf..3a425aeb7e7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -357,7 +357,7 @@ llama_context::llama_context( /*.type_v =*/ params.type_v, /*.swa_full =*/ 
params.swa_full, /*.ctx_type =*/ cparams.ctx_type, - /*.mem_src =*/ params.ctx_src ? params.ctx_src->memory.get() : nullptr, + /*.mem_src =*/ llama_get_memory(cparams.ctx_src), }; memory.reset(model.create_memory(params_mem, cparams)); diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 585908301d9..45d6bce7b24 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -2559,10 +2559,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const { return n_kv; } -llama_pos llama_kv_cache_context::seq_pos_max(llama_seq_id seq_id) const { - return kv->seq_pos_max(seq_id); -} - ggml_type llama_kv_cache_context::type_k() const { return kv->type_k(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 8ad2412149c..fb022da73e5 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -359,11 +359,6 @@ class llama_kv_cache_context : public llama_memory_context_i { uint32_t get_n_kv() const; - // last position recorded in the cache for this sequence; -1 if absent. - // exposed for cross-context KV consumers (e.g. MTP draft) that need to - // anchor the source position without owning a memory module of their own. 
- llama_pos seq_pos_max(llama_seq_id seq_id) const; - ggml_type type_k() const; ggml_type type_v() const; From 5af09f1d33ab5f31dea76215d38405a2e6b89269 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Jun 2026 18:05:14 +0300 Subject: [PATCH 56/71] cont : fix handling of unused tensors --- src/llama-graph.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f7868a5c9d6..11e0dc036f2 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -869,7 +869,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); } - attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) { + attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + } // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { @@ -877,7 +879,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); } - attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); + if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) { + attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (inp_attn->self_k_rot) { attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot); From 1df52f7b309fe04a6065a83e910ac35c70c88096 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Jun 2026 18:39:55 +0300 Subject: [PATCH 57/71] cont : fix undefined --- src/llama-context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3a425aeb7e7..77f5a7eaa65 100644 --- 
a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -134,6 +134,8 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.ctx_src = nullptr; + // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { cparams.ctx_src = params.ctx_src; From 86ef6998b1cdba14597664aac24092b9c4fc64c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 6 Jun 2026 22:25:51 +0200 Subject: [PATCH 58/71] fix typo --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 28112fab1c8..6b44fc1ee3d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1725,7 +1725,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_out = %u\n", __func__, hparams.n_embd_out()); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); - LLAMA_LOG_INFO("%s: n_layer_al = %u\n", __func__, hparams.n_layer_all); + LLAMA_LOG_INFO("%s: n_layer_all = %u\n", __func__, hparams.n_layer_all); LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); From 4278550b65180c094a472508a4cac81f72054c64 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 11:16:43 +0300 Subject: [PATCH 59/71] cont : enable gemma4 graph reuse --- src/models/gemma4.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 1b1443b84b7..585854bfe57 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -155,12 +155,14 @@ class llm_graph_input_logits_bias : public llm_graph_input_i { } 
virtual ~llm_graph_input_logits_bias() = default; - void set_input(const llama_ubatch *) override { + void set_input(const llama_ubatch * /*ubatch*/) override { const int64_t n_vocab = arr.size(); ggml_backend_tensor_set(logits_bias, arr.data(), 0, n_vocab*ggml_element_size(logits_bias)); } - // bool can_reuse(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & /*params*/) override { + return true; + } ggml_tensor * logits_bias = nullptr; // F32 [n_vocab] From 05e89f8b3896d9bd6c95bec81fb5c3c3e78b64a3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 11:24:58 +0300 Subject: [PATCH 60/71] cont : fix assert --- src/models/gemma4-assistant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index 92c19fac82e..f17ea80eec4 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -12,7 +12,7 @@ void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { hparams.f_attention_scale = 1.0f; ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - GGML_ASSERT(hparams.n_layer_nextn <= hparams.n_layer_all && "n_layer_nextn must be <= n_layer_impl"); + GGML_ASSERT(hparams.n_layer_nextn == hparams.n_layer_all && "n_layer_nextn must be == n_layer_impl"); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); From a66b02732cd66450cd33502db635064729df7ba1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 11:39:29 +0300 Subject: [PATCH 61/71] cont : fix quantized cache --- src/llama-kv-cache.cpp | 47 ++++++++++++++++++++------------- tools/server/server-context.cpp | 18 +++++-------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 45d6bce7b24..cca48578cfe 100644 --- a/src/llama-kv-cache.cpp +++ 
b/src/llama-kv-cache.cpp @@ -304,28 +304,37 @@ llama_kv_cache::llama_kv_cache( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); - const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false; - if (attn_rot_disable) { - LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__); - } + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + n_embd_head_k_all = other->n_embd_head_k_all; + n_embd_head_v_all = other->n_embd_head_v_all; - attn_rot_k = - !attn_rot_disable && - n_embd_head_k_all > 0 && - ggml_is_quantized(type_k) && - hparams.n_embd_head_k() % 64 == 0; + attn_rot_k = other->attn_rot_k; + attn_rot_v = other->attn_rot_v; + } else { + const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); + const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false; + if (attn_rot_disable) { + LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__); + } - // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer - if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) { - attn_rot_k = true; - } + attn_rot_k = + !attn_rot_disable && + n_embd_head_k_all > 0 && + ggml_is_quantized(type_k) && + hparams.n_embd_head_k() % 64 == 0; + + // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer + if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) { + attn_rot_k = true; + } - attn_rot_v = - !attn_rot_disable && - n_embd_head_v_all > 0 && - ggml_is_quantized(type_v) && - hparams.n_embd_head_v() % 64 == 0; + attn_rot_v = + !attn_rot_disable && + n_embd_head_v_all > 0 && + ggml_is_quantized(type_v) && + hparams.n_embd_head_v() % 64 == 0; + } LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = 
%d\n", __func__, attn_rot_k, n_embd_head_k_all); LLAMA_LOG_INFO("%s: attn_rot_v = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_v, n_embd_head_v_all); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 14c4180c0ed..4c9ef3c2d9e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -941,23 +941,13 @@ struct server_context_impl { SRV_INF("loading draft model '%s'\n", params_spec.mparams.path.c_str()); - const bool spec_mtp = std::find(params_base.speculative.types.begin(), - params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); - auto params_dft = params_base; params_dft.devices = params_spec.devices; params_dft.model = params_spec.mparams; params_dft.n_gpu_layers = params_spec.n_gpu_layers; - // TODO: find a better way to expose that the cache is shared - if (spec_mtp) { - params_dft.cache_type_k = params_base.cache_type_k; - params_dft.cache_type_v = params_base.cache_type_v; - } else { - params_dft.cache_type_k = params_spec.cache_type_k; - params_dft.cache_type_v = params_spec.cache_type_v; - } + params_dft.cache_type_k = params_spec.cache_type_k; + params_dft.cache_type_v = params_spec.cache_type_v; if (params_spec.cpuparams.n_threads > 0) { params_dft.cpuparams.n_threads = params_spec.cpuparams.n_threads; @@ -976,6 +966,10 @@ struct server_context_impl { auto cparams = common_context_params_to_llama(params_dft); + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } From 7e2848a2e5c2e36b5193faad83a5525893dcb975 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 11:41:15 +0300 Subject: [PATCH 62/71] cont : fix names --- src/llama-arch.cpp | 8 ++++---- src/llama-arch.h | 4 ++-- src/models/gemma4-assistant.cpp | 4 ++-- 3 files 
changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 7a704c971db..2fc556053a6 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -459,8 +459,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, - { LLM_TENSOR_NEXTN_PRE_PROJ, "nextn.pre_projection" }, - { LLM_TENSOR_NEXTN_POST_PROJ, "nextn.post_projection" }, + { LLM_TENSOR_NEXTN_PROJ_PRE, "nextn.pre_projection" }, + { LLM_TENSOR_NEXTN_PROJ_POST, "nextn.post_projection" }, { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, { LLM_TENSOR_DFLASH_FC, "dflash_fc" }, { LLM_TENSOR_DFLASH_HIDDEN_NORM, "dflash_hidden_norm" }, @@ -777,8 +777,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_NEXTN_PRE_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_NEXTN_POST_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_PRE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_POST, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. 
diff --git a/src/llama-arch.h b/src/llama-arch.h index 98027044ae5..0da781aea5d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -565,8 +565,8 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, - LLM_TENSOR_NEXTN_PRE_PROJ, // TODO: rename to PROJ_PRE - LLM_TENSOR_NEXTN_POST_PROJ, // TODO: rename to PROJ_POST + LLM_TENSOR_NEXTN_PROJ_PRE, + LLM_TENSOR_NEXTN_PROJ_POST, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index f17ea80eec4..391bfec7a09 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -40,7 +40,7 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); const int64_t n_embd_backbone = hparams.n_embd_inp(); - nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0); + nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); int rope_freqs_flag = 0; @@ -52,7 +52,7 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { const int64_t n_ff = hparams.n_ff(i); if (i == 0) { - nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight", i), { 2*n_embd_backbone, n_embd }, 0); + nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_PRE, "weight", i), { 2*n_embd_backbone, n_embd }, 0); } layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); From b00c1d6a1b21b6abd24a610cfe537fa7a20ab676 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 11:45:17 +0300 Subject: [PATCH 63/71] cont : fix names --- src/llama-model.h | 4 ++-- src/models/gemma4-assistant.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-model.h b/src/llama-model.h index 
82fee8f761b..12df43a1faf 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -553,8 +553,8 @@ struct llama_model { struct ggml_tensor * output_in_s = nullptr; // NextN/MTP model-level projections - struct ggml_tensor * nextn_pre_proj = nullptr; - struct ggml_tensor * nextn_post_proj = nullptr; + struct ggml_tensor * nextn_proj_pre = nullptr; + struct ggml_tensor * nextn_proj_post = nullptr; // classifier struct ggml_tensor * cls = nullptr; diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index 391bfec7a09..9598ab6e862 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -40,7 +40,7 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); const int64_t n_embd_backbone = hparams.n_embd_inp(); - nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); + nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); int rope_freqs_flag = 0; @@ -52,7 +52,7 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { const int64_t n_ff = hparams.n_ff(i); if (i == 0) { - nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_PRE, "weight", i), { 2*n_embd_backbone, n_embd }, 0); + nextn_proj_pre = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_PRE, "weight", i), { 2*n_embd_backbone, n_embd }, 0); } layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); @@ -115,7 +115,7 @@ llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); cb(xh, "inp_xh", -1); - ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_pre_proj, xh); + ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_proj_pre, xh); cb(cur, "pre_proj", -1); auto * inp_attn = build_attn_inp_kv_iswa(); @@ -191,7 +191,7 @@ 
llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ cb(logits, "result_output", -1); res->t_logits = logits; - ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_post_proj, cur); + ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_proj_post, cur); cb(h_next, "h_nextn", -1); res->t_h_nextn = h_next; From bf6700453ad0bb673b1e373b9e127903f1589b32 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 12:00:06 +0300 Subject: [PATCH 64/71] cont : add reference for draft positions --- common/speculative.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/speculative.cpp b/common/speculative.cpp index b99291a7972..4b038979a5e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -726,6 +726,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { } if (is_mem_shared) { + // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens + // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37 common_batch_add(batch, id, dp.n_past, { seq_id }, true); } else { common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); From 96a14a98bf10b8d8e772a26f8979c463cf7eaf3d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 12:40:55 +0300 Subject: [PATCH 65/71] cont : fix multi-modality --- tools/server/server-context.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 4c9ef3c2d9e..67cd27f51f2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -17,6 +17,9 @@ #include "ggml-cpp.h" +// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING] +#include "../../src/llama-ext.h" + #include #include #include @@ -3077,10 +3080,11 @@ struct server_context_impl { continue; } - if 
(ctx_dft) { + if (ctx_dft && llama_get_ctx_src(ctx_dft.get()) != ctx_tgt) { // TODO: in the future, figure out how to infuse target embeddings to the images // for now, we skip this for simplicity // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? + // [TAG_MTMD_DRAFT_PROCESSING] res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); if (res != 0) { GGML_ABORT("failed to process multi-modal data on draft context\n"); From e10ad044ee28caefdb6e7111814f2c3a2efe0a44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 12:44:14 +0300 Subject: [PATCH 66/71] cont : add comment about ctx_src --- include/llama.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/llama.h b/include/llama.h index 609de510a55..527b8af6b5a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -395,6 +395,8 @@ extern "C" { struct llama_sampler_seq_config * samplers; size_t n_samplers; + // a source/target/parent context + // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts struct llama_context * ctx_src; }; From 024ac5fa7ea0491207a76af835bdb0e72864482f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 12:53:31 +0300 Subject: [PATCH 67/71] cont : clean-up server fit logic --- src/llama-context.cpp | 5 +++++ tools/server/server-context.cpp | 20 ++------------------ 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 77f5a7eaa65..3de9030405e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -138,6 +138,11 @@ llama_context::llama_context( // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + if (params.ctx_src == nullptr) { + // TODO: change from runtime_error to llama_exception to avoid printing error message + throw std::runtime_error("Gemma4Assistant requires ctx_src to be 
set (this is normal during memory fitting)"); + } + cparams.ctx_src = params.ctx_src; } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 67cd27f51f2..3342ea25f03 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -849,27 +849,11 @@ struct server_context_impl { } cparams_dft.n_rs_seq = 0; - bool skip_measure = false; - //TODO: remove this - if (spec_mtp && has_draft) { - struct gguf_init_params meta_params = { - /* .no_alloc = */ true, - /* .ctx = */ nullptr, - }; - gguf_context_ptr meta(gguf_init_from_file(params_dft.model.path.c_str(), meta_params)); - - if (std::string(gguf_get_val_str(meta.get(), gguf_find_key(meta.get(), "general.architecture"))) == "gemma4-assistant") { - skip_measure = true; - SRV_WRN("[spec] skipping --fit memory measurement for Gemma 4 assistant draft model '%s'\n", - params_dft.model.path.c_str()); - } - } - std::vector devs; uint32_t hp_ngl = 0; uint32_t hp_nct = 0; uint32_t hp_nex = 0; - if (!skip_measure) try { + try { auto dmd = common_get_device_memory_data( params_dft.model.path.c_str(), &mparams_dft, &cparams_dft, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); @@ -904,7 +888,7 @@ struct server_context_impl { has_draft ? "draft model" : "MTP context", total / (1024.0 * 1024.0)); } catch (const std::exception & e) { - SRV_ERR("[spec] failed to measure %s memory: %s\n", + SRV_WRN("[spec] failed to measure %s memory: %s\n", has_draft ? 
"draft model" : "MTP context", e.what()); } } From 6caeb6ac33606a626fa1182c6fd810ae3d4fd8f1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 12:56:27 +0300 Subject: [PATCH 68/71] cont : clean-up llama_context --- src/llama-context.cpp | 49 ------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3de9030405e..3f8036bb43f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -30,55 +30,6 @@ static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) { throw std::runtime_error("Unsupported ctx type"); } -static void llama_assert_gemma4_mtp_source_placement( - const llama_context * ctx, - const llama_context * src) { - if (!ctx || !src) { - return; - } - - const auto & model_dft = ctx->get_model(); - const auto & model_tgt = src->get_model(); - - if (model_dft.arch != LLM_ARCH_GEMMA4_ASSISTANT || model_tgt.arch != LLM_ARCH_GEMMA4) { - return; - } - - if (model_tgt.split_mode() == LLAMA_SPLIT_MODE_TENSOR) { - return; - } - - const auto & hparams_dft = model_dft.hparams; - const auto & hparams_tgt = model_tgt.hparams; - - const int32_t il_tgt_full = (int32_t) hparams_tgt.n_layer() - 1; - const int32_t il_tgt_swa = (int32_t) hparams_tgt.n_layer() - 2; - - ggml_backend_dev_t dev_cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (!dev_cpu) { - throw std::runtime_error("Gemma 4 assistant MTP placement check failed: no CPU backend found"); - } - - const bool kv_offload = src->get_cparams().offload_kqv; - - for (uint32_t il_dft = 0; il_dft < hparams_dft.n_layer(); ++il_dft) { - const int32_t il_tgt = hparams_dft.is_swa(il_dft) ? il_tgt_swa : il_tgt_full; - - ggml_backend_dev_t dev_dft = model_dft.dev_layer(il_dft); - ggml_backend_dev_t dev_kv = kv_offload ? 
model_tgt.dev_layer(il_tgt) : dev_cpu; - - if (dev_dft != dev_kv) { - throw std::runtime_error(format( - "Gemma 4 assistant MTP placement mismatch: draft layer %d is on %s, " - "but shared target KV layer %d is on %s", - (int) il_dft, - ggml_backend_dev_name(dev_dft), - (int) il_tgt, - ggml_backend_dev_name(dev_kv))); - } - } -} - llama_context::llama_context( const llama_model & model, llama_context_params params) : From e41c9b01e0ae108130d715b45ccc87832faf5991 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 13:05:46 +0300 Subject: [PATCH 69/71] py : fix names --- gguf-py/gguf/constants.py | 12 ++++++------ gguf-py/gguf/tensor_mapping.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b48bc0bcb8f..bd6246137b0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -898,8 +898,8 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp - NEXTN_PRE_PROJ = auto() - NEXTN_POST_PROJ = auto() + NEXTN_PROJ_PRE = auto() + NEXTN_PROJ_POST = auto() NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() NEXTN_ENORM = auto() @@ -1475,8 +1475,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down", MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm", # NextN/MTP - MODEL_TENSOR.NEXTN_PRE_PROJ: "nextn.pre_projection", - MODEL_TENSOR.NEXTN_POST_PROJ: "nextn.post_projection", + MODEL_TENSOR.NEXTN_PROJ_PRE: "nextn.pre_projection", + MODEL_TENSOR.NEXTN_PROJ_POST: "nextn.post_projection", MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", @@ -2587,8 +2587,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.NEXTN_PRE_PROJ, - MODEL_TENSOR.NEXTN_POST_PROJ, + MODEL_TENSOR.NEXTN_PROJ_PRE, + 
MODEL_TENSOR.NEXTN_PROJ_POST, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q_NORM, MODEL_TENSOR.ATTN_OUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 34feade0783..a9537983de1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2367,11 +2367,11 @@ class TensorNameMap: ), # NextN/MTP tensors - MODEL_TENSOR.NEXTN_PRE_PROJ: ( + MODEL_TENSOR.NEXTN_PROJ_PRE: ( "pre_projection", ), - MODEL_TENSOR.NEXTN_POST_PROJ: ( + MODEL_TENSOR.NEXTN_PROJ_POST: ( "post_projection", ), From 0f2f35a67b892c5a0c093bef9fa963bd1f329628 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 Jun 2026 13:24:18 +0300 Subject: [PATCH 70/71] cont : rename ctx_src -> ctx_other --- common/speculative.cpp | 2 +- include/llama.h | 2 +- src/llama-context.cpp | 24 ++++++++++++------------ src/llama-cparams.h | 2 +- src/llama-ext.h | 2 +- src/llama-kv-cache-iswa.cpp | 18 +++++++++--------- src/llama-kv-cache-iswa.h | 2 +- src/llama-kv-cache.cpp | 4 ++-- src/llama-kv-cache.h | 2 +- src/llama-memory.h | 2 +- src/llama-model.cpp | 10 +++++----- src/models/gemma4-assistant.cpp | 6 +++--- tools/server/server-context.cpp | 8 ++++---- 13 files changed, 42 insertions(+), 42 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 4b038979a5e..8880add5ea7 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -496,7 +496,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); - is_mem_shared = llama_get_ctx_src(ctx_dft) == ctx_tgt; + is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt; pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); diff --git a/include/llama.h b/include/llama.h index 527b8af6b5a..6da9e995373 100644 --- a/include/llama.h +++ b/include/llama.h @@ -397,7 +397,7 @@ extern "C" { // a source/target/parent context // can be utilized in various 
ways, for example by sharing results or llama_memory between 2 contexts - struct llama_context * ctx_src; + struct llama_context * ctx_other; }; struct llama_model_tensor_override { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3f8036bb43f..8b1aba82d9d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -85,16 +85,16 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - cparams.ctx_src = nullptr; + cparams.ctx_other = nullptr; // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { - if (params.ctx_src == nullptr) { + if (params.ctx_other == nullptr) { // TODO: change from runtime_error to llama_exception to avoid printing error message - throw std::runtime_error("Gemma4Assistant requires ctx_src to be set (this is normal during memory fitting)"); + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); } - cparams.ctx_src = params.ctx_src; + cparams.ctx_other = params.ctx_other; } // Initialize backend samplers here so they are part of the sampling graph @@ -311,11 +311,11 @@ llama_context::llama_context( // init the memory module if (!hparams.vocab_only) { llama_memory_params params_mem = { - /*.type_k =*/ params.type_k, - /*.type_v =*/ params.type_v, - /*.swa_full =*/ params.swa_full, - /*.ctx_type =*/ cparams.ctx_type, - /*.mem_src =*/ llama_get_memory(cparams.ctx_src), + /*.type_k =*/ params.type_k, + /*.type_v =*/ params.type_v, + /*.swa_full =*/ params.swa_full, + /*.ctx_type =*/ cparams.ctx_type, + /*.mem_other =*/ llama_get_memory(cparams.ctx_other), }; memory.reset(model.create_memory(params_mem, cparams)); @@ -3595,7 +3595,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, - /*.ctx_src =*/ nullptr, + /*.ctx_other =*/ nullptr, }; return result; @@ -4235,6 +4235,6 @@ llama_memory_breakdown 
llama_get_memory_breakdown(const struct llama_context * c return ctx->memory_breakdown(); } -llama_context * llama_get_ctx_src(struct llama_context * ctx) { - return ctx->get_cparams().ctx_src; +llama_context * llama_get_ctx_other(struct llama_context * ctx) { + return ctx->get_cparams().ctx_other; } diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 2e0cab5a86c..7e324dbebf1 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -51,5 +51,5 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; - llama_context * ctx_src; + llama_context * ctx_other; }; diff --git a/src/llama-ext.h b/src/llama-ext.h index da95cdb9cea..bd74544129b 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -101,4 +101,4 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); -LLAMA_API llama_context * llama_get_ctx_src(struct llama_context * ctx); +LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 54694d4a7ea..aa1b1b72ebe 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -23,7 +23,7 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, - llama_memory_t mem_src, + llama_memory_t mem_other, const layer_filter_cb & filter, const layer_reuse_cb & reuse, const layer_share_cb & share) : hparams(model.hparams), unified(unified) { @@ -61,27 +61,27 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); - llama_memory_t mem_src_base = nullptr; - if (mem_src) { - mem_src_base = static_cast(mem_src)->get_base(); + llama_memory_t mem_other_base = nullptr; + if (mem_other) { + mem_other_base = 
static_cast(mem_other)->get_base(); } - llama_memory_t mem_src_swa = nullptr; - if (mem_src) { - mem_src_swa = static_cast(mem_src)->get_swa(); + llama_memory_t mem_other_swa = nullptr; + if (mem_other) { + mem_other_swa = static_cast(mem_other)->get_swa(); } kv_base = std::make_unique( model, hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE, mem_src_base, filter_base, reuse, share); + 0, LLAMA_SWA_TYPE_NONE, mem_other_base, filter_base, reuse, share); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type, mem_src_swa, filter_swa, reuse, share); + hparams.n_swa, hparams.swa_type, mem_other_swa, filter_swa, reuse, share); } void llama_kv_cache_iswa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h index 0206dd27e6e..dfafc1ef510 100644 --- a/src/llama-kv-cache-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -25,7 +25,7 @@ class llama_kv_cache_iswa : public llama_memory_i { uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, - llama_memory_t mem_src, + llama_memory_t mem_other, const layer_filter_cb & filter, const layer_reuse_cb & reuse, const layer_share_cb & share); diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index cca48578cfe..8d53bf0ef44 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -90,7 +90,7 @@ llama_kv_cache::llama_kv_cache( uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, - llama_memory_t mem_src, + llama_memory_t mem_other, const layer_filter_cb & filter, const layer_reuse_cb & reuse, const layer_share_cb & share) : @@ -162,7 +162,7 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); - other = static_cast(mem_src); + other = static_cast(mem_other); for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { diff 
--git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index fb022da73e5..f5ace6ae350 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -109,7 +109,7 @@ class llama_kv_cache : public llama_memory_i { uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, - llama_memory_t mem_src, + llama_memory_t mem_other, const layer_filter_cb & filter, const layer_reuse_cb & reuse, const layer_share_cb & share); diff --git a/src/llama-memory.h b/src/llama-memory.h index e3025ec7895..db825396645 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -24,7 +24,7 @@ struct llama_memory_params { llama_context_type ctx_type; - llama_memory_t mem_src; + llama_memory_t mem_other; }; enum llama_memory_status { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6b44fc1ee3d..e0760b84ceb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2151,16 +2151,16 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, GGML_ASSERT(hparams.is_swa_any()); if (arch == LLM_ARCH_GEMMA4_ASSISTANT) { - llama_memory_t mem_src = llama_get_memory(cparams.ctx_src); + llama_memory_t mem_other = llama_get_memory(cparams.ctx_other); share = [&](int32_t il) { - const llama_model * model_src = llama_get_model(cparams.ctx_src); + const llama_model * model_other = llama_get_model(cparams.ctx_other); if (hparams.is_swa(il)) { - return llama_model_n_layer(model_src) - 2; + return llama_model_n_layer(model_other) - 2; } - return llama_model_n_layer(model_src) - 1; + return llama_model_n_layer(model_other) - 1; }; res = new llama_kv_cache_iswa( @@ -2175,7 +2175,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, cparams.n_ubatch, 1, - mem_src, + mem_other, filter, reuse, share); diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp index 9598ab6e862..5b7a25a5aba 100644 --- a/src/models/gemma4-assistant.cpp +++ b/src/models/gemma4-assistant.cpp @@ -105,10 +105,10 @@ 
llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_ res->add_input(std::move(inp)); } - GGML_ASSERT(cparams.ctx_src != nullptr); - const auto * model_src = llama_get_model(cparams.ctx_src); + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); - ggml_tensor * x = ggml_get_rows(ctx0, model_src->tok_embd, inp_tokens); + ggml_tensor * x = ggml_get_rows(ctx0, model_other->tok_embd, inp_tokens); x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); cb(x, "inp_embd_target", -1); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 3342ea25f03..66fa336ff45 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -967,8 +967,8 @@ struct server_context_impl { // note: for small models maybe we can set this to the maximum possible draft from all speculative types // the extra memory for small models is likely negligible? - cparams.n_rs_seq = 0; - cparams.ctx_src = ctx_tgt; + cparams.n_rs_seq = 0; + cparams.ctx_other = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); @@ -997,7 +997,7 @@ struct server_context_impl { cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; cparams_mtp.n_rs_seq = 0; cparams_mtp.n_outputs_max = params_base.n_parallel; - cparams_mtp.ctx_src = ctx_tgt; + cparams_mtp.ctx_other = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { @@ -3064,7 +3064,7 @@ struct server_context_impl { continue; } - if (ctx_dft && llama_get_ctx_src(ctx_dft.get()) != ctx_tgt) { + if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) { // TODO: in the future, figure out how to infuse target embeddings to the images // for now, we skip this for simplicity // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? 
From 5e6dff22613f4db63e3a5624e2b86e90ac3a6e82 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 7 Jun 2026 13:51:31 +0200 Subject: [PATCH 71/71] chore(sync): drop intermediate llama_set_mtp_source call The first PR #23398 commit added an `llama_set_mtp_source(ctx_dft, ctx_tgt)` call after `llama_init_from_model`. Later cleanup commits in the same PR removed that API and moved the wiring to `cparams.ctx_other = ctx_tgt` set BEFORE init. Our keep-both resolution carried the intermediate call forward; this drops it to match the PR's final API. Drops 1 use of removed symbol, no behavior change (the rebased cparams.ctx_other assignment is what's actually used). --- tools/server/server-context.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 66fa336ff45..21b2cceeb85 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -976,10 +976,9 @@ struct server_context_impl { llama_set_dflash(ctx_tgt, model_dft.get()); } - if (spec_mtp) { - // MTP draft must know its target before the first decode - llama_set_mtp_source(ctx_dft.get(), ctx_tgt); - } + // note: MTP target wiring uses cparams.ctx_other set before + // llama_init_from_model above — no explicit call needed here. + (void) spec_mtp; ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get());