-
Notifications
You must be signed in to change notification settings - Fork 20k
Add Kimi-K2.5 support #19170
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Kimi-K2.5 support #19170
Changes from 14 commits
042c3cb
a4c9a08
9c44981
9b14cb8
37a386d
b1cf34e
f13b383
be1b0c3
052fda6
0c50dd9
d0d1062
c895365
7b4af22
16010cb
c5de0ef
2d7c44a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -655,6 +655,11 @@ ggml_tensor * clip_graph::build_rope_2d( | |
| const int64_t n_head = cur->ne[1]; | ||
| const int64_t n_pos = cur->ne[2]; | ||
|
|
||
| // Ensure input is contiguous (needed when using merged QKV with ggml_view) | ||
| if (!ggml_is_contiguous(cur)) { | ||
| cur = ggml_cont(ctx0, cur); | ||
| } | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since #19338 was only merged a few hours ago, I didn't have that one merged into this branch. I'll merge master and retry without that.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @CISC I merged. Before, working: After, broken: And the text output is now seeing the image as garbled.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, so we have another bug. @ORippler @JohannesGaessler @jeffbolznv Mind testing on Vulkan?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tested one more time with Correct text output. So it's definitely the
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you try this patch: diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index dae17c6fb..422a0e410 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -655,11 +655,6 @@ ggml_tensor * clip_graph::build_rope_2d(
const int64_t n_head = cur->ne[1];
const int64_t n_pos = cur->ne[2];
- // Ensure input is contiguous (needed when using merged QKV with ggml_view)
- if (!ggml_is_contiguous(cur)) {
- cur = ggml_cont(ctx0, cur);
- }
-
// for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
// we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
// first half of cur will use 1e-0, 1e-2 (even)
@@ -677,8 +672,8 @@ ggml_tensor * clip_graph::build_rope_2d(
{
first = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
- ggml_row_size(cur->type, n_dim),
- ggml_row_size(cur->type, n_dim*n_head),
+ cur->nb[1],
+ cur->nb[2],
0);
first = ggml_rope_ext(
ctx0,
@@ -696,8 +691,8 @@ ggml_tensor * clip_graph::build_rope_2d(
{
second = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
- ggml_row_size(cur->type, n_dim),
- ggml_row_size(cur->type, n_dim*n_head),
+ cur->nb[1],
+ cur->nb[2],
n_dim/2 * ggml_element_size(cur));
second = ggml_rope_ext(
ctx0,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, I'll give it a shot in a few hours once I'm back home from the office.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
@CISC Just repeating myself earlier, but this is the first model to use the Other models seem to use the combo
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Sure, I meant
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No CPU only, no ggml_cont. The patch for the gpu w/ patch, no ggml_cont. I can update this PR with that fix applied @ggerganov |
||
| // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) | ||
| // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 | ||
| // first half of cur will use 1e-0, 1e-2 (even) | ||
|
|
@@ -825,6 +830,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 | |
| { | ||
| builder = std::make_unique<clip_graph_kimivl>(ctx, img); | ||
| } break; | ||
| case PROJECTOR_TYPE_KIMIK25: | ||
| { | ||
| builder = std::make_unique<clip_graph_kimik25>(ctx, img); | ||
| } break; | ||
| case PROJECTOR_TYPE_COGVLM: | ||
| { | ||
| builder = std::make_unique<clip_graph_cogvlm>(ctx, img); | ||
|
|
@@ -1139,6 +1148,22 @@ struct clip_model_loader { | |
| hparams.set_limit_image_tokens(8, 1024); | ||
| hparams.set_warmup_n_tokens(256); // avoid OOM on warmup | ||
| } break; | ||
| case PROJECTOR_TYPE_KIMIK25: | ||
| { | ||
| hparams.rope_theta = 10000.0f; | ||
| get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); | ||
|
|
||
| int min_pixels = 0, max_pixels = 0; | ||
| get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false); | ||
| get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false); | ||
| if (min_pixels > 0 && max_pixels > 0) { | ||
| hparams.image_min_pixels = min_pixels; | ||
| hparams.image_max_pixels = max_pixels; | ||
| hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels)); | ||
| } else { | ||
| hparams.set_limit_image_tokens(2, 4096); | ||
| } | ||
| } break; | ||
| case PROJECTOR_TYPE_GEMMA3: | ||
| { | ||
| // default value (used by all model sizes in gemma 3 family) | ||
|
|
@@ -1668,6 +1693,7 @@ struct clip_model_loader { | |
| model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); | ||
| } break; | ||
| case PROJECTOR_TYPE_KIMIVL: | ||
| case PROJECTOR_TYPE_KIMIK25: | ||
| { | ||
| model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); | ||
| model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); | ||
|
|
@@ -3039,6 +3065,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str | |
| res_imgs->entries.push_back(std::move(res)); | ||
| } break; | ||
|
|
||
| case PROJECTOR_TYPE_KIMIK25: | ||
| { | ||
| GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); | ||
| const clip_image_size target_size = img_tool::calc_size_preserved_ratio( | ||
| original_size, | ||
| params.patch_size * params.n_merge, | ||
| params.image_min_pixels, | ||
| params.image_max_pixels); | ||
| const std::array<uint8_t, 3> pad_color = {0, 0, 0}; | ||
|
|
||
| clip_image_u8 resized_img; | ||
| img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color); | ||
| clip_image_f32_ptr res(clip_image_f32_init()); | ||
| normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); | ||
| res_imgs->entries.push_back(std::move(res)); | ||
| } break; | ||
|
|
||
| case PROJECTOR_TYPE_MLP: | ||
| case PROJECTOR_TYPE_MLP_NORM: | ||
| case PROJECTOR_TYPE_LDP: | ||
|
|
@@ -3247,6 +3290,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im | |
| } break; | ||
| case PROJECTOR_TYPE_LFM2: | ||
| case PROJECTOR_TYPE_KIMIVL: | ||
| case PROJECTOR_TYPE_KIMIK25: | ||
| { | ||
| // dynamic size | ||
| int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; | ||
|
|
@@ -3588,6 +3632,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima | |
| } break; | ||
| case PROJECTOR_TYPE_PIXTRAL: | ||
| case PROJECTOR_TYPE_KIMIVL: | ||
| case PROJECTOR_TYPE_KIMIK25: | ||
| case PROJECTOR_TYPE_LIGHTONOCR: | ||
| { | ||
| // set the 2D positions | ||
|
|
@@ -3724,6 +3769,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima | |
| ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); | ||
| } | ||
|
|
||
| // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set | ||
| if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) { | ||
| const int64_t n_embd = embeddings->ne[0]; | ||
| const int64_t n_tokens = embeddings->ne[1]; | ||
| std::vector<float> emb_data(n_embd * n_tokens); | ||
| ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings)); | ||
|
|
||
| LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n"); | ||
| LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens); | ||
|
|
||
| // Print first few values of first token | ||
| LOG_INF("Token 0 (first 16 values): "); | ||
| for (int i = 0; i < std::min((int64_t)16, n_embd); i++) { | ||
| LOG_INF("%.6f ", emb_data[i]); | ||
| } | ||
| LOG_INF("\n"); | ||
|
|
||
| // Print last few values of first token | ||
| if (n_embd > 16) { | ||
| LOG_INF("Token 0 (last 16 values): "); | ||
| for (int64_t i = n_embd - 16; i < n_embd; i++) { | ||
| LOG_INF("%.6f ", emb_data[i]); | ||
| } | ||
| LOG_INF("\n"); | ||
| } | ||
|
|
||
| // Compute and print statistics | ||
| float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0]; | ||
| for (size_t i = 0; i < emb_data.size(); i++) { | ||
| sum += emb_data[i]; | ||
| sum_sq += emb_data[i] * emb_data[i]; | ||
| min_val = std::min(min_val, emb_data[i]); | ||
| max_val = std::max(max_val, emb_data[i]); | ||
| } | ||
| float mean = sum / emb_data.size(); | ||
| float variance = (sum_sq / emb_data.size()) - (mean * mean); | ||
| LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n", | ||
| mean, sqrtf(variance), min_val, max_val, sum); | ||
| LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n"); | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -3770,6 +3856,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { | |
| return ctx->model.mm_2_w->ne[1]; | ||
| case PROJECTOR_TYPE_LFM2: | ||
| case PROJECTOR_TYPE_KIMIVL: | ||
| case PROJECTOR_TYPE_KIMIK25: | ||
| return ctx->model.mm_2_w->ne[1]; | ||
| case PROJECTOR_TYPE_COGVLM: | ||
| return ctx->model.mm_4h_to_h_w->ne[1]; | ||
|
|
||

Uh oh!
There was an error while loading. Please reload this page.