Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 116 additions & 9 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
self.ftype = gguf.LlamaFileType.MOSTLY_F16
logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")

self.dequant_model()

# Configure GGUF Writer
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
Expand Down Expand Up @@ -527,6 +525,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
return ()

def prepare_tensors(self):
self.dequant_model()

Comment thread
ngxson marked this conversation as resolved.
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
if self.tensor_map.mapping:
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
Expand Down Expand Up @@ -1812,7 +1812,7 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]

n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]

has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
Expand Down Expand Up @@ -1867,7 +1867,15 @@ def __init__(self, *args, **kwargs):
preprocessor_config_path = self.dir_model / "preprocessor_config.json"
if preprocessor_config_path.is_file():
with open(preprocessor_config_path, "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
cfg = json.load(f)
# move media_proc_cfg to root level for compat
if "media_proc_cfg" in cfg:
cfg = {
**cfg,
**cfg["media_proc_cfg"],
}
# merge configs
self.preprocessor_config = {**self.preprocessor_config, **cfg}
Comment thread
AesSedai marked this conversation as resolved.

# prefer processor_config.json if possible
processor_config_path = self.dir_model / "processor_config.json"
Expand Down Expand Up @@ -1916,10 +1924,10 @@ def set_gguf_parameters(self):
self.image_size = self.find_vparam(["image_size"])
self.gguf_writer.add_vision_image_size(self.image_size)
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))

# preprocessor config
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
Expand Down Expand Up @@ -7579,6 +7587,7 @@ def prepare_tensors(self):
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"KimiVLForConditionalGeneration",
"KimiK25ForConditionalGeneration",
"YoutuForCausalLM",
"YoutuVLForConditionalGeneration",
)
Expand Down Expand Up @@ -7697,8 +7706,8 @@ def set_gguf_parameters(self):
_experts: list[dict[str, Tensor]] | None = None

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors and remove "language_model." for Kimi-VL
if "vision_tower" in name or "multi_modal_projector" in name:
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
return
if name.startswith("siglip2.") or name.startswith("merger."):
return
Expand Down Expand Up @@ -11060,6 +11069,104 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("KimiK25ForConditionalGeneration")
class KimiK25Model(MmprojModel):
    """Kimi-K2.5 with MoonViT3d vision encoder.

    Converts only the vision tower and the multimodal projector tensors;
    every other tensor is skipped by ``modify_tensors``.
    """

    # Keys under which the vision attention head count may be stored. Kept in
    # sync with the keys used by the base class when it writes
    # vision_head_count (Kimi-K2.5 uses the "vt_"-prefixed spelling).
    _N_HEAD_KEYS = ("num_attention_heads", "num_heads", "vt_num_attention_heads")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config"

        self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2]))
        self.patch_size = self.hparams_vision.get("patch_size", 14)

        # Set image_size for compatibility with base class
        # Use position embedding dimensions as image_size reference
        pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64)
        self.hparams_vision["image_size"] = pos_emb_h * self.patch_size

    def set_gguf_parameters(self):
        """Write the Kimi-K2.5 specific vision/projector metadata to the GGUF writer."""
        # Base class MmprojModel.set_gguf_parameters() already writes:
        # - vision_block_count, vision_head_count, vision_embedding_length
        # - vision_feed_forward_length, vision_patch_size, image_mean, image_std
        # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25)

        # Position embedding parameters (for interpolation)
        self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64))
        self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64))
        self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4))

        # Projector parameters
        self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu")
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
        self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])

        # Image size limits
        # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet)
        in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384)
        min_patches = 8  # reasonable minimum
        pixels_per_patch = self.patch_size ** 2
        self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
        self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)

    @staticmethod
    def _permute_kqv(weights: Tensor, n_head: int) -> Tensor:
        """Reorder the rows of a Q or K projection from interleaved to split RoPE layout.

        The leading dimension is viewed as ``(n_head, head_dim // 4, 2, 2)`` and
        the second and third of those axes are swapped. Any trailing dimensions
        (e.g. the input dimension of a weight matrix) are left untouched, so the
        same helper serves 2-D weights and 1-D biases.

        Args:
            weights: tensor whose first dimension is ``n_head * head_dim``.
            n_head: number of attention heads.

        Returns:
            A tensor of the same shape as ``weights`` with permuted rows.

        Raises:
            ValueError: if the first dimension is not a multiple of ``4 * n_head``.
        """
        out_dim, *rest = weights.shape
        if n_head <= 0 or out_dim % (4 * n_head) != 0:
            raise ValueError(
                f"cannot permute Q/K tensor: leading dim {out_dim} is not a multiple of 4 * n_head ({n_head})"
            )
        head_dim = out_dim // n_head
        w = weights.reshape(n_head, head_dim // 4, 2, 2, *rest)
        # swap axes 1 and 2, keep every trailing axis in place
        w = w.permute(0, 2, 1, 3, *range(4, 4 + len(rest)))
        return w.reshape(out_dim, *rest)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Transform one checkpoint tensor and yield the resulting (name, tensor) pairs.

        Tensors whose name contains neither "vision_tower" nor "mm_projector"
        are skipped (nothing is yielded).
        """
        # Only process vision and projector tensors
        is_vision = any(x in name for x in ["vision_tower", "mm_projector"])

        if not is_vision:
            return

        assert self.hparams_vision is not None
        # Look the head count up under every known spelling before falling back
        # to the default; the config of this model uses "vt_"-prefixed keys.
        n_head = next(
            (self.hparams_vision[k] for k in self._N_HEAD_KEYS if k in self.hparams_vision),
            16,
        )

        # Permute Q/K weights/biases from interleaved to split RoPE format
        # This allows using build_rope_2d at runtime without post-permutation.
        if "wqkv" in name and ("weight" in name or "bias" in name):
            # merged QKV: three equally sized chunks stacked along dim 0
            qkv_dim = data_torch.shape[0] // 3
            q = data_torch[:qkv_dim]
            k = data_torch[qkv_dim:2 * qkv_dim]
            v = data_torch[2 * qkv_dim:]
            q = self._permute_kqv(q, n_head)
            k = self._permute_kqv(k, n_head)
            # V is not rotated, so it is passed through unchanged
            data_torch = torch.cat([q, k, v], dim=0)

        # Temporal embeddings: (T, 1, C) → (T, C)
        if "pos_emb.time_weight" in name:
            T, _, C = data_torch.shape
            data_torch = data_torch.reshape(T, C)

        # PatchMergerMLP tensor name mapping
        # proj.0.weight → proj.linear_1.weight
        # proj.2.weight → proj.linear_2.weight
        if "mm_projector.proj.0." in name:
            name = name.replace(".proj.0.", ".proj.linear_1.")
        elif "mm_projector.proj.2." in name:
            name = name.replace(".proj.2.", ".proj.linear_2.")

        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("CogVLMForCausalLM")
class CogVLMVisionModel(MmprojModel):

Expand Down
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3704,6 +3704,7 @@ class VisionProjectorType:
VOXTRAL = "voxtral"
LFM2 = "lfm2"
KIMIVL = "kimivl"
KIMIK25 = "kimik25"
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,6 +1296,7 @@ class TensorNameMap:

MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"mm_projector.proj.linear_{bid}", # Kimi-K2.5
"visual.merger.mlp.{bid}", # qwen2vl
"merger.mlp.{bid}",
),
Expand Down Expand Up @@ -1357,6 +1358,7 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
"vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
),

MODEL_TENSOR.V_ENC_ATTN_Q: (
Expand Down Expand Up @@ -1531,6 +1533,7 @@ class TensorNameMap:
"multi_modal_projector.norm",
"multi_modal_projector.layer_norm",
"multi_modal_projector.pre_norm",
"mm_projector.pre_norm", # Kimi-K2.5
"pre_mm_projector_norm",
"model.vision.linear_proj.norm1", # cogvlm
"merger.ln_q",
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ add_library(mtmd
models/glm4v.cpp
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp
Expand Down
2 changes: 2 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ enum projector_type {
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_UNKNOWN,
};

Expand Down Expand Up @@ -268,6 +269,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
Expand Down
87 changes: 87 additions & 0 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,11 @@ ggml_tensor * clip_graph::build_rope_2d(
const int64_t n_head = cur->ne[1];
const int64_t n_pos = cur->ne[2];

// Ensure input is contiguous (needed when using merged QKV with ggml_view)
if (!ggml_is_contiguous(cur)) {
cur = ggml_cont(ctx0, cur);
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure #19299/#19338 didn't fix this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since #19338 only merged a few hours ago, I didn't have that one merged into this branch. I'll merge master and retry without that ggml_cont. Thanks for the callout!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@CISC I merged master locally, recompiled, and tested w/o the ggml_cont and the embeddings are different on my raccoon image test.

./build/bin/llama-mtmd-cli -m /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf --chat-template /mnt/srv/snowdrift/fp16/Kimi-K2.5/chat_template.jinja --jinja --mmproj /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/mmproj-Kimi-K2.5-F16.gguf --n-gpu-layers 999 --threads 54 --override-tensor "blk\..*_exps\.=CPU" --flash-attn on --image Kimi-K2.5/raccoon.png -p "Describe the contents of this image" --seed 42 --no-warmup

Before, working:

Token 0 (first 16 values): -0.022020 -0.102026 -0.162646 0.098094 -0.076318 -0.260308 -0.064937 0.102250 0.065213 0.213322 -0.105310 -0.137638 -0.101015 -0.098639 -0.135186 0.053588 
Token 0 (last 16 values):  -0.017177 0.046693 0.107003 0.026340 -0.026663 -0.013917 0.131319 -0.144277 -0.192712 0.093361 -0.096736 0.163151 0.216722 0.159008 -0.039639 0.178694 
Stats: mean=0.000457, std=0.078247, min=-4.923960, max=8.580477, sum=6362.352051

After, broken:

Token 0 (first 16 values): -0.004227 0.017700 0.045288 0.014412 0.019241 -0.000450 -0.011505 0.052048 -0.007355 -0.014572 0.018112 -0.027519 -0.005363 -0.020798 0.010681 -0.027069 
Token 0 (last 16 values):  -0.025635 0.009247 -0.005234 0.031509 -0.006050 -0.023453 -0.008224 -0.012344 -0.011505 -0.009567 -0.009521 -0.001862 0.034729 -0.016602 -0.026917 -0.002815 
Stats: mean=0.000353, std=0.068569, min=-1.949219, max=1.311523, sum=4924.728027
And the text output now describes the image as if it were garbled:
<think>The user wants me to describe the contents of the image. Looking at the image, I see a repeating pattern of what appears to be a stylized logo or emblem. 

The pattern consists of:
1. A circular gear-like or cog-like shape with teeth around the perimeter
2. Inside the circle, there appears to be a stylized design that looks like a DNA double helix or some kind of molecular structure in the center
3. The colors seem to be shades of blue, purple, and possibly some green or teal, with a gradient effect
4. The pattern is tiled across the entire image in a grid formation
5. The background appears to be dark (black or very dark gray)

The design looks like it could be related to:
- Bioinformatics
- DNA/genetics
- Engineering/mechanical gears
- A specific software logo or institutional logo

The image is essentially a wallpaper or background pattern made up of this repeating emblem. The emblem has a technical, scientific appearance combining mechanical (gear) and biological (DNA helix) elements.

I should describe this clearly and objectively, noting the repeating pattern, the gear-like shapes, the central helix design, and the color scheme.</think>The image displays a **repeating tiled pattern** consisting of a stylized emblem arranged in a grid across a dark background.

Each individual emblem features:
- **An outer gear or cog-like ring** with evenly spaced teeth around the circumference, rendered in shades of blue and cyan
- **A central design** resembling a DNA double helix or molecular structure, depicted in purple, pink, and light blue tones
- **A gradient color scheme** transitioning from cyan/blue on the outer ring to purple/pink in the center

The pattern creates a wallpaper-like effect with rows of these circular logos aligned both horizontally and vertically, filling the entire frame. The overall aesthetic suggests themes of **bioinformatics, biotechnology, genetics, or synthetic biology**, combining mechanical (gear) and biological (DNA) imagery. The dark background makes the cool-colored emblems stand out prominently.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, so we have another bug. @ORippler @JohannesGaessler

@jeffbolznv Mind testing on Vulkan?

@AesSedai AesSedai Feb 8, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested one more time with e06088da0fa86aa444409f38dff274904931c507 (the non-contig rope fix) with the ggml_cont re-enabled and the embedding values look right again:

Token 0 (first 16 values): -0.029358 -0.105713 -0.161133 0.100708 -0.075317 -0.264465 -0.062134 0.100281 0.069458 0.216919 -0.102356 -0.134155 -0.098579 -0.098022 -0.130737 0.051514 
Token 0 (last 16 values):  -0.015167 0.048523 0.111225 0.024429 -0.022324 -0.014832 0.134521 -0.149582 -0.187012 0.097748 -0.097107 0.166382 0.219757 0.160767 -0.041840 0.179504 
Stats: mean=0.000452, std=0.078249, min=-4.953125, max=8.621094, sum=6304.381348
Correct text output
<think>The user wants me to describe the contents of the image. Let me analyze what I see:

 1. **Main subject**: A raccoon is the central focus of the image.
 2. **Position**: The raccoon is lying on a tree branch, draped over it in a relaxed or lazy manner.
 3. **Physical appearance**: 
    - Gray fur with the characteristic black "mask" around its eyes
    - White fur on its snout/muzzle area
    - Ringed tail (though partially obscured)
    - Small rounded ears with white fur on the edges
    - Front paws hanging down from the branch
 4. **Setting**: 
    - Outdoor, natural environment
    - Tree branch is thick and brown with some moss or lichen
    - Background is blurred (bokeh effect) showing green foliage and trees
    - Appears to be a forest or wooded area
 5. **Lighting**: Natural daylight, soft lighting
 6. **Mood**: The raccoon looks sleepy, relaxed, or resting. Its posture suggests it's comfortable and possibly napping.

 I should provide a clear, detailed description covering these elements.</think>The image shows a **raccoon** draped lazily over a thick, horizontal tree branch in what appears to be a forest or wooded setting. 

The raccoon has the species' characteristic features: grayish-brown fur, a distinctive black "mask" marking around its eyes, and white fur on its snout and the edges of its ears. It is lying with its belly on the branch, its head resting forward, and its front paws hanging down limply on either side, giving it a very relaxed, sleepy, or exhausted appearance.

The tree branch is sturdy and covered with patches of moss or lichen. The background is softly blurred (bokeh effect), showing various shades of green from foliage and the darker brown of tree trunks, suggesting a lush, natural environment with dappled daylight filtering through the leaves. The overall mood of the image is peaceful and endearing, capturing a moment of rest in the wild.
Reference image image

So it's definitely the ggml_cont there making the difference. I've pushed the updated merge from master to this branch so this should be reproducible by just commenting that line out and testing.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you try this patch:

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index dae17c6fb..422a0e410 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -655,11 +655,6 @@ ggml_tensor * clip_graph::build_rope_2d(
     const int64_t n_head = cur->ne[1];
     const int64_t n_pos  = cur->ne[2];
 
-    // Ensure input is contiguous (needed when using merged QKV with ggml_view)
-    if (!ggml_is_contiguous(cur)) {
-        cur = ggml_cont(ctx0, cur);
-    }
-
     // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
     // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
     // first half of cur will use 1e-0, 1e-2 (even)
@@ -677,8 +672,8 @@ ggml_tensor * clip_graph::build_rope_2d(
     {
         first = ggml_view_3d(ctx0, cur,
             n_dim/2, n_head, n_pos,
-            ggml_row_size(cur->type, n_dim),
-            ggml_row_size(cur->type, n_dim*n_head),
+            cur->nb[1],
+            cur->nb[2],
             0);
         first = ggml_rope_ext(
             ctx0,
@@ -696,8 +691,8 @@ ggml_tensor * clip_graph::build_rope_2d(
     {
         second = ggml_view_3d(ctx0, cur,
             n_dim/2, n_head, n_pos,
-            ggml_row_size(cur->type, n_dim),
-            ggml_row_size(cur->type, n_dim*n_head),
+            cur->nb[1],
+            cur->nb[2],
             n_dim/2 * ggml_element_size(cur));
         second = ggml_rope_ext(
             ctx0,

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll give it a shot in a few hours once I'm back home from the office.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine by me, not many models using this anyway I think?

@CISC Just repeating myself earlier, but this is the first model to use the build_rope_2d + merged QKV combo.

Other models seem to use the combo ggml_rope_ext + merged QKV so they're fine

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine by me, not many models using this anyway I think?

@CISC Just repeating myself earlier, but this is the first model to use the build_rope_2d + merged QKV combo.

Other models seem to use the combo ggml_rope_ext + merged QKV so they're fine

Sure, I meant build_rope_2d in general.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without ggml_cont and with CUDA_VISIBLE_DEVICES= (i.e. CPU only), the embeddings and the text output are still wrong:

CPU only, no ggml_cont
$ CUDA_VISIBLE_DEVICES= MTMD_DEBUG_EMBEDDINGS=1 ./build/bin/llama-mtmd-cli -m /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf --chat-template /mnt/srv/snowdrift/fp16/Kimi-K2.5/chat_template.jinja --jinja --mmproj /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/mmproj-Kimi-K2.5-F16.gguf --threads 54 --flash-attn on --image Kimi-K2.5/raccoon.png -p "Describe the contents of this image" --seed 42 --no-warmup 2>&1 | tee ggml-cpu-only.log
ggml_cuda_init: failed to initialize CUDA: no CUDA-capable device is detected
build: 7986 (16010cba6) with GNU 14.2.1 for Linux x86_64
common_init_result: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on
llama_params_fit_impl: no devices with dedicated memory found
llama_params_fit: successfully fit params to free device memory
llama_params_fit: fitting params to free memory took 19.65 seconds
llama_model_loader: loaded meta data with 49 key-value pairs and 1096 tensors from /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                         general.size_label str              = 384x14B
llama_model_loader: - kv   3:                            general.license str              = other
llama_model_loader: - kv   4:                       general.license.name str              = modified-mit
llama_model_loader: - kv   5:                               general.tags arr[str,1]       = ["image-text-to-text"]
llama_model_loader: - kv   6:                      deepseek2.block_count u32              = 61
llama_model_loader: - kv   7:                   deepseek2.context_length u32              = 262144
llama_model_loader: - kv   8:                 deepseek2.embedding_length u32              = 7168
llama_model_loader: - kv   9:              deepseek2.feed_forward_length u32              = 18432
llama_model_loader: - kv  10:             deepseek2.attention.head_count u32              = 64
llama_model_loader: - kv  11:          deepseek2.attention.head_count_kv u32              = 1
llama_model_loader: - kv  12:                deepseek2.rope.scaling.type str              = yarn
llama_model_loader: - kv  13:              deepseek2.rope.scaling.factor f32              = 64.000000
llama_model_loader: - kv  14: deepseek2.rope.scaling.original_context_length u32              = 4096
llama_model_loader: - kv  15:      deepseek2.rope.scaling.yarn_beta_fast f32              = 32.000000
llama_model_loader: - kv  16:      deepseek2.rope.scaling.yarn_beta_slow f32              = 1.000000
llama_model_loader: - kv  17:                   deepseek2.rope.freq_base f32              = 50000.000000
llama_model_loader: - kv  18: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  19:                deepseek2.expert_used_count u32              = 8
llama_model_loader: - kv  20:               deepseek2.expert_group_count u32              = 1
llama_model_loader: - kv  21:          deepseek2.expert_group_used_count u32              = 1
llama_model_loader: - kv  22:               deepseek2.expert_gating_func u32              = 2
llama_model_loader: - kv  23:        deepseek2.leading_dense_block_count u32              = 1
llama_model_loader: - kv  24:                       deepseek2.vocab_size u32              = 163840
llama_model_loader: - kv  25:            deepseek2.attention.q_lora_rank u32              = 1536
llama_model_loader: - kv  26:           deepseek2.attention.kv_lora_rank u32              = 512
llama_model_loader: - kv  27:             deepseek2.attention.key_length u32              = 576
llama_model_loader: - kv  28:           deepseek2.attention.value_length u32              = 512
llama_model_loader: - kv  29:         deepseek2.attention.key_length_mla u32              = 192
llama_model_loader: - kv  30:       deepseek2.attention.value_length_mla u32              = 128
llama_model_loader: - kv  31:       deepseek2.expert_feed_forward_length u32              = 2048
llama_model_loader: - kv  32:                     deepseek2.expert_count u32              = 384
llama_model_loader: - kv  33:              deepseek2.expert_shared_count u32              = 1
llama_model_loader: - kv  34:             deepseek2.expert_weights_scale f32              = 2.827000
llama_model_loader: - kv  35:              deepseek2.expert_weights_norm bool             = true
llama_model_loader: - kv  36:             deepseek2.rope.dimension_count u32              = 64
llama_model_loader: - kv  37: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.100000
llama_model_loader: - kv  38:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  39:                         tokenizer.ggml.pre str              = kimi-k2
llama_model_loader: - kv  40:                      tokenizer.ggml.tokens arr[str,163840]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  41:                  tokenizer.ggml.token_type arr[i32,163840]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  42:                      tokenizer.ggml.merges arr[str,163328]  = ["Ġ Ġ", "ĠĠ ĠĠ", "Ġ t", "i n",...
llama_model_loader: - kv  43:                tokenizer.ggml.bos_token_id u32              = 163584
llama_model_loader: - kv  44:                tokenizer.ggml.eos_token_id u32              = 163585
llama_model_loader: - kv  45:            tokenizer.ggml.padding_token_id u32              = 163839
llama_model_loader: - kv  46:                    tokenizer.chat_template str              = {%- macro render_content(msg) -%}\n   ...
llama_model_loader: - kv  47:               general.quantization_version u32              = 2
llama_model_loader: - kv  48:                          general.file_type u32              = 7
llama_model_loader: - type  f32:  365 tensors
llama_model_loader: - type q4_0:  180 tensors
llama_model_loader: - type q8_0:  551 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q8_0
print_info: file size   = 543.62 GiB (4.55 BPW) 
load: 0 unused tokens
load: printing all EOG tokens:
load:   - 163585 ('[EOS]')
load:   - 163586 ('<|im_end|>')
load:   - 163593 ('[EOT]')
load:   - 163839 ('[PAD]')
load: special tokens cache size = 256
load: token to piece cache size = 1.0606 MB
print_info: arch                  = deepseek2
print_info: vocab_only            = 0
print_info: no_alloc              = 0
print_info: n_ctx_train           = 262144
print_info: n_embd                = 7168
print_info: n_embd_inp            = 7168
print_info: n_layer               = 61
print_info: n_head                = 64
print_info: n_head_kv             = 1
print_info: n_rot                 = 64
print_info: n_swa                 = 0
print_info: is_swa_any            = 0
print_info: n_embd_head_k         = 576
print_info: n_embd_head_v         = 512
print_info: n_gqa                 = 64
print_info: n_embd_k_gqa          = 576
print_info: n_embd_v_gqa          = 512
print_info: f_norm_eps            = 0.0e+00
print_info: f_norm_rms_eps        = 1.0e-05
print_info: f_clamp_kqv           = 0.0e+00
print_info: f_max_alibi_bias      = 0.0e+00
print_info: f_logit_scale         = 0.0e+00
print_info: f_attn_scale          = 0.0e+00
print_info: n_ff                  = 18432
print_info: n_expert              = 384
print_info: n_expert_used         = 8
print_info: n_expert_groups       = 1
print_info: n_group_used          = 1
print_info: causal attn           = 1
print_info: pooling type          = 0
print_info: rope type             = 0
print_info: rope scaling          = yarn
print_info: freq_base_train       = 50000.0
print_info: freq_scale_train      = 0.015625
print_info: n_ctx_orig_yarn       = 4096
print_info: rope_yarn_log_mul     = 1.0000
print_info: rope_finetuned        = unknown
print_info: model type            = 671B
print_info: model params          = 1.03 T
print_info: general.name          = n/a
print_info: n_layer_dense_lead    = 1
print_info: n_lora_q              = 1536
print_info: n_lora_kv             = 512
print_info: n_embd_head_k_mla     = 192
print_info: n_embd_head_v_mla     = 128
print_info: n_ff_exp              = 2048
print_info: n_expert_shared       = 1
print_info: expert_weights_scale  = 2.8
print_info: expert_weights_norm   = 1
print_info: expert_gating_func    = sigmoid
print_info: vocab type            = BPE
print_info: n_vocab               = 163840
print_info: n_merges              = 163328
print_info: BOS token             = 163584 '[BOS]'
print_info: EOS token             = 163585 '[EOS]'
print_info: EOT token             = 163586 '<|im_end|>'
print_info: PAD token             = 163839 '[PAD]'
print_info: LF token              = 198 'Ċ'
print_info: FIM PAD token         = 163839 '[PAD]'
print_info: EOG token             = 163585 '[EOS]'
print_info: EOG token             = 163586 '<|im_end|>'
print_info: EOG token             = 163593 '[EOT]'
print_info: EOG token             = 163839 '[PAD]'
print_info: max token length      = 512
load_tensors: loading model tensors, this can take a while... (mmap = true, direct_io = false)
load_tensors:   CPU_Mapped model buffer size = 556663.41 MiB
load_tensors:   CPU_REPACK model buffer size = 544320.00 MiB
....................................................................................................
common_init_result: added [EOS] logit bias = -inf
common_init_result: added <|im_end|> logit bias = -inf
common_init_result: added [EOT] logit bias = -inf
common_init_result: added [PAD] logit bias = -inf
llama_context: constructing llama_context
llama_context: setting new yarn_attn_factor = 1.0000 (mscale == 1.0, mscale_all_dim = 1.0)
llama_context: n_seq_max     = 1
llama_context: n_ctx         = 262144
llama_context: n_ctx_seq     = 262144
llama_context: n_batch       = 2048
llama_context: n_ubatch      = 512
llama_context: causal_attn   = 1
llama_context: flash_attn    = enabled
llama_context: kv_unified    = false
llama_context: freq_base     = 50000.0
llama_context: freq_scale    = 0.015625
llama_context:        CPU  output buffer size =     0.62 MiB
llama_kv_cache:        CPU KV buffer size = 17568.00 MiB
llama_kv_cache: size = 17568.00 MiB (262144 cells,  61 layers,  1/1 seqs), K (f16): 17568.00 MiB, V (f16):    0.00 MiB
sched_reserve: reserving ...
sched_reserve:        CPU compute buffer size =   981.01 MiB
sched_reserve: graph nodes  = 4791
sched_reserve: graph splits = 1
sched_reserve: reserve took 8.48 ms, sched copies = 1
mtmd_cli_context: chat template example:
<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>Hello<|im_end|><|im_assistant|>assistant<|im_middle|><think></think>Hi there<|im_end|><|im_user|>user<|im_middle|>How are you?<|im_end|><|im_assistant|>assistant<|im_middle|>
clip_model_loader: model name:   Kimi K2.5
clip_model_loader: description:  
clip_model_loader: GGUF version: 3
clip_model_loader: alignment:    32
clip_model_loader: n_tensors:    335
clip_model_loader: n_kv:         28

clip_model_loader: has vision encoder
clip_ctx: CLIP using CPU backend
load_hparams: projector:          kimik25
load_hparams: n_embd:             1152
load_hparams: n_head:             16
load_hparams: n_ff:               4304
load_hparams: n_layer:            27
load_hparams: ffn_op:             gelu
load_hparams: projection_dim:     7168

--- vision hparams ---
load_hparams: image_size:         896
load_hparams: patch_size:         14
load_hparams: has_llava_proj:     0
load_hparams: minicpmv_version:   0
load_hparams: n_merge:            2
load_hparams: n_wa_pattern: 0
load_hparams: image_min_pixels:   1568
load_hparams: image_max_pixels:   3211264

load_hparams: model size:         908.42 MiB
load_hparams: metadata size:      0.12 MiB
main: loading model: /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf
WARN: This is an experimental CLI for testing multimodal capability.
      For normal use cases, please use the standard llama-cli
encoding image slice...
alloc_compute_meta:        CPU compute buffer size =   264.77 MiB
alloc_compute_meta: graph splits = 1, nodes = 1034
warmup: flash attention is enabled

=== MTMD_DEBUG_EMBEDDINGS ===
Shape: [7168, 1944]
Token 0 (first 16 values): -0.031418 0.018485 0.076531 0.027357 0.050587 -0.002738 -0.014581 0.131898 -0.018651 -0.064568 0.038107 -0.068829 -0.001136 -0.065361 0.036655 -0.092244 
Token 0 (last 16 values):  -0.046038 0.025009 -0.036792 0.040030 0.005790 -0.034914 -0.026276 -0.044675 0.006448 0.010157 -0.003982 -0.000043 0.091428 -0.022575 -0.079341 0.009261 
Stats: mean=0.000517, std=0.063481, min=-1.903979, max=1.135730, sum=7206.027344
=== END MTMD_DEBUG_EMBEDDINGS ===

image slice encoded in 15268 ms
decoding image batch 1/1, n_tokens_batch = 1944
image decoded (batch 1/1) in 64037 ms

<think>The user wants me to describe the contents of the image. Looking at the image, I see a repeating pattern of what appears to be a stylized icon or logo. 

The pattern consists of:
1. A purple/lavender colored circular gear-like shape with multiple teeth or cogs around the perimeter
2. Inside the circle, there's a white silhouette of what looks like a classical building or structure with columns (perhaps resembling a temple, courthouse, or government building)
3. The background appears to be a dark green or olive color
4. There's a wavy, translucent overlay in shades of orange, yellow, and red that runs horizontally across the image, creating a distorted or glitch-like effect over the repeating pattern

The pattern is tiled repeatedly across the entire image, creating a wallpaper-like effect. The wavy distortion overlay gives it a somewhat psychedelic or digital art aesthetic.

Let me provide a detailed description of these elements.</think>The image displays a **repeating tiled pattern** consisting of stylized icons set against a dark olive-green background. Each tile features:

**Main Icon Elements:**
- A **purple/lavender gear or cogwheel** with rounded teeth forming a circular border
- Inside each gear is a **white silhouette of a classical building** resembling a Greek or Roman temple with columns and a triangular pediment

**Overlay Effect:**
- A **horizontal, wavy, translucent distortion band** runs across the entire image in shades of orange, yellow, and amber
- This creates a "glitch" or heat-wave effect that slightly distorts the underlying pattern, making the icons appear to shimmer or ripple like a mirage

**Overall Composition:**
The pattern is arranged in a tight grid, with the gear icons touching or nearly touching each other, creating a wallpaper-like texture. The color palette is limited primarily to dark green, purple, and white, with the warm-toned wavy overlay adding contrast and visual movement to the otherwise static geometric repetition. The aesthetic suggests a blend of industrial/technical imagery (gears) with classical architecture, filtered through a digital or psychedelic visual effect.


llama_perf_context_print:        load time =  731834.48 ms
llama_perf_context_print: prompt eval time =   93832.96 ms /  1957 tokens (   47.95 ms per token,    20.86 tokens per second)
llama_perf_context_print:        eval time =   55847.56 ms /   428 runs   (  130.48 ms per token,     7.66 tokens per second)
llama_perf_context_print:       total time =  787807.93 ms /  2385 tokens
llama_perf_context_print:    graphs reused =        425

The patch for `cur->nb[1]` and `cur->nb[2]` fixed it: no `ggml_cont` is needed, and the output is correct:

gpu w/ patch, no ggml_cont
$ MTMD_DEBUG_EMBEDDINGS=1 ./build/bin/llama-mtmd-cli -m /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf --chat-template /mnt/srv/snowdrift/fp16/Kimi-K2.5/chat_template.jinja --jinja --mmproj /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/mmproj-Kimi-K2.5-F16.gguf --threads 54 --flash-attn on --image Kimi-K2.5/raccoon.png -p "Describe the contents of this image" --seed 42 --no-warmup 2>&1 | tee ggml-rope-cur.log
ggml_cuda_init: found 2 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
build: 7986 (16010cba6) with GNU 14.2.1 for Linux x86_64
common_init_result: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on
llama_params_fit_impl: projected memory use with initial parameters [MiB]:
llama_params_fit_impl:   - CUDA0 (NVIDIA GeForce RTX 3090):  24135 total, 288978 used, -265106 free vs. target of   1024
llama_params_fit_impl:   - CUDA1 (NVIDIA GeForce RTX 3090):  24135 total, 288217 used, -264345 free vs. target of   1024
llama_params_fit_impl: projected to use 577196 MiB of device memory vs. 47743 MiB of free device memory
llama_params_fit_impl: cannot meet free memory targets on all devices, need to use 531500 MiB less in total
llama_params_fit_impl: context size reduced from 262144 to 4096 -> need 20614 MiB less memory in total
llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 27843 MiB
llama_params_fit_impl: filling dense-only layers back-to-front:
llama_params_fit_impl:   - CUDA1 (NVIDIA GeForce RTX 3090): 62 layers,  11789 MiB used,  12081 MiB free
llama_params_fit_impl:   - CUDA0 (NVIDIA GeForce RTX 3090):  0 layers,   6062 MiB used,  17809 MiB free
llama_params_fit_impl: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:
llama_params_fit_impl:   - CUDA0 (NVIDIA GeForce RTX 3090):  2 layers ( 0 overflowing),  15805 MiB used,   8066 MiB free
llama_params_fit_impl:   - CUDA1 (NVIDIA GeForce RTX 3090): 60 layers (59 overflowing),  20136 MiB used,   3735 MiB free
llama_params_fit: successfully fit params to free device memory
llama_params_fit: fitting params to free memory took 11.35 seconds
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:06:10.0) - 23871 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:06:11.0) - 23871 MiB free
llama_model_loader: loaded meta data with 49 key-value pairs and 1096 tensors from /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                         general.size_label str              = 384x14B
llama_model_loader: - kv   3:                            general.license str              = other
llama_model_loader: - kv   4:                       general.license.name str              = modified-mit
llama_model_loader: - kv   5:                               general.tags arr[str,1]       = ["image-text-to-text"]
llama_model_loader: - kv   6:                      deepseek2.block_count u32              = 61
llama_model_loader: - kv   7:                   deepseek2.context_length u32              = 262144
llama_model_loader: - kv   8:                 deepseek2.embedding_length u32              = 7168
llama_model_loader: - kv   9:              deepseek2.feed_forward_length u32              = 18432
llama_model_loader: - kv  10:             deepseek2.attention.head_count u32              = 64
llama_model_loader: - kv  11:          deepseek2.attention.head_count_kv u32              = 1
llama_model_loader: - kv  12:                deepseek2.rope.scaling.type str              = yarn
llama_model_loader: - kv  13:              deepseek2.rope.scaling.factor f32              = 64.000000
llama_model_loader: - kv  14: deepseek2.rope.scaling.original_context_length u32              = 4096
llama_model_loader: - kv  15:      deepseek2.rope.scaling.yarn_beta_fast f32              = 32.000000
llama_model_loader: - kv  16:      deepseek2.rope.scaling.yarn_beta_slow f32              = 1.000000
llama_model_loader: - kv  17:                   deepseek2.rope.freq_base f32              = 50000.000000
llama_model_loader: - kv  18: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  19:                deepseek2.expert_used_count u32              = 8
llama_model_loader: - kv  20:               deepseek2.expert_group_count u32              = 1
llama_model_loader: - kv  21:          deepseek2.expert_group_used_count u32              = 1
llama_model_loader: - kv  22:               deepseek2.expert_gating_func u32              = 2
llama_model_loader: - kv  23:        deepseek2.leading_dense_block_count u32              = 1
llama_model_loader: - kv  24:                       deepseek2.vocab_size u32              = 163840
llama_model_loader: - kv  25:            deepseek2.attention.q_lora_rank u32              = 1536
llama_model_loader: - kv  26:           deepseek2.attention.kv_lora_rank u32              = 512
llama_model_loader: - kv  27:             deepseek2.attention.key_length u32              = 576
llama_model_loader: - kv  28:           deepseek2.attention.value_length u32              = 512
llama_model_loader: - kv  29:         deepseek2.attention.key_length_mla u32              = 192
llama_model_loader: - kv  30:       deepseek2.attention.value_length_mla u32              = 128
llama_model_loader: - kv  31:       deepseek2.expert_feed_forward_length u32              = 2048
llama_model_loader: - kv  32:                     deepseek2.expert_count u32              = 384
llama_model_loader: - kv  33:              deepseek2.expert_shared_count u32              = 1
llama_model_loader: - kv  34:             deepseek2.expert_weights_scale f32              = 2.827000
llama_model_loader: - kv  35:              deepseek2.expert_weights_norm bool             = true
llama_model_loader: - kv  36:             deepseek2.rope.dimension_count u32              = 64
llama_model_loader: - kv  37: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.100000
llama_model_loader: - kv  38:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  39:                         tokenizer.ggml.pre str              = kimi-k2
llama_model_loader: - kv  40:                      tokenizer.ggml.tokens arr[str,163840]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  41:                  tokenizer.ggml.token_type arr[i32,163840]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  42:                      tokenizer.ggml.merges arr[str,163328]  = ["Ġ Ġ", "ĠĠ ĠĠ", "Ġ t", "i n",...
llama_model_loader: - kv  43:                tokenizer.ggml.bos_token_id u32              = 163584
llama_model_loader: - kv  44:                tokenizer.ggml.eos_token_id u32              = 163585
llama_model_loader: - kv  45:            tokenizer.ggml.padding_token_id u32              = 163839
llama_model_loader: - kv  46:                    tokenizer.chat_template str              = {%- macro render_content(msg) -%}\n   ...
llama_model_loader: - kv  47:               general.quantization_version u32              = 2
llama_model_loader: - kv  48:                          general.file_type u32              = 7
llama_model_loader: - type  f32:  365 tensors
llama_model_loader: - type q4_0:  180 tensors
llama_model_loader: - type q8_0:  551 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q8_0
print_info: file size   = 543.62 GiB (4.55 BPW) 
load: 0 unused tokens
load: printing all EOG tokens:
load:   - 163585 ('[EOS]')
load:   - 163586 ('<|im_end|>')
load:   - 163593 ('[EOT]')
load:   - 163839 ('[PAD]')
load: special tokens cache size = 256
load: token to piece cache size = 1.0606 MB
print_info: arch                  = deepseek2
print_info: vocab_only            = 0
print_info: no_alloc              = 0
print_info: n_ctx_train           = 262144
print_info: n_embd                = 7168
print_info: n_embd_inp            = 7168
print_info: n_layer               = 61
print_info: n_head                = 64
print_info: n_head_kv             = 1
print_info: n_rot                 = 64
print_info: n_swa                 = 0
print_info: is_swa_any            = 0
print_info: n_embd_head_k         = 576
print_info: n_embd_head_v         = 512
print_info: n_gqa                 = 64
print_info: n_embd_k_gqa          = 576
print_info: n_embd_v_gqa          = 512
print_info: f_norm_eps            = 0.0e+00
print_info: f_norm_rms_eps        = 1.0e-05
print_info: f_clamp_kqv           = 0.0e+00
print_info: f_max_alibi_bias      = 0.0e+00
print_info: f_logit_scale         = 0.0e+00
print_info: f_attn_scale          = 0.0e+00
print_info: n_ff                  = 18432
print_info: n_expert              = 384
print_info: n_expert_used         = 8
print_info: n_expert_groups       = 1
print_info: n_group_used          = 1
print_info: causal attn           = 1
print_info: pooling type          = 0
print_info: rope type             = 0
print_info: rope scaling          = yarn
print_info: freq_base_train       = 50000.0
print_info: freq_scale_train      = 0.015625
print_info: n_ctx_orig_yarn       = 4096
print_info: rope_yarn_log_mul     = 1.0000
print_info: rope_finetuned        = unknown
print_info: model type            = 671B
print_info: model params          = 1.03 T
print_info: general.name          = n/a
print_info: n_layer_dense_lead    = 1
print_info: n_lora_q              = 1536
print_info: n_lora_kv             = 512
print_info: n_embd_head_k_mla     = 192
print_info: n_embd_head_v_mla     = 128
print_info: n_ff_exp              = 2048
print_info: n_expert_shared       = 1
print_info: expert_weights_scale  = 2.8
print_info: expert_weights_norm   = 1
print_info: expert_gating_func    = sigmoid
print_info: vocab type            = BPE
print_info: n_vocab               = 163840
print_info: n_merges              = 163328
print_info: BOS token             = 163584 '[BOS]'
print_info: EOS token             = 163585 '[EOS]'
print_info: EOT token             = 163586 '<|im_end|>'
print_info: PAD token             = 163839 '[PAD]'
print_info: LF token              = 198 'Ċ'
print_info: FIM PAD token         = 163839 '[PAD]'
print_info: EOG token             = 163585 '[EOS]'
print_info: EOG token             = 163586 '<|im_end|>'
print_info: EOG token             = 163593 '[EOT]'
print_info: EOG token             = 163839 '[PAD]'
print_info: max token length      = 512
load_tensors: loading model tensors, this can take a while... (mmap = true, direct_io = false)
load_tensors: offloading output layer to GPU
load_tensors: offloading 60 repeating layers to GPU
load_tensors: offloaded 62/62 layers to GPU
load_tensors:   CPU_Mapped model buffer size = 555458.51 MiB
load_tensors:        CUDA0 model buffer size =  9733.81 MiB
load_tensors:        CUDA1 model buffer size = 19508.51 MiB
....................................................................................................
common_init_result: added [EOS] logit bias = -inf
common_init_result: added <|im_end|> logit bias = -inf
common_init_result: added [EOT] logit bias = -inf
common_init_result: added [PAD] logit bias = -inf
llama_context: constructing llama_context
llama_context: setting new yarn_attn_factor = 1.0000 (mscale == 1.0, mscale_all_dim = 1.0)
llama_context: n_seq_max     = 1
llama_context: n_ctx         = 4096
llama_context: n_ctx_seq     = 4096
llama_context: n_batch       = 2048
llama_context: n_ubatch      = 512
llama_context: causal_attn   = 1
llama_context: flash_attn    = enabled
llama_context: kv_unified    = false
llama_context: freq_base     = 50000.0
llama_context: freq_scale    = 0.015625
llama_context: n_ctx_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
llama_context:  CUDA_Host  output buffer size =     0.62 MiB
llama_kv_cache:      CUDA0 KV buffer size =     9.00 MiB
llama_kv_cache:      CUDA1 KV buffer size =   265.50 MiB
llama_kv_cache: size =  274.50 MiB (  4096 cells,  61 layers,  1/1 seqs), K (f16):  274.50 MiB, V (f16):    0.00 MiB
sched_reserve: reserving ...
sched_reserve:      CUDA0 compute buffer size =  6062.75 MiB
sched_reserve:      CUDA1 compute buffer size =   362.00 MiB
sched_reserve:  CUDA_Host compute buffer size =    36.01 MiB
sched_reserve: graph nodes  = 4791
sched_reserve: graph splits = 240 (with bs=512), 121 (with bs=1)
sched_reserve: reserve took 12.79 ms, sched copies = 1
mtmd_cli_context: chat template example:
<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>Hello<|im_end|><|im_assistant|>assistant<|im_middle|><think></think>Hi there<|im_end|><|im_user|>user<|im_middle|>How are you?<|im_end|><|im_assistant|>assistant<|im_middle|>
clip_model_loader: model name:   Kimi K2.5
clip_model_loader: description:  
clip_model_loader: GGUF version: 3
clip_model_loader: alignment:    32
clip_model_loader: n_tensors:    335
clip_model_loader: n_kv:         28

clip_model_loader: has vision encoder
clip_ctx: CLIP using CUDA0 backend
load_hparams: projector:          kimik25
load_hparams: n_embd:             1152
load_hparams: n_head:             16
load_hparams: n_ff:               4304
load_hparams: n_layer:            27
load_hparams: ffn_op:             gelu
load_hparams: projection_dim:     7168

--- vision hparams ---
load_hparams: image_size:         896
load_hparams: patch_size:         14
load_hparams: has_llava_proj:     0
load_hparams: minicpmv_version:   0
load_hparams: n_merge:            2
load_hparams: n_wa_pattern: 0
load_hparams: image_min_pixels:   1568
load_hparams: image_max_pixels:   3211264

load_hparams: model size:         908.42 MiB
load_hparams: metadata size:      0.12 MiB
main: loading model: /mnt/srv/snowdrift/gguf/Kimi-K2.5-GGUF/aes_sedai/Kimi-K2.5-Q4_X.gguf
WARN: This is an experimental CLI for testing multimodal capability.
      For normal use cases, please use the standard llama-cli
encoding image slice...
alloc_compute_meta:      CUDA0 compute buffer size =   264.77 MiB
alloc_compute_meta:        CPU compute buffer size =    17.50 MiB
alloc_compute_meta: graph splits = 1, nodes = 1034
warmup: flash attention is enabled

=== MTMD_DEBUG_EMBEDDINGS ===
Shape: [7168, 1944]
Token 0 (first 16 values): -0.029358 -0.105713 -0.161133 0.100708 -0.075317 -0.264465 -0.062134 0.100281 0.069458 0.216919 -0.102356 -0.134155 -0.098579 -0.098022 -0.130737 0.051514 
Token 0 (last 16 values):  -0.015167 0.048523 0.111225 0.024429 -0.022324 -0.014832 0.134521 -0.149582 -0.187012 0.097748 -0.097107 0.166382 0.219757 0.160767 -0.041840 0.179504 
Stats: mean=0.000452, std=0.078249, min=-4.953125, max=8.621094, sum=6304.381348
=== END MTMD_DEBUG_EMBEDDINGS ===

image slice encoded in 1017 ms
decoding image batch 1/1, n_tokens_batch = 1944
image decoded (batch 1/1) in 83232 ms

<think>The user wants me to describe the contents of the image. Let me analyze what I see:

 1. **Main subject**: A raccoon is the central focus of the image.
 2. **Position**: The raccoon is lying on a tree branch, draped over it in a relaxed or lazy manner.
 3. **Physical appearance**: 
    - Gray fur with the characteristic black "mask" around its eyes
    - White markings on its face, particularly around the nose and mouth area
    - Ringed tail (though partially visible)
    - Small, rounded ears with white fur on the edges
    - Front paws hanging down from the branch
 4. **Setting**: 
    - Outdoor, natural environment
    - Tree branch is thick and brown with some moss or lichen
    - Background is blurred (bokeh effect) showing green foliage and trees
    - Appears to be a forest or wooded area
 5. **Lighting**: Natural daylight, soft lighting
 6. **Mood**: The raccoon looks relaxed, possibly sleepy or resting. Its posture suggests it's comfortable and at ease.

 The image captures a candid moment of wildlife in a natural habitat. The raccoon's body language suggests it's taking a break or observing its surroundings from a safe perch.

 I should provide a clear, detailed description covering the subject, its appearance, its position/action, and the surrounding environment.</think>The image shows a **raccoon** draped lazily over a thick, horizontal tree branch in what appears to be a forest or wooded environment.

**The raccoon:**
- Has the species' characteristic grayish-brown fur with darker and lighter variations
- Displays the distinctive black "mask" markings around its eyes, contrasted with white fur on its snout and cheeks
- Is lying with its belly on the branch, front paws hanging down loosely, and head resting forward
- Has small, rounded ears with white trim and a slightly pointed snout
- Appears relaxed or sleepy, with a calm, somewhat melancholic expression

**The setting:**
- The tree branch is sturdy and brown with patches of moss or lichen
- Background features soft, out-of-focus greenery (bokeh effect) including leaves and tree trunks
- Natural daylight illuminates the scene, highlighting the texture of the raccoon's fur
- The overall atmosphere is peaceful and natural, suggesting a wildlife habitat or nature reserve

The composition captures an intimate, candid moment of the animal at rest in its natural arboreal environment.


llama_perf_context_print:        load time =  317102.77 ms
llama_perf_context_print: prompt eval time =   84821.87 ms /  1957 tokens (   43.34 ms per token,    23.07 tokens per second)
llama_perf_context_print:        eval time =   43858.18 ms /   523 runs   (   83.86 ms per token,    11.92 tokens per second)
llama_perf_context_print:       total time =  361103.84 ms /  2480 tokens
llama_perf_context_print:    graphs reused =        520

@ggerganov I can update this PR with that fix applied.

// for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
// we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
// first half of cur will use 1e-0, 1e-2 (even)
Expand Down Expand Up @@ -825,6 +830,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_kimivl>(ctx, img);
} break;
case PROJECTOR_TYPE_KIMIK25:
{
builder = std::make_unique<clip_graph_kimik25>(ctx, img);
} break;
case PROJECTOR_TYPE_COGVLM:
{
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
Expand Down Expand Up @@ -1139,6 +1148,22 @@ struct clip_model_loader {
hparams.set_limit_image_tokens(8, 1024);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_KIMIK25:
{
// Kimi-K2.5 vision hparams: fixed RoPE base, plus merge factor and pixel limits read from the GGUF metadata.
hparams.rope_theta = 10000.0f;
// NOTE(review): the trailing `false` presumably marks the key as optional - confirm against get_u32.
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);

// Start at 0 so that "still 0 after the reads" can be treated as "limit not provided" below.
// NOTE(review): assumes get_u32 leaves the output untouched when the key is absent - confirm.
int min_pixels = 0, max_pixels = 0;
get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
if (min_pixels > 0 && max_pixels > 0) {
// Both limits are positive: use them as the image pixel bounds.
hparams.image_min_pixels = min_pixels;
hparams.image_max_pixels = max_pixels;
// Side length (truncated to int) of a square image holding max_pixels pixels.
hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
} else {
// At least one limit is missing or non-positive: fall back to token-count based limits.
hparams.set_limit_image_tokens(2, 4096);
}
} break;
case PROJECTOR_TYPE_GEMMA3:
{
// default value (used by all model sizes in gemma 3 family)
Expand Down Expand Up @@ -1668,6 +1693,7 @@ struct clip_model_loader {
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_KIMIK25:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
Expand Down Expand Up @@ -3039,6 +3065,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(res));
} break;

case PROJECTOR_TYPE_KIMIK25:
{
// Kimi-K2.5 preprocessing: resize the image to a computed target size, normalize it,
// and append it to res_imgs as a single entry.
// Both pixel limits must be set before this point; abort otherwise.
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
// NOTE(review): presumably keeps the aspect ratio while aligning each side to
// patch_size * n_merge and staying within [image_min_pixels, image_max_pixels] -
// confirm against img_tool::calc_size_preserved_ratio.
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
original_size,
params.patch_size * params.n_merge,
params.image_min_pixels,
params.image_max_pixels);
// Black (0, 0, 0) is passed to resize() as the pad color.
const std::array<uint8_t, 3> pad_color = {0, 0, 0};

clip_image_u8 resized_img;
// NOTE(review): the `true` flag presumably enables padding with pad_color - confirm against img_tool::resize.
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
// Convert the resized u8 image to f32 using the model's image_mean / image_std.
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
} break;

case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
Expand Down Expand Up @@ -3247,6 +3290,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_KIMIK25:
{
// dynamic size
int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
Expand Down Expand Up @@ -3588,6 +3632,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_KIMIK25:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// set the 2D positions
Expand Down Expand Up @@ -3724,6 +3769,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
}

// Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
//
// Prints the embedding shape, the first and last 16 values of token 0, and
// summary statistics (mean, std, min, max, sum) over all values, so that the
// output of different backends or patches can be compared.
if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
    const int64_t n_embd   = embeddings->ne[0];
    const int64_t n_tokens = embeddings->ne[1];
    const size_t  n_values = (size_t) n_embd * (size_t) n_tokens;
    const size_t  n_bytes  = ggml_nbytes(embeddings);

    LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
    LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);

    if (n_values == 0 || n_bytes != n_values * sizeof(float)) {
        // The dump reads the tensor as a dense [n_embd, n_tokens] float array.
        // If the byte size disagrees (other dtype, extra dims) or there is no
        // data, copying n_bytes into the buffer would overflow it or the
        // statistics would index an empty vector, so skip instead.
        LOG_INF("Skipping dump: %zu bytes does not match a non-empty 2D f32 tensor\n", n_bytes);
    } else {
        std::vector<float> emb_data(n_values);
        ggml_backend_tensor_get(embeddings, emb_data.data(), 0, n_bytes);

        // Print first few values of first token
        LOG_INF("Token 0 (first 16 values): ");
        for (int64_t i = 0; i < std::min((int64_t)16, n_embd); i++) {
            LOG_INF("%.6f ", emb_data[i]);
        }
        LOG_INF("\n");

        // Print last few values of first token
        if (n_embd > 16) {
            LOG_INF("Token 0 (last 16 values): ");
            for (int64_t i = n_embd - 16; i < n_embd; i++) {
                LOG_INF("%.6f ", emb_data[i]);
            }
            LOG_INF("\n");
        }

        // Compute and print statistics.
        // Accumulate in double: a float accumulator loses precision when
        // summing millions of small values, which makes the printed numbers
        // unreliable for cross-backend comparison.
        double sum = 0.0, sum_sq = 0.0;
        float min_val = emb_data[0], max_val = emb_data[0];
        for (const float v : emb_data) {
            sum    += v;
            sum_sq += (double) v * (double) v;
            min_val = std::min(min_val, v);
            max_val = std::max(max_val, v);
        }
        const double mean = sum / (double) n_values;
        // E[x^2] - E[x]^2 can round slightly below zero; clamp so sqrt never yields NaN.
        const double variance = std::max(0.0, (sum_sq / (double) n_values) - (mean * mean));
        LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
                mean, std::sqrt(variance), min_val, max_val, sum);
    }
    LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
}

return true;
}

Expand Down Expand Up @@ -3770,6 +3856,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_KIMIK25:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
Expand Down
Loading