Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"Ernie4_5_ForCausalLM": "ernie",
"Ernie4_5_MoeForCausalLM": "ernie",
"EuroBertModel": "bert",
"Exaone4_5_ForConditionalGeneration": "exaone",
"Exaone4ForCausalLM": "exaone",
"ExaoneForCausalLM": "exaone",
"ExaoneMoEForCausalLM": "exaone",
Expand Down Expand Up @@ -238,6 +239,7 @@
"CogVLMForCausalLM": "cogvlm",
"DeepseekOCRForCausalLM": "deepseek",
"DotsOCRForCausalLM": "dotsocr",
"Exaone4_5_ForConditionalGeneration": "exaone",
"Gemma3ForConditionalGeneration": "gemma",
"Gemma3nForConditionalGeneration": "gemma",
"Gemma4ForConditionalGeneration": "gemma",
Expand Down
2 changes: 1 addition & 1 deletion conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2555,7 +2555,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
# For text conversion we route to a dedicated text-only class.
# TODO: refactor this later to avoid adding exception here
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Exaone4_5_ForConditionalGeneration"):
return arch

# if "architectures" is found in the sub-config, use that instead
Expand Down
99 changes: 97 additions & 2 deletions conversion/exaone.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import math

from pathlib import Path
from typing import Iterable, TYPE_CHECKING
from typing import Callable, Iterable, TYPE_CHECKING

import torch

if TYPE_CHECKING:
from torch import Tensor

from .base import ModelBase, TextModel, gguf
from .base import MmprojModel, ModelBase, TextModel, gguf
from .qwenvl import Qwen2VLVisionModel


@ModelBase.register("ExaoneForCausalLM")
Expand Down Expand Up @@ -208,3 +209,97 @@ def prepare_tensors(self):
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("Exaone4_5_ForConditionalGeneration")
class Exaone4_5_TextModel(Exaone4Model):
"""Text tower of EXAONE 4.5; Tensors match EXAONE4"""

model_arch = gguf.MODEL_ARCH.EXAONE4

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
if n_nextn > 0:
self.block_count = self.hparams["num_hidden_layers"] + n_nextn
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

def set_gguf_parameters(self):
super().set_gguf_parameters()
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
if n_nextn > 0:
self.gguf_writer.add_nextn_predict_layers(n_nextn)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("mtp."):
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
if n_nextn <= 0:
return
nh = self.hparams["num_hidden_layers"]
if ".layers." in name:
share = self.hparams.get("mtp_share_layers", False)
mtp_bid = bid if bid is not None else 0
if share:
for k in range(n_nextn):
nn = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{nh + k}")
yield from super().modify_tensors(data_torch, nn, nh + k)
return
name = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{mtp_bid + nh}")
else:
remapper = {
"mtp.fc": gguf.MODEL_TENSOR.NEXTN_EH_PROJ,
"mtp.pre_fc_norm_embedding": gguf.MODEL_TENSOR.NEXTN_ENORM,
"mtp.pre_fc_norm_hidden": gguf.MODEL_TENSOR.NEXTN_HNORM,
"mtp.norm": gguf.MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
}
_n = Path(name)
key = _n.stem
if key not in remapper:
return
for bid_mtp in range(nh, self.block_count):
mapped_name = self.format_tensor_name(remapper[key], bid_mtp, suffix=_n.suffix)
yield from ModelBase.modify_tensors(self, data_torch, mapped_name, bid_mtp)
return

yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Exaone4_5_ForConditionalGeneration")
class Exaone4_5VisionModel(Qwen2VLVisionModel):
"""Vision tower for EXAONE 4.5; Qwen2-VL-style ViT (GQA) + patch merger"""

@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
name = name.replace("model.visual.", "visual.", 1)
return super().filter_tensors((name, gen))

def set_gguf_parameters(self):
MmprojModel.set_gguf_parameters(self)
assert self.hparams_vision is not None
hparams = self.hparams_vision
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.EXAONE4_5)
self.gguf_writer.add_vision_use_silu(True)
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
num_kv_head = self.find_vparam(["num_key_value_heads"], optional=True)
if num_kv_head is not None:
self.gguf_writer.add_vision_head_count_kv(num_kv_head)
eps = hparams.get("rms_norm_eps", self.global_config.get("rms_norm_eps", 1e-6))
self.gguf_writer.add_vision_attention_layernorm_eps(eps)
if (window_size := hparams.get("window_size")) is not None:
self.gguf_writer.add_vision_window_size(window_size)
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
if fullatt_block_indexes:
n_wa_pattern = fullatt_block_indexes[0] + 1
for i in range(1, len(fullatt_block_indexes)):
if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
raise ValueError(f"Invalid EXAONE4.5 fullatt_block_indexes: {fullatt_block_indexes}")
self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if ".qkv." in name:
yield from ModelBase.modify_tensors(self, data_torch, name, bid)
return

yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
8 changes: 8 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3260,6 +3260,13 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_POST_NORM,
# NextN/MTP tensors - preserved but unused
MODEL_TENSOR.NEXTN_EH_PROJ,
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
MODEL_TENSOR.NEXTN_ENORM,
MODEL_TENSOR.NEXTN_HNORM,
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.EXAONE_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
Expand Down Expand Up @@ -4264,6 +4271,7 @@ class VisionProjectorType:
LLAMA4 = "llama4"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
EXAONE4_5 = "exaone4_5"
QWEN3VL = "qwen3vl_merger"
STEP3VL = "step3vl"
ULTRAVOX = "ultravox"
Expand Down
47 changes: 35 additions & 12 deletions src/models/exaone4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {

ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
Comment thread
nuxlear marked this conversation as resolved.
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;

switch (hparams.n_layer) {
case 30: type = LLM_TYPE_1_2B; break;
Expand All @@ -38,21 +41,37 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
}

for (int i = 0; i < n_layer; ++i) {
const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
int flags = 0;
if (is_nextn) {
// NextN/MTP layers are preserved in GGUF but are not executed yet.
flags |= TENSOR_SKIP;
}

auto & layer = layers[i];

create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, flags);

if (!is_nextn) {
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
}

layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);

layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);

layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
if (is_nextn) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
}
}
}

Expand Down Expand Up @@ -90,7 +109,11 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
}
ggml_tensor * inp_out_ids = build_inp_out_ids();

for (int il = 0; il < n_layer; ++il) {
// MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe).
const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers);
GGML_ASSERT(n_layer_main > 0);

for (int il = 0; il < n_layer_main; ++il) {
ggml_tensor * inpSA = inpL;

// use RoPE for SWA layers or non-SWA models
Expand Down Expand Up @@ -126,7 +149,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
if (il == n_layer_main - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_library(mtmd
models/cogvlm.cpp
models/conformer.cpp
models/dotsocr.cpp
models/exaone4_5.cpp
models/gemma4a.cpp
models/gemma4v.cpp
models/glm4v.cpp
Expand Down
2 changes: 2 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ enum projector_type {
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANVL,
PROJECTOR_TYPE_EXAONE4_5,
PROJECTOR_TYPE_MINICPMV4_6,
PROJECTOR_TYPE_GRANITE_SPEECH,
PROJECTOR_TYPE_MIMOVL,
Expand Down Expand Up @@ -392,6 +393,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_YASA2, "yasa2"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_EXAONE4_5, "exaone4_5"},
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
{ PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
{ PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
Expand Down
29 changes: 28 additions & 1 deletion tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
} break;
case PROJECTOR_TYPE_EXAONE4_5:
{
builder = std::make_unique<clip_graph_exaone4_5>(ctx, img);
} break;
case PROJECTOR_TYPE_MIMOVL:
{
builder = std::make_unique<clip_graph_mimovl>(ctx, img);
Expand Down Expand Up @@ -1543,6 +1547,19 @@ struct clip_model_loader {
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
case PROJECTOR_TYPE_EXAONE4_5:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, false);
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(46 * 46);
if (hparams.rope_theta <= 0.0f) {
hparams.rope_theta = 10000.0f;
}
get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
} break;
case PROJECTOR_TYPE_GEMMA4A:
{
// Gemma4 feature_extraction_gemma4.py:
Expand Down Expand Up @@ -1783,6 +1800,7 @@ struct clip_model_loader {
|| model.proj_type == PROJECTOR_TYPE_LDPV2
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
|| model.proj_type == PROJECTOR_TYPE_EXAONE4_5
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
Expand Down Expand Up @@ -1932,6 +1950,7 @@ struct clip_model_loader {
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_EXAONE4_5:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
Expand Down Expand Up @@ -3060,6 +3079,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_EXAONE4_5:
case PROJECTOR_TYPE_MIMOVL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
Expand All @@ -3081,6 +3101,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_EXAONE4_5:
case PROJECTOR_TYPE_MIMOVL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
Expand Down Expand Up @@ -3160,6 +3181,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_EXAONE4_5:
case PROJECTOR_TYPE_MIMOVL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
Expand Down Expand Up @@ -3647,11 +3669,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_EXAONE4_5:
case PROJECTOR_TYPE_YOUTUVL:
{
// pw * ph = number of tokens output by ViT after apply patch merger
// ipw * ipw = number of vision token been processed inside ViT
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const bool use_window_attn =
(ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL || ctx->model.proj_type == PROJECTOR_TYPE_EXAONE4_5)
? hparams.n_wa_pattern > 0
: !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
Expand Down Expand Up @@ -4205,6 +4231,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_EXAONE4_5:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0];
Expand Down
Loading
Loading