Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3814,6 +3814,171 @@ def prepare_tensors(self):
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")

@ModelBase.register("Ernie4_5_VLMoeForConditionalGeneration")
class Ernie4_5VLMoeModel(Ernie4_5MoeModel):
    """Converter for the text side of ERNIE-4.5-VL MoE checkpoints.

    Vision-tower and resampler tensors are dropped. Router (gate) weights,
    score-correction biases and per-expert FFN weights are renamed and, for
    experts, stacked into one tensor per projection. Several hparams may be
    given as lists; index 0 is treated as the text entry and index 1 (where
    used) as the vision entry.
    """

    model_arch = gguf.MODEL_ARCH.ERNIE4_5_VL_MOE
    # Per-block staging area: expert tensors are parked here, keyed by their
    # checkpoint name, until every expert of the block has been seen.
    _experts: list[dict[str, Tensor]] | None = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._experts = [{} for _ in range(self.block_count)]

    @staticmethod
    def _first_if_list(value):
        """Return ``value[0]`` when *value* is a list, otherwise *value* itself."""
        return value[0] if isinstance(value, list) else value

    def set_gguf_parameters(self):
        """Write the MoE-related hparams to the GGUF header.

        List-valued ``moe_num_experts`` / ``moe_layer_start_index`` contribute
        only their first element. A list-valued ``moe_intermediate_size``
        contributes element 0 as the expert FFN length and, if present,
        element 1 as the vision expert FFN length.
        """
        super().set_gguf_parameters()

        self.gguf_writer.add_expert_count(self._first_if_list(self.hparams["moe_num_experts"]))
        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
        self.gguf_writer.add_leading_dense_block_count(self._first_if_list(self.hparams["moe_layer_start_index"]))

        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
            if isinstance(moe_intermediate_size, list):
                self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
                if len(moe_intermediate_size) > 1:
                    self.gguf_writer.add_vision_expert_feed_forward_length(moe_intermediate_size[1])
            else:
                self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)

        if (shared_expert_count := self.hparams.get("moe_num_shared_experts")) is not None:
            self.gguf_writer.add_expert_shared_count(shared_expert_count)
            intermediate_size = self.hparams.get("intermediate_size")
            num_key_value_heads = self.hparams.get("num_key_value_heads")
            if shared_expert_count > 0 and intermediate_size is not None and num_key_value_heads is not None:
                self.gguf_writer.add_expert_shared_feed_forward_length(intermediate_size // num_key_value_heads)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero or more GGUF tensors.

        Returns an empty list for tensors that are skipped (vision/resampler)
        or that are being buffered (experts of a block that is not complete
        yet); otherwise a list of ``(gguf_name, tensor)`` pairs.
        """
        # Skip vision and multimodal tensors - they are not part of the text model
        if name.startswith(("vision_model", "resampler_model", "model.vision_model", "model.resampler_model")):
            return []

        # Router weights: `weight` is the text gate, `weight_1` is renamed to the
        # vision gate. Both are transposed before being written.
        # NOTE(review): the transpose presumably compensates for the checkpoint
        # storing the gate as (hidden, n_experts) - confirm against the model code.
        if name.endswith((".mlp.gate.weight", ".mlp.gate.weight_1")):
            if name.endswith(".mlp.gate.weight_1"):
                name = name.replace(".mlp.gate.weight_1", ".mlp.gate.vision.weight")

            data_torch = data_torch.t()
            logger.info("Processing gate.weight/weight_1: %s -> shape %s", name, data_torch.shape)
            return [(self.map_tensor_name(name), data_torch)]

        # The correction bias carries one row per gate: row 0 is emitted as the
        # text bias and row 1 as the vision bias.
        if name.endswith(".mlp.moe_statics.e_score_correction_bias"):
            name_text = name.replace("e_score_correction_bias", "e_score_correction.bias")
            name_vision = name.replace("e_score_correction_bias", "e_score_correction.vision.bias")
            return [
                (self.map_tensor_name(name_text), data_torch[0, :]),
                (self.map_tensor_name(name_vision), data_torch[1, :]),
            ]

        # process the experts separately
        if "mlp.experts" in name:
            n_experts = self.hparams["moe_num_experts"]
            # A list means several expert groups; the block is complete once all of them arrived.
            total_experts = sum(n_experts) if isinstance(n_experts, list) else n_experts

            assert bid is not None
            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            pending = self._experts[bid]
            pending[name] = data_torch

            # Each expert contributes 3 tensors (gate, up, down).
            if len(pending) < total_experts * 3:
                return []
            return self._merge_experts(bid, total_experts)

        return [(self.map_tensor_name(name), data_torch)]

    def _merge_experts(self, bid: int, total_experts: int) -> list[tuple[str, Tensor]]:
        """Stack the buffered expert tensors of block *bid*, one output per projection.

        Experts whose weights share a shape are stacked together in ascending
        expert-id order. The group that contains the lowest expert id is written
        under the regular expert name and a second group (different shape) under
        the ``model.vision.layers`` name, matching ``set_gguf_parameters`` where
        list index 0 is the text entry.

        Raises:
            ValueError: if one projection has more than two distinct shapes,
                which would otherwise yield duplicate output names.
        """
        assert self._experts is not None
        pending = self._experts[bid]
        tensors: list[tuple[str, Tensor]] = []

        for w_name in ("gate_proj", "up_proj", "down_proj"):
            # Walking ids in ascending order keeps every group sorted by expert id
            # and makes dict insertion order equal to "lowest expert id first".
            shape_groups: dict[tuple[int, ...], list[Tensor]] = {}
            for xid in range(total_experts):
                tensor = pending.pop(f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight", None)
                if tensor is not None:
                    shape_groups.setdefault(tuple(tensor.shape), []).append(tensor)

            if len(shape_groups) > 2:
                raise ValueError(
                    f"Block {bid}: {len(shape_groups)} distinct expert shapes for {w_name}, "
                    "expected at most 2 (text and vision)"
                )

            for group_idx, datas in enumerate(shape_groups.values()):
                scope = "model.layers" if group_idx == 0 else "model.vision.layers"
                merged_name = f"{scope}.{bid}.mlp.experts.{w_name}.weight"
                tensors.append((self.map_tensor_name(merged_name), torch.stack(datas, dim=0)))

        return tensors

    def prepare_tensors(self):
        """Run the base preparation, then fail if any buffered expert was never merged."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d]
            if experts:
                raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register(
"Qwen2VLModel",
Expand Down
1 change: 1 addition & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@
#define GGML_ROPE_TYPE_MROPE 8 // binary: 1000
#define GGML_ROPE_TYPE_VISION 24 // binary: 11000
#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
#define GGML_ROPE_TYPE_ERNIE3D 72 // binary: 1001000, ERNIE-VL 3D RoPE (NORMAL rotation + interleaved h/w freq)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ROPE_TYPE system is quite fragile now, and I think we should always think twice before adding a new mode.

It seems like interleaved h/w freq is already supported by the Pixtral model. Please verify one more time whether you can reuse the code from Pixtral instead of adding a new rope kernel here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the heads-up. I completely agree that we should be cautious with the ROPE_TYPE system. I’ll re-examine the Pixtral implementation to see if we can reuse its interleaved frequency logic instead of adding a new kernel.

@isLinXu isLinXu Feb 10, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback. I’ve conducted a detailed mathematical comparison between Pixtral’s build_rope_2d and the ERNIE implementation. It turns out they are mathematically incompatible, and direct reuse would result in incorrect positional embeddings.

Below is the technical breakdown:

| Feature | Pixtral `build_rope_2d` | ERNIE (Vision / LLM) |
| --- | --- | --- |
| Rotation Mode | NORMAL (Adjacent pairs) | NEOX (Half-dimension offset) |
| Freq. Allocation | 2-way Interleaved (via `freq_scale_odd`) | Sectional (2D) / 3-way Interleaved (3D) |
| Theta Accumulation | Continuous across the head | Independent reset per section |
| Dimensionality | 2D (h, w) only | 3D (t, h, w) |
| Implementation | Dual `rope_ext` + concat | `ggml_rope_multi` with mrope 4-slot |

Key Technical Differences:

  1. Mathematical Incompatibility: Pixtral uses NORMAL rotation, whereas ERNIE follows the NEOX convention (commonly used in Vision Transformers). Since the pairing of dimensions differs, swapping them would break the model's spatial understanding.
  2. Frequency Mapping: Pixtral achieves interleaved frequencies by applying a freq_scale to one-half of the dimensions. ERNIE uses sections [20, 20, 0, 0] to strictly block frequencies, where each section starts its theta accumulation independently from $base^0$.

Regarding the complexity of the ROPE_TYPE system:

  • Vision Side: We are actually using the existing GGML_ROPE_TYPE_VISION. No new mode is introduced here.
  • LLM Side: The new GGML_ROPE_TYPE_ERNIE3D is a strict requirement to support the Temporal (t) dimension. Current 2D implementations (like Pixtral) cannot handle this 3D mapping.

Conclusion:

To maintain mathematical correctness and support 3D RoPE, we cannot reuse the Pixtral logic. The new ERNIE3D type is the minimum necessary change to support these specific requirements. I will ensure the implementation is as modular as possible to keep the system maintainable.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the difference is just the normal and neox style, you can also permute the Q and K tensor upon converting to GGUF.

Kimi 2.5 also does exactly this; you can copy the conversion code from #19170.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, just a friendly reminder: we don't allow replying to human maintainers with AI-generated responses. Please write the response in your own words, to prove that you fully understand your code.


#define GGML_MROPE_SECTIONS 4

Expand Down
48 changes: 47 additions & 1 deletion ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5651,6 +5651,43 @@ static void rotate_pairs(const int64_t n, const int64_t n_offset, const float *
}
}

// Fill `cache` with the rope_yarn() outputs for ERNIE-VL 3D RoPE.
//
// Frequency pair k (k = i0/2) takes its position from:
//   - theta_base_h when k <  sections[0] + sections[1] and k is even
//   - theta_base_w when k <  sections[0] + sections[1] and k is odd
//   - theta_base_t when k >= sections[0] + sections[1]
// and is scaled by theta_scale^k, optionally divided by freq_factors[k].
// For every pair two floats are written at cache[i0] and cache[i0 + 1]; the
// second one is multiplied by sin_sign afterwards.
static void ggml_ernie3d_rope_cache_init(
        float theta_base_t, float theta_base_h, float theta_base_w,
        int sections[4],
        float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
        float * cache, float sin_sign, float theta_scale) {
    // number of leading frequency pairs that alternate between h and w
    const int n_hw = sections[0] + sections[1];

    // running value of theta_scale^pair, advanced once per iteration
    float theta_pow = 1.0f;

    for (int64_t i0 = 0; i0 < ne0; i0 += 2, theta_pow *= theta_scale) {
        const int pair = (int)(i0 / 2);

        // default to the temporal position; the first n_hw pairs alternate h / w
        float theta_base = theta_base_t;
        if (pair < n_hw) {
            theta_base = (pair % 2 == 0) ? theta_base_h : theta_base_w;
        }

        const float ff    = freq_factors ? freq_factors[pair] : 1.0f;
        const float theta = theta_base * theta_pow;

        float * out0 = cache + i0;
        float * out1 = out0 + 1;

        rope_yarn(theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, out0, out1);
        *out1 *= sin_sign;
    }
}

template<typename T> //float or ggml_fp16_t
static void ggml_compute_forward_rope_flt(
const ggml_compute_params * params,
Expand Down Expand Up @@ -5723,7 +5760,7 @@ static void ggml_compute_forward_rope_flt(
if (is_vision) {
GGML_ASSERT(n_dims == ne0/2);
}

const bool is_ernie3d = mode == GGML_ROPE_TYPE_ERNIE3D;
const float * freq_factors = NULL;
if (src2 != NULL) {
GGML_ASSERT(src2->type == GGML_TYPE_F32);
Expand All @@ -5745,6 +5782,14 @@ static void ggml_compute_forward_rope_flt(
if (!mrope_used) {
const int64_t p = pos[i2];
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
} else if (is_ernie3d) {
// ERNIE-VL 3D RoPE: interleaved h/w freq with NORMAL rotation
const int64_t p_t = pos[i2];
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
ggml_ernie3d_rope_cache_init(
p_t, p_h, p_w, sections,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
else {
const int64_t p_t = pos[i2];
Expand All @@ -5765,6 +5810,7 @@ static void ggml_compute_forward_rope_flt(

switch (mode) {
case GGML_ROPE_TYPE_NORMAL:
case GGML_ROPE_TYPE_ERNIE3D:
rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
break;
case GGML_ROPE_TYPE_NEOX:
Expand Down
100 changes: 99 additions & 1 deletion ggml/src/ggml-cuda/rope.cu
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,68 @@ static __global__ void rope_multi(const T * x,
dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
}

// ERNIE-VL 3D RoPE kernel.
//
// Each thread handles one adjacent element pair (i0, i0 + 1) of one row:
//   - threadIdx/blockIdx .y select the pair, .x selects the destination row
//   - pairs at or beyond n_dims are copied through unchanged
//   - otherwise the pair is rotated by the angle returned from rope_yarn()
// The position used for pair k = i0/2 is read from `pos` at
//   channel_x + ne2*1 (h) for even k < sections.v[0] + sections.v[1],
//   channel_x + ne2*2 (w) for odd  k < sections.v[0] + sections.v[1],
//   channel_x         (t) otherwise.
template<bool forward, bool has_ff, typename T>
static __global__ void rope_ernie3d(
        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
        const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
        const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
    if (i0 >= ne0) {
        return;
    }

    const int row_dst   = blockDim.x*blockIdx.x + threadIdx.x;
    const int channel_x = row_dst / ne1;
    const int row_x     = row_dst % ne1;

    // adjacent-pair layout: both elements of the pair sit next to each other
    const int ix   = channel_x*s2 + row_x*s1 + i0;
    const int idst = row_dst*ne0 + i0;

    if (i0 >= n_dims) {
        // outside the rotated range: plain copy
        dst[idst + 0] = x[ix + 0];
        dst[idst + 1] = x[ix + 1];
        return;
    }

    const int pair = i0 / 2;
    const int n_hw = sections.v[0] + sections.v[1];

    // position slot inside `pos`: 0 = t, 1 = h, 2 = w
    const int slot = pair >= n_hw ? 0 : (pair % 2 == 0 ? 1 : 2);

    const float theta_base  = pos[channel_x + ne2 * slot] * powf(theta_scale, (float)pair);
    const float freq_factor = has_ff ? freq_factors[pair] : 1.0f;

    float cos_theta;
    float sin_theta;
    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);

    const float x0 = x[ix + 0];
    const float x1 = x[ix + 1];

    dst[idst + 0] = x0*cos_theta - x1*sin_theta;
    dst[idst + 1] = x0*sin_theta + x1*cos_theta;
}

template <bool forward, bool has_ff, typename T>
static __global__ void rope_vision(const T * x,
T * dst,
Expand Down Expand Up @@ -453,6 +515,29 @@ static void rope_multi_cuda(const T * x,
}
}

// Host-side launcher for rope_ernie3d.
//
// Launches one grid column per row (nr) and enough CUDA_ROPE_BLOCK_SIZE-wide
// blocks along y to cover ne0/2 element pairs. The kernel is instantiated with
// has_ff = true only when a freq_factors pointer is supplied.
template<bool forward, typename T>
static void rope_ernie3d_cuda(
        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
        const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
        const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
    GGML_ASSERT(ne0 % 2 == 0);

    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    // every thread covers two elements, so one block spans 2*CUDA_ROPE_BLOCK_SIZE of them
    const int  elems_per_block = 2*CUDA_ROPE_BLOCK_SIZE;
    const dim3 grid(nr, (ne0 + elems_per_block - 1) / elems_per_block, 1);
    const dim3 block(1, CUDA_ROPE_BLOCK_SIZE, 1);

    if (freq_factors != nullptr) {
        rope_ernie3d<forward, true, T><<<grid, block, 0, stream>>>(
            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
            attn_factor, corr_dims, theta_scale, freq_factors, sections);
        return;
    }

    rope_ernie3d<forward, false, T><<<grid, block, 0, stream>>>(
        x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
        attn_factor, corr_dims, theta_scale, freq_factors, sections);
}

template <bool forward, typename T>
static void rope_vision_cuda(const T * x,
T * dst,
Expand Down Expand Up @@ -603,7 +688,20 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
set_rows_stride, stream);
} else {
} else if (is_ernie3d) {
if (src0->type == GGML_TYPE_F32) {
rope_ernie3d_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_ernie3d_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else {
GGML_ABORT("fatal error");
}
}
else {
GGML_ABORT("fatal error");
}
} else if (is_mrope && !is_vision) {
Expand Down
Loading