ggml-org · isLinXu · Feb 9, 2026 · Feb 9, 2026 · Feb 10, 2026 · Feb 10, 2026
@@ -3814,6 +3814,171 @@ def prepare_tensors(self):
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
 
+@ModelBase.register("Ernie4_5_VLMoeForConditionalGeneration")
+class Ernie4_5VLMoeModel(Ernie4_5MoeModel):
+    """Convert the text-side weights of an ERNIE-4.5-VL MoE checkpoint.
+
+    On top of ``Ernie4_5MoeModel`` this class:
+
+    * accepts list-valued MoE hyper-parameters (index 0 is written as the main
+      value; a second ``moe_intermediate_size`` entry is written as the vision
+      expert feed-forward length),
+    * drops vision/resampler tensors,
+    * emits the router gate and its score-correction bias as separate text and
+      vision tensors,
+    * stacks the per-expert projection weights of each block into merged
+      tensors, one per distinct expert shape.
+    """
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5_VL_MOE
+    # One dict per block: original tensor name -> routed-expert tensor that is
+    # still waiting to be merged.
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._experts = [{} for _ in range(self.block_count)]
+
+    @staticmethod
+    def _first_if_list(value):
+        """Return ``value[0]`` when ``value`` is a list, otherwise ``value`` unchanged."""
+        return value[0] if isinstance(value, list) else value
+
+    def set_gguf_parameters(self):
+        """Write the MoE hyper-parameters, unwrapping list-valued entries.
+
+        Raises:
+            KeyError: if ``moe_num_experts``, ``moe_k``, ``moe_layer_interval``
+                or ``moe_layer_start_index`` is missing from ``self.hparams``.
+        """
+        # NOTE(review): the parent implementation runs first; if it already
+        # writes some of the keys below, they are written a second time here
+        # - confirm that the writer tolerates that.
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        # List-valued settings: the first entry is the one written here.
+        self.gguf_writer.add_expert_count(self._first_if_list(hparams["moe_num_experts"]))
+        self.gguf_writer.add_expert_used_count(hparams["moe_k"])
+        self.gguf_writer.add_interleave_moe_layer_step(hparams["moe_layer_interval"])
+        self.gguf_writer.add_leading_dense_block_count(self._first_if_list(hparams["moe_layer_start_index"]))
+
+        if (moe_intermediate_size := hparams.get("moe_intermediate_size")) is not None:
+            if isinstance(moe_intermediate_size, list):
+                self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+                # A second entry is stored as the vision expert width.
+                if len(moe_intermediate_size) > 1:
+                    self.gguf_writer.add_vision_expert_feed_forward_length(moe_intermediate_size[1])
+            else:
+                self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+        if (shared_expert_count := hparams.get('moe_num_shared_experts')) is not None:
+            self.gguf_writer.add_expert_shared_count(shared_expert_count)
+            shared_ffn_size = hparams.get('intermediate_size')
+            n_kv_heads = hparams.get('num_key_value_heads')
+            # NOTE(review): the shared-expert width is derived as
+            # intermediate_size // num_key_value_heads - confirm this matches the checkpoint.
+            if shared_expert_count > 0 and shared_ffn_size is not None and n_kv_heads is not None:
+                self.gguf_writer.add_expert_shared_feed_forward_length(shared_ffn_size // n_kv_heads)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Map one checkpoint tensor to zero or more output tensors.
+
+        Returns:
+            ``(mapped_name, tensor)`` pairs. The result is empty for skipped
+            vision/resampler tensors and for routed-expert weights that are
+            buffered until every expert of the block has been seen.
+
+        Raises:
+            ValueError: if the score-correction bias does not have two rows,
+                or if the experts of a block have more than two distinct shapes.
+        """
+        # Skip vision and multimodal tensors - they are not part of the text model
+        if name.startswith(("vision_model", "resampler_model",
+                            "model.vision_model", "model.resampler_model")):
+            return []
+
+        # Router gate: `weight` is the text router, `weight_1` is renamed to the
+        # vision router. Both are transposed before being written.
+        # todo(megemini): gate_inp weight/weight_1
+        if name.endswith((".mlp.gate.weight", ".mlp.gate.weight_1")):
+            if name.endswith(".mlp.gate.weight_1"):
+                name = name.replace(".mlp.gate.weight_1", ".mlp.gate.vision.weight")
+
+            data_torch = data_torch.t()
+            logger.info("Processing gate.weight/weight_1: %s -> shape %s", name, data_torch.shape)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        # Score-correction bias: row 0 becomes the text bias, row 1 the vision bias.
+        # todo(megemini): e_score_correction.bias/bias_1 for weight/weight_1
+        if name.endswith(".mlp.moe_statics.e_score_correction_bias"):
+            bias_shape = tuple(data_torch.shape)
+            if len(bias_shape) != 2 or bias_shape[0] < 2:
+                raise ValueError(
+                    f"{name}: expected a 2-D tensor with at least 2 rows (text, vision), got shape {bias_shape}")
+
+            name_text = name.replace("e_score_correction_bias", "e_score_correction.bias")
+            name_vision = name.replace("e_score_correction_bias", "e_score_correction.vision.bias")
+            return [(self.map_tensor_name(name_text), data_torch[0, :]),
+                    (self.map_tensor_name(name_vision), data_torch[1, :])]
+
+        # Routed experts are buffered per block and merged once all of them arrived.
+        # Shared experts use a different name and fall through to the default mapping.
+        if "mlp.experts" in name:
+            n_experts = self.hparams["moe_num_experts"]
+            # A list holds one count per expert group; the block is complete
+            # only when the experts of every group have been collected.
+            total_experts = sum(n_experts) if isinstance(n_experts, list) else n_experts
+
+            assert bid is not None
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            pending = self._experts[bid]
+            pending[name] = data_torch
+
+            # Each routed expert contributes 3 tensors (gate, up, down).
+            if len(pending) < total_experts * 3:
+                return []
+            return self._merge_experts(bid, total_experts)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def _merge_experts(self, bid: int, total_experts: int) -> list[tuple[str, Tensor]]:
+        """Stack the buffered expert weights of block ``bid`` into merged tensors.
+
+        Experts are grouped by tensor shape because expert groups may use
+        different intermediate sizes and cannot be stacked together. Groups are
+        ordered by their lowest expert id: the first group is written under the
+        regular expert name, the second under the vision expert name. (Ordering
+        by group size instead would swap the two names whenever the second
+        group is the larger one.)
+
+        Raises:
+            ValueError: if one projection has more than two distinct shapes,
+                which would produce several tensors with the same vision name.
+        """
+        assert self._experts is not None
+        pending = self._experts[bid]
+        tensors: list[tuple[str, Tensor]] = []
+
+        for w_name in ("gate_proj", "up_proj", "down_proj"):
+            # Walking expert ids in ascending order keeps every group sorted by
+            # id and makes the dict's insertion order "lowest expert id first".
+            shape_groups: dict[tuple[int, ...], list[Tensor]] = {}
+            for xid in range(total_experts):
+                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                tensor = pending.pop(ename, None)
+                if tensor is not None:
+                    shape_groups.setdefault(tuple(tensor.shape), []).append(tensor)
+
+            if len(shape_groups) > 2:
+                raise ValueError(
+                    f"Block {bid}: {w_name} experts have {len(shape_groups)} distinct shapes "
+                    f"({list(shape_groups)}); at most 2 (text, vision) are supported")
+
+            for group_idx, datas in enumerate(shape_groups.values()):
+                if group_idx == 0:
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                else:
+                    merged_name = f"model.vision.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                tensors.append((self.map_tensor_name(merged_name), torch.stack(datas, dim=0)))
+
+        return tensors
+
+    def prepare_tensors(self):
+        """Run the base preparation, then fail if any buffered expert was never merged.
+
+        Raises:
+            ValueError: listing the names of expert tensors left in the buffer.
+        """
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d]
+            if experts:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 
 @ModelBase.register(
     "Qwen2VLModel",

@@ -252,6 +252,7 @@
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24
 #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
+#define GGML_ROPE_TYPE_ERNIE3D 72 // binary: 1001000, ERNIE-VL 3D RoPE (NORMAL rotation + interleaved h/w freq)
 
 #define GGML_MROPE_SECTIONS   4
 

@@ -5651,6 +5651,43 @@ static void rotate_pairs(const int64_t n, const int64_t n_offset, const float *
   }
 }
 
+// Fill `cache` with one pair of values per frequency for ERNIE-VL 3D RoPE.
+// The first sections[0] + sections[1] frequencies alternate between the
+// height position (even index) and the width position (odd index); every
+// remaining frequency uses the temporal position. The second value of each
+// pair is multiplied by `sin_sign`.
+static void ggml_ernie3d_rope_cache_init(
+     float theta_base_t, float theta_base_h, float theta_base_w,
+     int sections[4],
+     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // number of leading frequencies that interleave h and w
+    const int n_interleaved = sections[0] + sections[1];
+
+    // running value of theta_scale^pair, advanced once per iteration
+    float scale_pow = 1.0f;
+
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2, scale_pow *= theta_scale) {
+        const int pair = (int)(i0 / 2);
+
+        // choose which position drives this frequency
+        float base;
+        if (pair >= n_interleaved) {
+            base = theta_base_t; // temporal
+        } else if (pair % 2 == 0) {
+            base = theta_base_h; // even -> height
+        } else {
+            base = theta_base_w; // odd -> width
+        }
+
+        const float theta = base * scale_pow;
+        const float ff    = freq_factors ? freq_factors[pair] : 1.0f;
+
+        float * out0 = &cache[i0 + 0];
+        float * out1 = &cache[i0 + 1];
+
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, out0, out1
+        );
+        *out1 *= sin_sign;
+    }
+}
+
 template<typename T> //float or ggml_fp16_t
 static void ggml_compute_forward_rope_flt(
         const ggml_compute_params * params,
@@ -5723,7 +5760,7 @@ static void ggml_compute_forward_rope_flt(
     if (is_vision) {
         GGML_ASSERT(n_dims == ne0/2);
     }
-
+    const bool is_ernie3d = mode == GGML_ROPE_TYPE_ERNIE3D;
     const float * freq_factors = NULL;
     if (src2 != NULL) {
         GGML_ASSERT(src2->type == GGML_TYPE_F32);
@@ -5745,6 +5782,14 @@ static void ggml_compute_forward_rope_flt(
             if (!mrope_used) {
                 const int64_t p = pos[i2];
                 ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            } else if (is_ernie3d) {
+                // ERNIE-VL 3D RoPE: interleaved h/w freq with NORMAL rotation
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                ggml_ernie3d_rope_cache_init(
+                    p_t, p_h, p_w, sections,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
             }
             else {
                 const int64_t p_t = pos[i2];
@@ -5765,6 +5810,7 @@ static void ggml_compute_forward_rope_flt(
 
                 switch (mode) {
                     case GGML_ROPE_TYPE_NORMAL:
+                    case GGML_ROPE_TYPE_ERNIE3D:
                         rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
                         break;
                     case GGML_ROPE_TYPE_NEOX:

@@ -264,6 +264,68 @@ static __global__ void rope_multi(const T *            x,
     dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
+// ERNIE-VL 3D RoPE kernel. Each thread handles one adjacent pair
+// (x[i0], x[i0+1]) of one row. Pairs at or beyond n_dims are copied through
+// unchanged. For rotated pairs, the first sections.v[0] + sections.v[1]
+// frequencies alternate between the height (even) and width (odd) position;
+// the remaining frequencies use the temporal position.
+template<bool forward, bool has_ff, typename T>
+static __global__ void rope_ernie3d(
+        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
+        const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row  = blockDim.x*blockIdx.x + threadIdx.x;
+    const int chan = row / ne1;
+
+    // destination rows are packed (stride ne0); the source uses strides s1/s2
+    const int out_idx = row*ne0 + i0;
+    const int in_idx  = chan*s2 + (row % ne1)*s1 + i0;
+
+    if (i0 >= n_dims) {
+        // outside the rotated range: plain copy
+        dst[out_idx + 0] = x[in_idx + 0];
+        dst[out_idx + 1] = x[in_idx + 1];
+        return;
+    }
+
+    // index of the frequency this pair belongs to
+    const int pair = i0 / 2;
+    // number of leading frequencies that interleave h and w
+    const int n_hw = sections.v[0] + sections.v[1];
+
+    // position slot inside `pos`: 0 = temporal, 1 = height, 2 = width
+    const int slot = pair >= n_hw ? 0 : (pair % 2 == 0 ? 1 : 2);
+
+    const float theta_base = pos[chan + ne2 * slot] * powf(theta_scale, (float)pair);
+    const float ff         = has_ff ? freq_factors[pair] : 1.0f;
+
+    float c;
+    float s;
+    rope_yarn<forward>(theta_base/ff, freq_scale, corr_dims, i0, ext_factor, attn_factor, c, s);
+
+    // rotate the adjacent pair
+    const float a = x[in_idx + 0];
+    const float b = x[in_idx + 1];
+
+    dst[out_idx + 0] = a*c - b*s;
+    dst[out_idx + 1] = a*s + b*c;
+}
+
 template <bool forward, bool has_ff, typename T>
 static __global__ void rope_vision(const T *            x,
                                    T *                  dst,
@@ -453,6 +515,29 @@ static void rope_multi_cuda(const T *            x,
     }
 }
 
+// Launch rope_ernie3d on `stream`: one block row per tensor row (nr) and
+// enough blocks along y to cover ne0/2 element pairs. The kernel is
+// specialised on whether frequency factors were supplied.
+template<bool forward, typename T>
+static void rope_ernie3d_cuda(
+        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
+        const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+
+    // each thread covers 2 elements, so a block covers 2*CUDA_ROPE_BLOCK_SIZE of ne0
+    const int  n_blocks_y = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 grid(nr, n_blocks_y, 1);
+    const dim3 block(1, CUDA_ROPE_BLOCK_SIZE, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors != nullptr) {
+        rope_ernie3d<forward, true, T><<<grid, block, 0, stream>>>(
+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    } else {
+        rope_ernie3d<forward, false, T><<<grid, block, 0, stream>>>(
+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    }
+}
+
 template <bool forward, typename T>
 static void rope_vision_cuda(const T *            x,
                              T *                  dst,
@@ -603,7 +688,20 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
                                                 s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
                                                 ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
                                                 set_rows_stride, stream);
-        } else {
+        } else if (is_ernie3d) {
+            if (src0->type == GGML_TYPE_F32) {
+                rope_ernie3d_cuda<forward>(
+                    (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
+                    freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+            } else if (src0->type == GGML_TYPE_F16) {
+                rope_ernie3d_cuda<forward>(
+                    (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
+                    freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+            } else {
+                GGML_ABORT("fatal error");
+            }
+        }
+        else {
             GGML_ABORT("fatal error");
         }
     } else if (is_mrope && !is_vision) {