Skip to content

Commit 0b2acfc

Browse files
committed
cont: fix gate ordering
1 parent 70b8802 commit 0b2acfc

File tree

5 files changed: +22 additions, -14 deletions

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2030,7 +2030,15 @@ static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up, const ggml
20302030
return false;
20312031
}
20322032

2033-
if (glu->src[0] != ffn_up && glu->src[1] != ffn_gate) {
2033+
if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
2034+
return false;
2035+
}
2036+
2037+
if (ggml_get_glu_op(glu) != GGML_GLU_OP_SWIGLU) {
2038+
return false;
2039+
}
2040+
2041+
if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
20342042
return false;
20352043
}
20362044

@@ -2938,11 +2946,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29382946
return false;
29392947
}
29402948

2941-
const ggml_tensor * ffn_up = cgraph->nodes[node_idx];
2942-
const ggml_tensor * ffn_gate = cgraph->nodes[node_idx+1];
2949+
const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
2950+
const ggml_tensor * ffn_up = cgraph->nodes[node_idx+1];
29432951
const ggml_tensor * glu = cgraph->nodes[node_idx+2];
29442952

2945-
if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
2953+
if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
29462954
return true;
29472955
}
29482956
}
@@ -3088,22 +3096,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
30883096

30893097
for (ggml_op op : {GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID}) {
30903098
if (ggml_cuda_can_fuse(cgraph, i, {op, op, GGML_OP_GLU}, {})) {
3091-
const ggml_tensor * up = cgraph->nodes[i];
3092-
const ggml_tensor * gate = cgraph->nodes[i+1]->src[0];
30933099
ggml_tensor * glu = cgraph->nodes[i+2];
3100+
ggml_tensor * gate = glu->src[0];
3101+
ggml_tensor * up = glu->src[1];
30943102

30953103
const ggml_tensor * src0 = up->src[0];
30963104
const ggml_tensor * src1 = up->src[1];
30973105
const ggml_tensor * ids = up->src[2];
30983106

30993107
if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
3100-
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, gate, glu);
3108+
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu);
31013109
fused_mul_mat_vec = true;
31023110
break;
31033111
}
31043112

31053113
if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
3106-
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, gate, glu);
3114+
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu);
31073115
fused_mul_mat_vec = true;
31083116
break;
31093117
}

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ static __global__ void mul_mat_vec_f(
197197
}
198198

199199
if constexpr (has_gate) {
200-
dst[tid*stride_col_dst + row] = op(sumf[tid]) * sumf_gate[tid];
200+
dst[tid*stride_col_dst + row] = sumf[tid] * op(sumf_gate[tid]);
201201
} else {
202202
dst[tid*stride_col_dst + row] = sumf[tid];
203203
}

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ static __global__ void mul_mat_vec_q(
242242
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
243243
float result = tmp[j][threadIdx.x];
244244
if constexpr (has_gate) {
245-
result = op(result) * tmp_gate[j][threadIdx.x];
245+
result = result * op(tmp_gate[j][threadIdx.x]);
246246
}
247247
dst[j*stride_col_dst + threadIdx.x] = result;
248248
}

ggml/src/ggml-cuda/unary.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,4 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
8080

8181
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
8282
return x / (1.0f + expf(-x));
83-
}
83+
}

tests/test-backend-ops.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4630,7 +4630,7 @@ struct test_fused_ffn_gate : public test_case {
46304630
ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
46314631
ggml_tensor * ffn_gate = ggml_mul_mat(ctx, gate, cur);
46324632

4633-
ggml_tensor * out = ggml_glu_split(ctx, ffn_up, ffn_gate, glu_op);
4633+
ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op);
46344634

46354635
ggml_set_name(out, "out");
46364636
return out;
@@ -4652,10 +4652,10 @@ struct test_fused_ffn_gate : public test_case {
46524652
ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, this->b ? 1 : n_used, m);
46534653
ggml_set_name(cur, "cur");
46544654

4655-
ggml_tensor * ffn_gate = ggml_mul_mat_id(ctx, gates, cur, ids);
46564655
ggml_tensor * ffn_up = ggml_mul_mat_id(ctx, ups, cur, ids);
4656+
ggml_tensor * ffn_gate = ggml_mul_mat_id(ctx, gates, cur, ids);
46574657

4658-
ggml_tensor * out = ggml_glu_split(ctx, ffn_up, ffn_gate, glu_op);
4658+
ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op);
46594659

46604660
ggml_set_name(out, "out");
46614661
return out;

Comments: 0 commit comments