Commit 39da6fe

cont: fix gate ordering
1 parent 70b8802 commit 39da6fe

File tree

6 files changed, +96 -29 lines changed


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 16 additions & 8 deletions
@@ -2030,7 +2030,15 @@ static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up, const ggml
         return false;
     }
 
-    if (glu->src[0] != ffn_up && glu->src[1] != ffn_gate) {
+    if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
+        return false;
+    }
+
+    if (ggml_get_glu_op(glu) != GGML_GLU_OP_SWIGLU) {
+        return false;
+    }
+
+    if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
         return false;
     }
 
@@ -2938,11 +2946,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
             return false;
         }
 
-        const ggml_tensor * ffn_up = cgraph->nodes[node_idx];
-        const ggml_tensor * ffn_gate = cgraph->nodes[node_idx+1];
+        const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
+        const ggml_tensor * ffn_up = cgraph->nodes[node_idx+1];
         const ggml_tensor * glu = cgraph->nodes[node_idx+2];
 
-        if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
+        if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
            return true;
        }
    }
@@ -3088,22 +3096,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
             for (ggml_op op : {GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID}) {
                 if (ggml_cuda_can_fuse(cgraph, i, {op, op, GGML_OP_GLU}, {})) {
-                    const ggml_tensor * up = cgraph->nodes[i];
-                    const ggml_tensor * gate = cgraph->nodes[i+1]->src[0];
                     ggml_tensor * glu = cgraph->nodes[i+2];
+                    ggml_tensor * gate = glu->src[0];
+                    ggml_tensor * up = glu->src[1];
 
                     const ggml_tensor * src0 = up->src[0];
                     const ggml_tensor * src1 = up->src[1];
                     const ggml_tensor * ids = up->src[2];
 
                     if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
-                        ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, gate, glu);
+                        ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu);
                         fused_mul_mat_vec = true;
                         break;
                     }
 
                     if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
-                        ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, gate, glu);
+                        ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu);
                         fused_mul_mat_vec = true;
                         break;
                     }
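
The checks above pin down the exact pattern the fused path accepts: the gate matmul at node_idx (glu->src[0]), the up matmul at node_idx+1 (glu->src[1]), a GGML_GLU_OP_SWIGLU op whose operands are not swapped, and the gate weight handed to the kernel via gate->src[0]. As a scalar reference for what the fused path computes (a sketch for illustration only, not the CUDA kernel; fused_swiglu is a hypothetical helper):

    #include <math.h>

    // Matches ggml_cuda_op_silu_single in unary.cuh.
    static float silu(float x) {
        return x / (1.0f + expf(-x));
    }

    // One output element of the fused FFN gate: the activation is applied to
    // the gate projection, and the result scales the up projection.
    static float fused_swiglu(float ffn_up, float ffn_gate) {
        return ffn_up * silu(ffn_gate); // mirrors sumf[tid] * op(sumf_gate[tid]) below
    }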

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ static __global__ void mul_mat_vec_f(
         }
 
         if constexpr (has_gate) {
-            dst[tid*stride_col_dst + row] = op(sumf[tid]) * sumf_gate[tid];
+            dst[tid*stride_col_dst + row] = sumf[tid] * op(sumf_gate[tid]);
         } else {
             dst[tid*stride_col_dst + row] = sumf[tid];
         }

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ static __global__ void mul_mat_vec_q(
         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
             float result = tmp[j][threadIdx.x];
             if constexpr (has_gate) {
-                result = op(result) * tmp_gate[j][threadIdx.x];
+                result = result * op(tmp_gate[j][threadIdx.x]);
             }
             dst[j*stride_col_dst + threadIdx.x] = result;
         }
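
In both kernels the operand order changes results, not just naming: with ffn_up = 2 and ffn_gate = -1, the corrected epilogue yields 2 * silu(-1) ≈ 2 * (-0.269) ≈ -0.538, while the old op(result) * gate form computed silu(2) * (-1) ≈ -1.762.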

ggml/src/ggml-cuda/unary.cuh

Lines changed: 1 addition & 1 deletion
@@ -80,4 +80,4 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 __device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
     return x / (1.0f + expf(-x));
-}
+}

ggml/src/ggml-impl.h

Lines changed: 74 additions & 15 deletions
@@ -567,7 +567,11 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
-static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
+static inline bool ggml_node_has_n_uses_impl(
+        const struct ggml_cgraph * cgraph,
+        int node_idx,
+        int32_t n_uses,
+        bool allow_views) {
     const struct ggml_tensor * node = cgraph->nodes[node_idx];
 
     // check the use count against how many we're replacing
@@ -579,7 +583,14 @@ static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int n
     // if node is a view, some other node might be using the intermediate result
     // via the view source.
     if (node->view_src) {
-        return false;
+        if (!allow_views) {
+            return false;
+        }
+
+        size_t src_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node->view_src);
+        if (!ggml_bitset_get(cgraph->visited_hash_set.used, src_hash_pos) || cgraph->use_counts[src_hash_pos] != 1) {
+            return false;
+        }
     }
 
     // If the user requested output for the node, can't fuse
@@ -590,35 +601,83 @@ static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int n
     return true;
 }
 
+static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
+    return ggml_node_has_n_uses_impl(cgraph, node_idx, n_uses, false);
+}
+
 // Returns true if nodes with indices { node_idxs } are the sequence of ggml_ops in ops[]
 // and are fusable. Nodes are considered fusable according to this function if:
-// - all nodes except the last have only one use and are not views/outputs (see ggml_node_has_N_uses).
-// - all nodes except the last are a src of the following node.
-// - all nodes are the same shape.
+// - all nodes except the last have only one use and their consumers are inside the fusion set.
+// - dependencies between nodes follow the order provided in node_idxs.
 // TODO: Consider allowing GGML_OP_NONE nodes in between
 static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const int * node_idxs, const enum ggml_op * ops, int num_ops) {
+    GGML_ASSERT(num_ops <= 32);
+
+    if (num_ops <= 0) {
+        return false;
+    }
+
+    struct ggml_tensor * nodes[32] = {0};
+
     for (int i = 0; i < num_ops; ++i) {
-        if (node_idxs[i] >= cgraph->n_nodes) {
+        const int idx = node_idxs[i];
+        if (idx >= cgraph->n_nodes) {
             return false;
         }
 
-        struct ggml_tensor * node = cgraph->nodes[node_idxs[i]];
-        if (node->op != ops[i]) {
-            return false;
-        }
-        if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
+        nodes[i] = cgraph->nodes[idx];
+        if (nodes[i]->op != ops[i]) {
             return false;
         }
-        if (i > 0) {
-            struct ggml_tensor * prev = cgraph->nodes[node_idxs[i - 1]];
-            if (node->src[0] != prev && node->src[1] != prev) {
+    }
+
+    for (int i = 0; i < num_ops; ++i) {
+        struct ggml_tensor * node = nodes[i];
+
+        if (i < num_ops - 1) {
+            const bool allow_views = node->view_src != NULL;
+            if (!ggml_node_has_n_uses_impl(cgraph, node_idxs[i], 1, allow_views)) {
                 return false;
             }
-            if (!ggml_are_same_shape(node, prev)) {
+        }
+
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            struct ggml_tensor * src = node->src[j];
+            if (!src) {
+                continue;
+            }
+
+            int src_pos = -1;
+            for (int k = 0; k < num_ops; ++k) {
+                if (nodes[k] == src) {
+                    src_pos = k;
+                    break;
+                }
+            }
+
+            if (src_pos != -1 && src_pos >= i) {
                 return false;
             }
         }
     }
+
+    for (int i = 0; i < num_ops - 1; ++i) {
+        bool has_consumer = false;
+        for (int k = i + 1; k < num_ops && !has_consumer; ++k) {
+            struct ggml_tensor * consumer = nodes[k];
+            for (int s = 0; s < GGML_MAX_SRC; ++s) {
+                if (consumer->src[s] == nodes[i]) {
+                    has_consumer = true;
+                    break;
+                }
+            }
+        }
+
+        if (!has_consumer) {
+            return false;
+        }
+    }
+
     return true;
 }
 
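The generalized check drops the old requirements that each node feed the next and that all nodes share a shape; instead, dependencies inside the set must respect the order of node_idxs, and every non-final node must be consumed within the set. A hypothetical caller, mirroring how the CUDA backend probes the MUL_MAT/MUL_MAT/GLU pattern above (cgraph and i come from the surrounding backend loop; idxs and ops are illustrative names):

    // Sketch: do nodes i, i+1, i+2 form a fusable mul_mat -> mul_mat -> glu
    // sequence? Under the new rules the two matmuls need not feed each other;
    // both just have to be consumed by the GLU node.
    const int          idxs[3] = { i, i + 1, i + 2 };
    const enum ggml_op ops[3]  = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };

    if (ggml_can_fuse_ext(cgraph, idxs, ops, 3)) {
        // safe to replace the three nodes with a single fused kernel
    }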

tests/test-backend-ops.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4630,7 +4630,7 @@ struct test_fused_ffn_gate : public test_case {
         ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
         ggml_tensor * ffn_gate = ggml_mul_mat(ctx, gate, cur);
 
-        ggml_tensor * out = ggml_glu_split(ctx, ffn_up, ffn_gate, glu_op);
+        ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op);
 
         ggml_set_name(out, "out");
         return out;
@@ -4652,10 +4652,10 @@
         ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, this->b ? 1 : n_used, m);
         ggml_set_name(cur, "cur");
 
-        ggml_tensor * ffn_gate = ggml_mul_mat_id(ctx, gates, cur, ids);
         ggml_tensor * ffn_up = ggml_mul_mat_id(ctx, ups, cur, ids);
+        ggml_tensor * ffn_gate = ggml_mul_mat_id(ctx, gates, cur, ids);
 
-        ggml_tensor * out = ggml_glu_split(ctx, ffn_up, ffn_gate, glu_op);
+        ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op);
 
         ggml_set_name(out, "out");
         return out;
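
The test changes encode the same convention end to end: ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op) places the gate, which receives the activation, in src[0] and the up projection, which it scales, in src[1], matching the ordering the CUDA fusion path now detects.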
