diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 7dc688483ad..0d50e1a2e28 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2867,11 +2867,13 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; - const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; - const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; - const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; - const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; + const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; + const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; + const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; + const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; + const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; + const std::string qwen3next_diag_mask = "diag_mask"; + const std::string delta_net_linear_attn_prefix = "new_state-"; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2894,15 +2896,16 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { #endif } - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && + if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && - strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { + strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 && + strncmp(node->name, qwen3next_diag_mask.c_str(), qwen3next_diag_mask.size()) != 0 && + strncmp(node->name, delta_net_linear_attn_prefix.c_str(), delta_net_linear_attn_prefix.size()) != 0) { // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation // by means of matching node names. See // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 99b1a76a485..aa74393fc4c 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -22,6 +22,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity); + ggml_set_name(diag_mask, "diag_mask"); ggml_build_forward_expand(gf, causal_mask); ggml_build_forward_expand(gf, identity);