diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 195904ee20..721edd9994 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -310,8 +310,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         }
     }
 
-    const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs));
-
     const int cc = ggml_cuda_info().devices[device].cc;
 
     switch (K->ne[0]) {
@@ -334,9 +332,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
             if (!gqa_opt_applies) {
                 return BEST_FATTN_KERNEL_NONE;
             }
-            if (!V_is_K_view) {
-                return BEST_FATTN_KERNEL_NONE;
-            }
             break;
         default:
             return BEST_FATTN_KERNEL_NONE;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index b3198b7e3a..16d42c4ae3 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1630,11 +1630,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
                 hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
         cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
 
-        if (!cparams.offload_kqv) {
-            // all nodes between the KV store and the attention output are run on the CPU
-            ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
-        }
-
         ggml_flash_attn_ext_add_sinks(cur, sinks);
         ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);