diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 16d42c4ae3..acc13b0bee 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,113 +533,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } -void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { - const auto * attn_ctx = mctx->get_attn(); - - // base tensors may not be allocated if there are no non-SWA attention layers - if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { - attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); - attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); - - attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); - } - - // swa tensors may not be allocated if there are no SWA attention layers - if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { - attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch); - attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); - - attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); - } - - const int64_t n_rs = mctx->get_recr()->get_n_rs(); - - if (inp_rs->s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); - int32_t * data = (int32_t *) inp_rs->s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_rs; ++i) { - data[i] = mctx->get_recr()->s_copy(i); - } - } -} - -bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) { - const auto * mctx = static_cast(params.mctx); - - this->mctx = mctx; - - bool res = true; - - const auto * attn_ctx = mctx->get_attn(); - - // base tensors may not be allocated if there are no non-SWA attention layers - if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { - res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; - //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv(); - res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens; - } - - // swa tensors may not be allocated if there are no SWA attention layers - if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { - res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; - //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv(); - res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens; - } - - res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); - - res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; - res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; - - res &= inp_rs->head == mctx->get_recr()->get_head(); - res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); - - return res; -} - -void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) { - // set the inputs only for the active samplers in the current ubatch - std::unordered_set active_samplers; - for (uint32_t i = 0; i < ubatch->n_tokens; i++) { - if (ubatch->output[i]) { - llama_seq_id seq_id = ubatch->seq_id[i][0]; - active_samplers.insert(seq_id); - } - } - - for (auto seq_id : active_samplers) { - if (samplers.find(seq_id) == samplers.end()) { - continue; - } - - auto & sampler = samplers[seq_id]; - - if (sampler->iface->backend_set_input) { - sampler->iface->backend_set_input(sampler); - } - } -} - -bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) { - if (samplers.size() != params.samplers.size()) { - return false; - } - - for (const auto & [seq_id, sampler] : params.samplers) { - if (samplers[seq_id] != sampler) { - return false; - } - } - - return true; -} - // // llm_graph_result //