diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 674d06c8910..4cc9412c24d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7390,6 +7390,95 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
     }
 
+#if defined(GGML_USE_HIP)
+    // QKV weight fusion: concat wq/wk/wv into wqkv for supported architectures
+    // This reduces 3 kernel launches to 1 per layer during inference
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_LLAMA_EMBED) {
+        const uint32_t n_layer_count = hparams.n_layer;
+
+        LLAMA_LOG_INFO("%s: fusing QKV weights for %u layers\n", __func__, n_layer_count);
+
+        for (uint32_t il = 0; il < n_layer_count; ++il) {
+            auto & layer = layers[il];
+
+            // skip if already fused or if any weight is missing
+            if (layer.wqkv || !layer.wq || !layer.wk || !layer.wv) {
+                continue;
+            }
+
+            // skip if types don't match (safety check)
+            if (layer.wq->type != layer.wk->type || layer.wq->type != layer.wv->type) {
+                LLAMA_LOG_WARN("%s: layer %u: QKV type mismatch, skipping fusion\n", __func__, il);
+                continue;
+            }
+
+            const int64_t n_embd    = layer.wq->ne[0]; // inner dimension (K)
+            const int64_t n_out_q   = layer.wq->ne[1]; // Q output rows
+            const int64_t n_out_k   = layer.wk->ne[1]; // K output rows
+            const int64_t n_out_v   = layer.wv->ne[1]; // V output rows
+            const int64_t n_out_qkv = n_out_q + n_out_k + n_out_v;
+            const ggml_type wtype   = layer.wq->type;
+
+            // compute row size in bytes for the quantized type
+            const size_t row_size   = ggml_row_size(wtype, n_embd);
+            const size_t qkv_nbytes = row_size * n_out_qkv;
+
+            // allocate host buffer for concat
+            std::vector<uint8_t> buf(qkv_nbytes);
+
+            // copy Q rows
+            ggml_backend_tensor_get(layer.wq, buf.data(), 0, row_size * n_out_q);
+            // copy K rows after Q
+            ggml_backend_tensor_get(layer.wk, buf.data() + row_size * n_out_q, 0, row_size * n_out_k);
+            // copy V rows after K
+            ggml_backend_tensor_get(layer.wv, buf.data() + row_size * (n_out_q + n_out_k), 0, row_size * n_out_v);
+
+            // allocate a new buffer for wqkv
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(layer.wq->buffer);
+            ggml_backend_buffer_t qkv_buf = ggml_backend_buft_alloc_buffer(buft, qkv_nbytes + 512);
+            if (!qkv_buf) {
+                LLAMA_LOG_WARN("%s: layer %u: failed to allocate QKV buffer, skipping fusion\n", __func__, il);
+                continue;
+            }
+
+            // create a standalone ggml context for the qkv tensor
+            struct ggml_init_params ctx_params = {
+                /*.mem_size   =*/ ggml_tensor_overhead(),
+                /*.mem_buffer =*/ nullptr,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * ctx_qkv = ggml_init(ctx_params);
+            if (!ctx_qkv) {
+                ggml_backend_buffer_free(qkv_buf);
+                LLAMA_LOG_WARN("%s: layer %u: failed to create QKV context, skipping fusion\n", __func__, il);
+                continue;
+            }
+
+            ggml_tensor * wqkv = ggml_new_tensor_2d(ctx_qkv, wtype, n_embd, n_out_qkv);
+            ggml_set_name(wqkv, (std::string("blk.") + std::to_string(il) + ".attn_qkv.weight").c_str());
+
+            // allocate the tensor in the buffer
+            ggml_backend_tensor_alloc(qkv_buf, wqkv, ggml_backend_buffer_get_base(qkv_buf));
+
+            // copy concatenated data to device
+            ggml_backend_tensor_set(wqkv, buf.data(), 0, qkv_nbytes);
+
+            // store the fused tensor
+            layer.wqkv = wqkv;
+
+            // keep the context and buffer alive
+            pimpl->ctxs_bufs.emplace_back(ggml_context_ptr(ctx_qkv), std::vector<ggml_backend_buffer_ptr>());
+            pimpl->ctxs_bufs.back().second.emplace_back(qkv_buf);
+
+            if (il == 0) {
+                LLAMA_LOG_INFO("%s: QKV fused: Q[%lld,%lld] + K[%lld,%lld] + V[%lld,%lld] -> QKV[%lld,%lld] (%s)\n",
+                        __func__, (long long)n_embd, (long long)n_out_q, (long long)n_embd, (long long)n_out_k, (long long)n_embd, (long long)n_out_v,
+                        (long long)n_embd, (long long)n_out_qkv, ggml_type_name(wtype));
+            }
+        }
+    }
+#endif // GGML_USE_HIP
+
     return true;
 }
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index 42b5fcdf42e..e85d28a425c 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -43,27 +43,78 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_gra
         ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
         // compute Q and K and RoPE them
-        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-        cb(Qcur, "Qcur", il);
-        if (model.layers[il].bq) {
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+        ggml_tensor * Qcur;
+        ggml_tensor * Kcur;
+        ggml_tensor * Vcur;
+
+#if defined(GGML_USE_HIP)
+        if (model.layers[il].wqkv) {
+            // fused QKV path: single mul_mat + view_3d split
+            ggml_tensor * qkv_cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            cb(qkv_cur, "qkv_cur", il);
+
+            const int64_t n_embd_q    = model.layers[il].wq ? model.layers[il].wq->ne[1] : (n_embd_head * n_head);
+            const int64_t n_embd_kgqa = hparams.n_embd_k_gqa(il);
+
+            Qcur = ggml_view_3d(ctx0, qkv_cur, n_embd_head, n_head,    n_tokens,
+                    n_embd_head * sizeof(float), qkv_cur->nb[1],
+                    0);
+            Kcur = ggml_view_3d(ctx0, qkv_cur, n_embd_head, n_head_kv, n_tokens,
+                    n_embd_head * sizeof(float), qkv_cur->nb[1],
+                    n_embd_q * sizeof(float));
+            Vcur = ggml_view_3d(ctx0, qkv_cur, n_embd_head, n_head_kv, n_tokens,
+                    n_embd_head * sizeof(float), qkv_cur->nb[1],
+                    (n_embd_q + n_embd_kgqa) * sizeof(float));
+
             cb(Qcur, "Qcur", il);
-        }
-        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-        cb(Kcur, "Kcur", il);
-        if (model.layers[il].bk) {
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
             cb(Kcur, "Kcur", il);
-        }
-        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-        cb(Vcur, "Vcur", il);
-        if (model.layers[il].bv) {
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
+
+            // apply biases if present
+            if (model.layers[il].bq) {
+                Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_q, n_tokens);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
+            }
+            if (model.layers[il].bk) {
+                Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_kgqa, n_tokens);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
+            }
+            if (model.layers[il].bv) {
+                Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_kgqa, n_tokens);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Vcur, "Vcur", il);
+            }
+        } else
+#endif // GGML_USE_HIP
+        {
+            // original separate Q/K/V path
+            Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
         }
 
-        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, rope_factors,
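
Note (not part of the patch): the loader concatenates Q, K, and V along the output-row dimension, and the graph code then relies on the fused mat-mul output being Q | K | V back to back within each token column, split at byte offsets 0, n_embd_q * sizeof(float), and (n_embd_q + n_embd_kgqa) * sizeof(float). Below is a minimal standalone C++ sketch of that invariant, using plain row-major float weights in place of the quantized rows handled by ggml_row_size() above; the helper name matvec and the toy dimensions are illustrative only.

// qkv_fusion_check.cpp — standalone sketch of the row-concat invariant.
// Checks that a mat-vec against row-concatenated Q/K/V weights equals the
// three separate mat-vecs, with Q/K/V segments at the expected row offsets.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// y[r] = sum_k w[r*n_in + k] * x[k]  (row-major: one output element per weight row)
static std::vector<float> matvec(const std::vector<float> & w, const std::vector<float> & x,
                                 int n_out, int n_in) {
    std::vector<float> y(n_out, 0.0f);
    for (int r = 0; r < n_out; ++r) {
        for (int k = 0; k < n_in; ++k) {
            y[r] += w[r*n_in + k] * x[k];
        }
    }
    return y;
}

int main() {
    // toy GQA-like shapes: K/V have fewer output rows than Q
    const int n_embd = 8, n_out_q = 6, n_out_k = 3, n_out_v = 3;

    auto fill = [](std::vector<float> & v, float seed) {
        for (size_t i = 0; i < v.size(); ++i) v[i] = std::sin(seed + (float)i);
    };

    std::vector<float> wq(n_out_q*n_embd), wk(n_out_k*n_embd), wv(n_out_v*n_embd), x(n_embd);
    fill(wq, 0.1f); fill(wk, 0.2f); fill(wv, 0.3f); fill(x, 0.4f);

    // concatenate along the output-row dimension, exactly like the loader:
    // Q rows first, then K rows, then V rows
    std::vector<float> wqkv;
    wqkv.insert(wqkv.end(), wq.begin(), wq.end());
    wqkv.insert(wqkv.end(), wk.begin(), wk.end());
    wqkv.insert(wqkv.end(), wv.begin(), wv.end());

    auto q   = matvec(wq,   x, n_out_q, n_embd);
    auto k   = matvec(wk,   x, n_out_k, n_embd);
    auto v   = matvec(wv,   x, n_out_v, n_embd);
    auto qkv = matvec(wqkv, x, n_out_q + n_out_k + n_out_v, n_embd);

    // fused output is Q | K | V back to back; these element offsets are what the
    // ggml_view_3d calls express as byte offsets 0, n_embd_q*sizeof(float),
    // and (n_embd_q + n_embd_kgqa)*sizeof(float)
    for (int i = 0; i < n_out_q; ++i) assert(std::fabs(qkv[i]                   - q[i]) < 1e-5f);
    for (int i = 0; i < n_out_k; ++i) assert(std::fabs(qkv[n_out_q + i]         - k[i]) < 1e-5f);
    for (int i = 0; i < n_out_v; ++i) assert(std::fabs(qkv[n_out_q + n_out_k+i] - v[i]) < 1e-5f);

    printf("fused QKV matches separate Q/K/V\n");
    return 0;
}

End to end, the fused and separate paths should produce identical logits, so one way to spot-check a build of this patch would be to compare perplexity on the same model with the fusion enabled and disabled (e.g. via llama-perplexity); note that the fused path uses ggml_mul_mat directly rather than build_lora_mm, so it bypasses any LoRA adapters on wq/wk/wv.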