diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1d0d7197e1f..e292300f0aa 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -21,7 +21,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } if (ubatch->embd) { - const int64_t n_embd = embd->ne[0]; + GGML_ASSERT(n_embd == embd->ne[0]); + const int64_t n_tokens = ubatch->n_tokens; ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd)); @@ -1206,17 +1207,21 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { const int64_t n_embd = hparams.n_embd_inp(); - auto inp = std::make_unique<llm_graph_input_embd>(); + auto inp = std::make_unique<llm_graph_input_embd>(n_embd); - ggml_tensor * cur = nullptr; + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + res->t_tokens = inp->tokens; - if (ubatch.token) { - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); - ggml_set_input(inp->tokens); - res->t_tokens = inp->tokens; + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + ggml_set_input(inp->embd); + + ggml_tensor * cur = nullptr; + { cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cur = ggml_scale(ctx0, cur, ubatch.token ? 1.0f : 0.0f); // apply lora for embedding tokens if needed for (const auto & lora : *loras) { @@ -1235,13 +1240,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { cur = ggml_add(ctx0, cur, inpL_delta); } - } else { - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - ggml_set_input(inp->embd); - - cur = inp->embd; } + cur = ggml_add(ctx0, cur, ggml_scale(ctx0, inp->embd, ubatch.embd ?
1.0f : 0.0f)); + // For Granite architecture if (hparams.f_embedding_scale != 0.0f) { cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); diff --git a/src/llama-graph.h b/src/llama-graph.h index 81ac329cc31..07c81e79f91 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -104,7 +104,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>; class llm_graph_input_embd : public llm_graph_input_i { public: - llm_graph_input_embd() = default; + llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} virtual ~llm_graph_input_embd() = default; void set_input(const llama_ubatch * ubatch) override; @@ -113,6 +113,8 @@ class llm_graph_input_embd : public llm_graph_input_i { ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] + + const int64_t n_embd = 0; }; class llm_graph_input_pos : public llm_graph_input_i { diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index edf0d1424ce..0ceae3aaeb5 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -3,12 +3,14 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; - float kq_scale = 1.0f / sqrtf(float(n_embd_head)); + const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - ggml_tensor *inpL, *cur; + ggml_tensor * inpL; + ggml_tensor * cur; + inpL = build_inp_embd(model.tok_embd); ggml_tensor * inp_pos = build_inp_pos(); @@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa } ggml_tensor * inpSA = inpL; - cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); // build self attention { diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp index 90a98f7abf0..944c198bf95
100644 --- a/src/models/gemma-embedding.cpp +++ b/src/models/gemma-embedding.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k; @@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) - if (ubatch.token) { - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - } + inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); + cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index ae60ef4790c..dec3fc4b8bc 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) - if (ubatch.token) { - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - } + inpL = ggml_scale(ctx0, inpL, ubatch.token ?
sqrtf(n_embd) : 1.0f); + cb(inpL, "inp_scaled", -1); + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index a0bdd6a15a1..15054cf1cec 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model), @@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) - if (ubatch.token) { - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - } + inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); + cb(inpL, "inp_scaled", -1); + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) { // equivalent to get_per_layer_inputs() in python code // output shape: [n_embd_altup, n_layer, n_tokens] ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { - auto inp = std::make_unique<llm_graph_input_embd>(); + auto inp = std::make_unique<llm_graph_input_embd>(n_embd); ggml_tensor * inp_per_layer; if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);