llama : fix embeddings

ggerganov · ggerganov · commit d0347840c1d7 · 2024-03-04T11:43:16.000+02:00
ggml-ci
diff --git a/common/common.cpp b/common/common.cpp
@@ -1299,7 +1299,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
-    cparams.embedding         = params.embedding;
+    cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
     cparams.rope_freq_scale   = params.rope_freq_scale;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
@@ -19,7 +19,7 @@ static std::vector<std::string> split_lines(const std::string & s) {
 
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
     for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
     }
 }
 
@@ -45,9 +45,13 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     }
 
     // normalize on copy
-    for (int k = 0; k < n_seq; k++) {
-        float * emb = llama_get_embeddings_ith(ctx, k);
-        float * out = output + k * n_embd;
+    for (int i = 0; i < batch.n_tokens; i++) {
+        if (!batch.logits[i]) {
+            continue;
+        }
+
+        float * emb = llama_get_embeddings_ith(ctx, i);
+        float * out = output + batch.seq_id[i][0] * n_embd;
         normalize(emb, out, n_embd);
     }
 }
@@ -145,6 +149,7 @@ int main(int argc, char ** argv) {
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
         auto & inp = inputs[k];
+
         const uint64_t n_toks = inp.size();
 
         // encode if at capacity
diff --git a/examples/server-embd.py b/examples/server-embd.py
@@ -0,0 +1,34 @@
+import asyncio
+import requests
+import numpy as np
+
+n = 8
+
+result = []
+
+async def requests_post_async(*args, **kwargs):
+    return await asyncio.to_thread(requests.post, *args, **kwargs)
+
+async def main():
+    model_url = "http://127.0.0.1:6900"
+    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
+        url= f"{model_url}/embedding",
+        json= {"content": str(i)*32}
+    ) for i in range(n)])
+
+    for response in responses:
+        embedding = response.json()["embedding"]
+        print(embedding[-8:])
+        result.append(embedding)
+
+asyncio.run(main())
+
+# compute cosine similarity
+
+for i in range(n-1):
+    for j in range(i+1, n):
+        embedding1 = np.array(result[i])
+        embedding2 = np.array(result[j])
+        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+        print(f"Similarity between {i} and {j}: {similarity:.2f}")
+
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1210,7 +1210,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    void send_embedding(server_slot &slot)
+    void send_embedding(server_slot & slot, const llama_batch & batch)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1219,6 +1219,7 @@ struct llama_server_context
         res.stop = true;
 
         const int n_embd = llama_n_embd(model);
+
         if (!params.embedding)
         {
             LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
@@ -1229,12 +1230,19 @@ struct llama_server_context
         }
         else
         {
-            const float *data = llama_get_embeddings(ctx);
-            std::vector<float> embedding(data, data + n_embd);
-            res.result_json = json
-            {
-                {"embedding", embedding},
-            };
+            for (int i = 0; i < batch.n_tokens; ++i) {
+                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+                    continue;
+                }
+
+                const float * data = llama_get_embeddings_ith(ctx, i);
+                std::vector<float> embedding(data, data + n_embd);
+
+                res.result_json = json
+                {
+                    {"embedding", embedding },
+                };
+            }
         }
         queue_results.send(res);
     }
@@ -1845,7 +1853,7 @@ struct llama_server_context
                                 ga_i += ga_w/ga_n;
                             }
                         }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
                         slot_npast++;
                     }
 
@@ -1881,7 +1889,7 @@ struct llama_server_context
 
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
         {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
             for (auto & slot : slots)
             {
@@ -1954,7 +1962,7 @@ struct llama_server_context
                 // prompt evaluated for embedding
                 if (slot.embedding)
                 {
-                    send_embedding(slot);
+                    send_embedding(slot, batch_view);
                     slot.release();
                     slot.i_batch = -1;
                     continue;
@@ -2330,7 +2338,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
         }
         else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
         {
diff --git a/llama.cpp b/llama.cpp
diff --git a/llama.h b/llama.h

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ static std::vector<std::string> split_lines(const std::string & s) {`
`19`	`19`
`20`	`20`	`static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {`
`21`	`21`	`for (size_t i = 0; i < tokens.size(); i++) {`
`22`		`- llama_batch_add(batch, tokens[i], i, { seq_id }, false);`
	`22`	`+ llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);`
`23`	`23`	`}`
`24`	`24`	`}`
`25`	`25`
`@@ -45,9 +45,13 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu`
`45`	`45`	`}`
`46`	`46`
`47`	`47`	`// normalize on copy`
`48`		`- for (int k = 0; k < n_seq; k++) {`
`49`		`- float * emb = llama_get_embeddings_ith(ctx, k);`
`50`		`- float * out = output + k * n_embd;`
	`48`	`+ for (int i = 0; i < batch.n_tokens; i++) {`
	`49`	`+ if (!batch.logits[i]) {`
	`50`	`+ continue;`
	`51`	`+ }`
	`52`	`+`
	`53`	`+ float * emb = llama_get_embeddings_ith(ctx, i);`
	`54`	`+ float * out = output + batch.seq_id[i][0] * n_embd;`
`51`	`55`	`normalize(emb, out, n_embd);`
`52`	`56`	`}`
`53`	`57`	`}`
`@@ -145,6 +149,7 @@ int main(int argc, char ** argv) {`
`145`	`149`	`for (int k = 0; k < n_prompts; k++) {`
`146`	`150`	`// clamp to n_batch tokens`
`147`	`151`	`auto & inp = inputs[k];`
	`152`	`+`
`148`	`153`	`const uint64_t n_toks = inp.size();`
`149`	`154`
`150`	`155`	`// encode if at capacity`
Original file line number	Diff line number	Diff line change
`@@ -1210,7 +1210,7 @@ struct llama_server_context`
`1210`	`1210`	`queue_results.send(res);`
`1211`	`1211`	`}`
`1212`	`1212`
`1213`		`- void send_embedding(server_slot &slot)`
	`1213`	`+ void send_embedding(server_slot & slot, const llama_batch & batch)`
`1214`	`1214`	`{`
`1215`	`1215`	`task_result res;`
`1216`	`1216`	`res.id = slot.task_id;`
`@@ -1219,6 +1219,7 @@ struct llama_server_context`
`1219`	`1219`	`res.stop = true;`
`1220`	`1220`
`1221`	`1221`	`const int n_embd = llama_n_embd(model);`
	`1222`	`+`
`1222`	`1223`	`if (!params.embedding)`
`1223`	`1224`	`{`
`1224`	`1225`	`LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});`
`@@ -1229,12 +1230,19 @@ struct llama_server_context`
`1229`	`1230`	`}`
`1230`	`1231`	`else`
`1231`	`1232`	`{`
`1232`		`- const float *data = llama_get_embeddings(ctx);`
`1233`		`- std::vector<float> embedding(data, data + n_embd);`
`1234`		`- res.result_json = json`
`1235`		`- {`
`1236`		`- {"embedding", embedding},`
`1237`		`- };`
	`1233`	`+ for (int i = 0; i < batch.n_tokens; ++i) {`
	`1234`	`+ if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {`
	`1235`	`+ continue;`
	`1236`	`+ }`
	`1237`	`+`
	`1238`	`+ const float * data = llama_get_embeddings_ith(ctx, i);`
	`1239`	`+ std::vector<float> embedding(data, data + n_embd);`
	`1240`	`+`
	`1241`	`+ res.result_json = json`
	`1242`	`+ {`
	`1243`	`+ {"embedding", embedding },`
	`1244`	`+ };`
	`1245`	`+ }`
`1238`	`1246`	`}`
`1239`	`1247`	`queue_results.send(res);`
`1240`	`1248`	`}`
`@@ -1845,7 +1853,7 @@ struct llama_server_context`
`1845`	`1853`	`ga_i += ga_w/ga_n;`
`1846`	`1854`	`}`
`1847`	`1855`	`}`
`1848`		`- llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);`
	`1856`	`+ llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);`
`1849`	`1857`	`slot_npast++;`
`1850`	`1858`	`}`
`1851`	`1859`
`@@ -1881,7 +1889,7 @@ struct llama_server_context`
`1881`	`1889`
`1882`	`1890`	`for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)`
`1883`	`1891`	`{`
`1884`		`- const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));`
	`1892`	`+ const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);`
`1885`	`1893`
`1886`	`1894`	`for (auto & slot : slots)`
`1887`	`1895`	`{`
`@@ -1954,7 +1962,7 @@ struct llama_server_context`
`1954`	`1962`	`// prompt evaluated for embedding`
`1955`	`1963`	`if (slot.embedding)`
`1956`	`1964`	`{`
`1957`		`- send_embedding(slot);`
	`1965`	`+ send_embedding(slot, batch_view);`
`1958`	`1966`	`slot.release();`
`1959`	`1967`	`slot.i_batch = -1;`
`1960`	`1968`	`continue;`
`@@ -2330,7 +2338,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,`
`2330`	`2338`	`break;`
`2331`	`2339`	`}`
`2332`	`2340`	`params.n_batch = std::stoi(argv[i]);`
`2333`		`- params.n_batch = std::min(512, params.n_batch);`
`2334`	`2341`	`}`
`2335`	`2342`	`else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")`
`2336`	`2343`	`{`