SciSharp · martindevans · May 3, 2025 · Apr 23, 2025 · Apr 24, 2025 · Apr 24, 2025
diff --git a/LLama.Examples/Examples/KernelMemory.cs b/LLama.Examples/Examples/KernelMemory.cs
@@ -46,7 +46,7 @@ and answer questions about them in an interactive chat prompt.
 
             // Ask a predefined question
             Console.ForegroundColor = ConsoleColor.Green;
-            string question1 = "What formats does KM support";
+            string question1 = "What is Kernel Memory";
             Console.WriteLine($"Question: {question1}");
             await AnswerQuestion(memory, question1);
 

diff --git a/LLama.Examples/Examples/KernelMemorySaveAndLoad.cs b/LLama.Examples/Examples/KernelMemorySaveAndLoad.cs
@@ -54,7 +54,7 @@ Press ENTER to proceed...
             await IngestDocuments(memory);
         }
 
-        await AskSingleQuestion(memory, "What formats does KM support?");
+        await AskSingleQuestion(memory, "What is Kernel Memory");
         await StartUserChatSession(memory);
     }
 

diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj
@@ -15,9 +15,9 @@
 
   <ItemGroup>
     <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="9.0.3" />
-    <PackageReference Include="Microsoft.KernelMemory.Core" Version="0.97.250211.1" />
+    <PackageReference Include="Microsoft.KernelMemory.Core" Version="0.98.250323.1" />
     <PackageReference Include="Microsoft.SemanticKernel" Version="1.44.0" />
-    <PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.6.2-alpha" />
+    <PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.44.0-alpha" />
     <PackageReference Include="NAudio" Version="2.2.1" />
     <PackageReference Include="SixLabors.ImageSharp" Version="3.1.7" />
     <PackageReference Include="Spectre.Console" Version="0.49.1" />

diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs
@@ -5,7 +5,9 @@
 using LLama.Abstractions;
 using LLama.Exceptions;
 using LLama.Native;
+using Microsoft.Extensions.AI;
 using Microsoft.Extensions.Logging;
+using static System.Net.Mime.MediaTypeNames;
 
 namespace LLama;
 
@@ -65,37 +67,51 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
     {
         // Add all of the tokens to the batch
         var tokens = Context.Tokenize(input, special: true);
-        var batch = new LLamaBatch();
-        for (var i = 0; i < tokens.Length; i++)
-            batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
+        if (tokens.Length > Context.ContextSize)
+            throw new ArgumentException($"Embedding prompt is longer than the context window ({tokens.Length} > {Context.ContextSize})", nameof(input));
 
         // clear previous kv_cache values
         Context.NativeHandle.KvCacheClear();
 
         // Check if we should cancel the work, just before doing anything expensive (encode/decode)
         cancellationToken.ThrowIfCancellationRequested();
 
-        // Run model
-        switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
+        // Evaluate prompt in batch-size chunks
+        var n_past = 0;
+        var batch = new LLamaBatch();
+        var batchSize = (int)Context.Params.BatchSize;
+        for (var i = 0; i < tokens.Length; i += batchSize)
         {
-            case (true, false):
-            {
-                var result = await Context.EncodeAsync(batch, cancellationToken);
-                if (result != EncodeResult.Ok)
-                    throw new RuntimeError($"Failed to encode: {result}");
-                break;
-            }
+            var n_eval = tokens.Length - i;
+            if (n_eval > batchSize)
+                n_eval = batchSize;
+
+            batch.Clear();
+            batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, true);
+            n_past += n_eval;
 
-            case (false, true):
+            // Run model
+            switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
             {
-                var result = await Context.DecodeAsync(batch, cancellationToken);
-                if (result != DecodeResult.Ok)
-                    throw new RuntimeError($"Failed to decode: {result}");
-                break;
+                case (true, false):
+                    {
+                        var result = await Context.EncodeAsync(batch, cancellationToken);
+                        if (result != EncodeResult.Ok)
+                            throw new RuntimeError($"Failed to encode: {result}");
+                        break;
+                    }
+
+                case (false, true):
+                    {
+                        var result = await Context.DecodeAsync(batch, cancellationToken);
+                        if (result != DecodeResult.Ok)
+                            throw new RuntimeError($"Failed to decode: {result}");
+                        break;
+                    }
+
+                default:
+                    throw new NotSupportedException("Unsupported model type");
             }
-
-            default:
-                throw new NotSupportedException("Unsupported model type");
         }
 
         // Extract results
@@ -114,6 +130,13 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
             results.Add(Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero).ToArray());
         }
 
+        // Normalize the embeddings vector
+        // https://github.com/ggerganov/llama.cpp/blob/2891c8aa9af17f4ff636ff3868bc34ff72b56e25/examples/embedding/embedding.cpp#L92
+        foreach (var embedding in results)
+        {
+            embedding.EuclideanNormalization();
+        }
+
         Context.NativeHandle.KvCacheClear();
 
         return (results, tokens.Length);

diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
@@ -290,6 +290,14 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx);
 
+        /// <summary>
+        /// Clear the KV cache. The cell info is erased and the KV data is zeroed.
+        /// </summary>
+        /// <param name="ctx">Handle of the context whose KV cache is cleared</param>
+        [Obsolete("Use `llama_kv_self_clear` instead")]
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        internal static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx);
+
         /// <summary>
         /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
         /// </summary>

diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
@@ -809,7 +809,8 @@ public int KvCacheCountTokens()
         /// </summary>
         public void KvCacheClear()
         {
-            NativeApi.llama_kv_self_clear(this);
+            // NOTE(review): llama_kv_cache_clear is marked [Obsolete] in NativeApi ("Use `llama_kv_self_clear` instead"); confirm why this call was switched away from llama_kv_self_clear.
+            NativeApi.llama_kv_cache_clear(this);
         }
 
         /// <summary>