Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/KernelMemory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ and answer questions about them in an interactive chat prompt.

// Ask a predefined question
Console.ForegroundColor = ConsoleColor.Green;
string question1 = "What formats does KM support";
string question1 = "What is Kernel Memory";
Console.WriteLine($"Question: {question1}");
await AnswerQuestion(memory, question1);

Expand Down
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/KernelMemorySaveAndLoad.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Press ENTER to proceed...
await IngestDocuments(memory);
}

await AskSingleQuestion(memory, "What formats does KM support?");
await AskSingleQuestion(memory, "What is Kernel Memory");
await StartUserChatSession(memory);
}

Expand Down
4 changes: 2 additions & 2 deletions LLama.Examples/LLama.Examples.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="9.0.3" />
<PackageReference Include="Microsoft.KernelMemory.Core" Version="0.97.250211.1" />
<PackageReference Include="Microsoft.KernelMemory.Core" Version="0.98.250323.1" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.44.0" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.6.2-alpha" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.44.0-alpha" />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="SixLabors.ImageSharp" Version="3.1.7" />
<PackageReference Include="Spectre.Console" Version="0.49.1" />
Expand Down
63 changes: 43 additions & 20 deletions LLama/LLamaEmbedder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
using LLama.Abstractions;
using LLama.Exceptions;
using LLama.Native;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.Logging;
using static System.Net.Mime.MediaTypeNames;

namespace LLama;

Expand Down Expand Up @@ -65,37 +67,51 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
{
// Add all of the tokens to the batch
var tokens = Context.Tokenize(input, special: true);
var batch = new LLamaBatch();
for (var i = 0; i < tokens.Length; i++)
batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
if (tokens.Length > Context.ContextSize)
throw new ArgumentException($"Embedding prompt is longer than the context window ({tokens.Length} > {Context.ContextSize})", nameof(input));

// clear previous kv_cache values
Context.NativeHandle.KvCacheClear();

// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();

// Run model
switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
// Evaluate prompt in batch-size chunks
var n_past = 0;
var batch = new LLamaBatch();
var batchSize = (int)Context.Params.BatchSize;
for (var i = 0; i < tokens.Length; i += batchSize)
{
case (true, false):
{
var result = await Context.EncodeAsync(batch, cancellationToken);
if (result != EncodeResult.Ok)
throw new RuntimeError($"Failed to encode: {result}");
break;
}
var n_eval = tokens.Length - i;
if (n_eval > batchSize)
n_eval = batchSize;

batch.Clear();
batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, true);
n_past += n_eval;

case (false, true):
// Run model
switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
{
var result = await Context.DecodeAsync(batch, cancellationToken);
if (result != DecodeResult.Ok)
throw new RuntimeError($"Failed to decode: {result}");
break;
case (true, false):
{
var result = await Context.EncodeAsync(batch, cancellationToken);
if (result != EncodeResult.Ok)
throw new RuntimeError($"Failed to encode: {result}");
break;
}

case (false, true):
{
var result = await Context.DecodeAsync(batch, cancellationToken);
if (result != DecodeResult.Ok)
throw new RuntimeError($"Failed to decode: {result}");
break;
}

default:
throw new NotSupportedException("Unsupported model type");
}

default:
throw new NotSupportedException("Unsupported model type");
}

// Extract results
Expand All @@ -114,6 +130,13 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
results.Add(Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero).ToArray());
}

// Normalize the embeddings vector
// https://github.com/ggerganov/llama.cpp/blob/2891c8aa9af17f4ff636ff3868bc34ff72b56e25/examples/embedding/embedding.cpp#L92
foreach (var embedding in results)
{
embedding.EuclideanNormalization();
}

Context.NativeHandle.KvCacheClear();

return (results, tokens.Length);
Expand Down
8 changes: 8 additions & 0 deletions LLama/Native/NativeApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,14 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx);

[Obsolete("Use `llama_kv_self_clear` instead")]

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now exposed as KvCacheClear on SafeLLamaContextHandle, so it shouldn't be re-introduced here.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh it's also an obsolete function in llama.cpp anyway!

/// <summary>
/// Clear the KV cache. Both cell info is erased and KV data is zeroed
/// </summary>
/// <param name="ctx"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
internal static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx);

/// <summary>
/// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
/// </summary>
Expand Down
3 changes: 2 additions & 1 deletion LLama/Native/SafeLLamaContextHandle.cs
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,8 @@ public int KvCacheCountTokens()
/// </summary>
public void KvCacheClear()
{
NativeApi.llama_kv_self_clear(this);
//NativeApi.llama_kv_self_clear(this);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why this change was made. llama_kv_cache_clear was recently renamed to llama_kv_self_clear in llama.cpp, so the new name should always be used now.

@zsogitbe zsogitbe Apr 23, 2025

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, for some reason there are/were problems with the llama.cpp submodule. I had a slightly older version (after initially downloading one), and that version of llama.cpp does not yet have llama_kv_self_clear. This is why I reintroduced the old clear function. I think it would be better to keep it in the code for this reason (it is marked as obsolete), but it is up to you to decide.
P.S.: both functions do the same thing :)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was probably me forgetting to update the submodule again on the last update. If you keep an eye on the update PRs when they're available, feel free to give me a poke if you spot that in the future.

For llama_kv_cache_clear vs llama_kv_self_clear, please put it back to the newer version. If it was a public API it'd make sense to keep it around for a while marked as Obsolete to give people a chance to migrate, but this is an internal function.

NativeApi.llama_kv_cache_clear(this);
}

/// <summary>
Expand Down