Add DocumentTokenChunker #7093
@@ -0,0 +1,125 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion.Chunkers
{
    /// <summary>
    /// Processes a document by tokenizing its content and dividing it into overlapping chunks of tokens.
    /// </summary>
    /// <remarks>
    /// <para>This class uses a tokenizer to convert the document's content into tokens and then splits the
    /// tokens into chunks of a specified size, with a configurable overlap between consecutive chunks.</para>
    /// <para>Note that tables may be split mid-row.</para>
    /// </remarks>
    public sealed class DocumentTokenChunker : IngestionChunker<string>
    {
        private readonly Tokenizer _tokenizer;
        private readonly int _maxTokensPerChunk;
        private readonly int _chunkOverlap;

        /// <summary>
        /// Initializes a new instance of the <see cref="DocumentTokenChunker"/> class with the specified options.
        /// </summary>
        /// <param name="options">The options used to configure the chunker, including tokenizer and chunk sizes.</param>
        public DocumentTokenChunker(IngestionChunkerOptions options)
        {
            _ = Throw.IfNull(options);

            _tokenizer = options.Tokenizer;
            _maxTokensPerChunk = options.MaxTokensPerChunk;
            _chunkOverlap = options.OverlapTokens;
        }

        /// <inheritdoc/>
        public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document, [EnumeratorCancellation] CancellationToken cancellationToken = default)
        {
            _ = Throw.IfNull(document);

            int stringBuilderTokenCount = 0;
            StringBuilder stringBuilder = new();
            foreach (IngestionDocumentElement element in document.EnumerateContent())
            {
                cancellationToken.ThrowIfCancellationRequested();
                string? elementContent = element.GetSemanticContent();
                if (string.IsNullOrEmpty(elementContent))
                {
                    continue;
                }

                int contentToProcessTokenCount = _tokenizer.CountTokens(elementContent!, considerNormalization: false);
                ReadOnlyMemory<char> contentToProcess = elementContent.AsMemory();
                while (stringBuilderTokenCount + contentToProcessTokenCount >= _maxTokensPerChunk)
                {
                    int index = _tokenizer.GetIndexByTokenCount(
                        text: contentToProcess.Span,
                        maxTokenCount: _maxTokensPerChunk - stringBuilderTokenCount,
                        out string? _,
                        out int _,
                        considerNormalization: false);

                    unsafe
                    {
                        fixed (char* ptr = &MemoryMarshal.GetReference(contentToProcess.Span))
                        {
                            _ = stringBuilder.Append(ptr, index);
                        }
                    }
```
Comment on lines +69 to +75:

Member: I suspect you are using `unsafe` here to avoid string allocations. Please follow the pattern of passing the span to the builder on modern .NET and allocating otherwise: extensions/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs, lines 229 to 233 in e801349.

Member: I recommended it. This could be a ton of string allocation, entirely unnecessarily. The unsafe use is very small and scoped and easily audited. I don't see a problem with it.

Contributor (Author): I'm with @stephentoub on this one. I agree that the unsafe use is contained to a very small portion of the code.

Member: We had a debate about the same thing in some other PR and I removed the unsafe. We could at least move the unsafe behind `#if NET`:

```csharp
#if NET
    stringBuilder.Append(chars);
#else
    // unsafe goes here
#endif
```

Or introduce an extension method that takes care of that. But I don't want to block @KrystofS; we can deal with it later. cc @EgorBo, who is leading the effort of unsafe removal.
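As an illustration of that extension-method idea, here is a minimal sketch; the helper name `AppendSpan` and its shape are hypothetical, not part of this PR:

```csharp
// Hypothetical helper sketching the suggestion above; not part of this PR.
using System;
using System.Runtime.InteropServices;
using System.Text;

internal static class StringBuilderExtensions
{
    public static StringBuilder AppendSpan(this StringBuilder builder, ReadOnlySpan<char> chars)
    {
#if NET
        // Modern .NET exposes Append(ReadOnlySpan<char>) directly.
        return builder.Append(chars);
#else
        // Older targets: pin the span and use the char* overload,
        // mirroring the unsafe block in this PR.
        unsafe
        {
            fixed (char* ptr = &MemoryMarshal.GetReference(chars))
            {
                return builder.Append(ptr, chars.Length);
            }
        }
#endif
    }
}
```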
```csharp
                    yield return FinalizeChunk();

                    contentToProcess = contentToProcess.Slice(index);
                    contentToProcessTokenCount = _tokenizer.CountTokens(contentToProcess.Span, considerNormalization: false);
                }

                _ = stringBuilder.Append(contentToProcess);
                stringBuilderTokenCount += contentToProcessTokenCount;
            }

            if (stringBuilder.Length > 0)
            {
                yield return FinalizeChunk();
            }

            yield break;

            IngestionChunk<string> FinalizeChunk()
            {
                IngestionChunk<string> chunk = new IngestionChunk<string>(
                    content: stringBuilder.ToString(),
                    document: document,
                    context: string.Empty);
                _ = stringBuilder.Clear();
                stringBuilderTokenCount = 0;

                if (_chunkOverlap > 0)
                {
                    int index = _tokenizer.GetIndexByTokenCountFromEnd(
                        text: chunk.Content,
                        maxTokenCount: _chunkOverlap,
                        out string? _,
                        out stringBuilderTokenCount,
                        considerNormalization: false);

                    ReadOnlySpan<char> overlapContent = chunk.Content.AsSpan().Slice(index);
                    unsafe
                    {
                        fixed (char* ptr = &MemoryMarshal.GetReference(overlapContent))
                        {
                            _ = stringBuilder.Append(ptr, overlapContent.Length);
                        }
                    }
                }

                return chunk;
            }
        }
    }
}
```
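For context, a minimal usage sketch of the new chunker; the `IngestionChunkerOptions` constructor shape is assumed from how the tests below construct it:

```csharp
// A minimal usage sketch, assuming the options shape used by the tests below.
using System;
using Microsoft.Extensions.DataIngestion;
using Microsoft.Extensions.DataIngestion.Chunkers;
using Microsoft.ML.Tokenizers;

var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
var chunker = new DocumentTokenChunker(
    new(tokenizer) { MaxTokensPerChunk = 512, OverlapTokens = 64 });

IngestionDocument doc = new("sample");
doc.Sections.Add(new IngestionDocumentSection
{
    Elements = { new IngestionDocumentParagraph("Some long text to ingest...") }
});

// Chunks are produced lazily as the document's elements are tokenized.
await foreach (IngestionChunk<string> chunk in chunker.ProcessAsync(doc))
{
    Console.WriteLine(chunk.Content);
}
```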
@@ -0,0 +1,24 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Extension methods for <see cref="IngestionDocumentElement"/>.
/// </summary>
internal static class IngestionDocumentElementExtensions
{
    /// <summary>
    /// Gets the semantic content of the element if available.
    /// </summary>
    /// <param name="element">The element to get semantic content from.</param>
    /// <returns>The semantic content suitable for embedding generation.</returns>
    internal static string? GetSemanticContent(this IngestionDocumentElement element)
    {
        return element switch
        {
            IngestionDocumentImage image => image.AlternativeText ?? image.Text,
            _ => element.GetMarkdown()
        };
    }
}
```
@@ -0,0 +1,33 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public abstract class DocumentTokenChunkerTests : DocumentChunkerTests
    {
        [Fact]
        public async Task SingleChunkText()
        {
            string text = "This is a short document that fits within a single chunk.";
            IngestionDocument doc = new IngestionDocument("singleChunkDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });

            IngestionChunker<string> chunker = CreateDocumentChunker();
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();

            IngestionChunk<string> chunk = Assert.Single(chunks);
            Assert.Equal(text, chunk.Content, ignoreLineEndingDifferences: true);
        }
    }
}
```
@@ -0,0 +1,64 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public class NoOverlapTokenChunkerTests : DocumentTokenChunkerTests
    {
        protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
        {
            var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 });
        }

        [Fact]
        public async Task TwoChunks()
        {
            string text = string.Join(" ", Enumerable.Repeat("word", 600)); // each word is 1 token
            IngestionDocument doc = new IngestionDocument("twoChunksNoOverlapDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });
            IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: 512);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(2, chunks.Count);
            Assert.True(chunks[0].Content.Split(' ').Length <= 512);
            Assert.True(chunks[1].Content.Split(' ').Length <= 512);
            Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)));
        }

        [Fact]
        public async Task ManyChunks()
        {
            string text = string.Join(" ", Enumerable.Repeat("word", 1500)); // each word is 1 token
            IngestionDocument doc = new IngestionDocument("smallChunksNoOverlapDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });

            IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: 200, overlapTokens: 0);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(8, chunks.Count);
            foreach (var chunk in chunks)
            {
                Assert.True(chunk.Content.Split(' ').Count(str => str.Contains("word")) <= 200);
            }

            Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)));
        }
    }
}
```
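The expected counts follow from simple token arithmetic, assuming (as the comments note) that the gpt-4o tokenizer encodes each repeated " word" as one token: 600 tokens at a 512-token maximum split into 512 + 88, i.e. two chunks, and 1500 tokens at 200 per chunk yield ceil(1500/200) = 8 chunks. With OverlapTokens = 0 the chunks partition the input exactly, which is why both tests can assert that concatenating the chunks reproduces the original text.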
@@ -0,0 +1,62 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public class OverlapTokenChunkerTests : DocumentTokenChunkerTests
    {
        protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
        {
            var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = overlapTokens });
        }

        [Fact]
        public async Task TokenChunking_WithOverlap()
        {
            string text = "The quick brown fox jumps over the lazy dog";
            var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            int chunkSize = 4; // Small chunk size to demonstrate overlap
            int chunkOverlap = 1;

            var chunker = new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = chunkSize, OverlapTokens = chunkOverlap });
            IngestionDocument doc = new IngestionDocument("overlapExample");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });

            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(3, chunks.Count);
            Assert.Equal("The quick brown fox", chunks[0].Content, ignoreLineEndingDifferences: true);
            Assert.Equal(" fox jumps over the", chunks[1].Content, ignoreLineEndingDifferences: true);
            Assert.Equal(" the lazy dog", chunks[2].Content, ignoreLineEndingDifferences: true);

            Assert.True(tokenizer.CountTokens(chunks.Last().Content) <= chunkSize);

            for (int i = 0; i < chunks.Count - 1; i++)
            {
                var currentChunk = chunks[i];
                var nextChunk = chunks[i + 1];

                var currentWords = currentChunk.Content.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                var nextWords = nextChunk.Content.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                bool hasOverlap = currentWords.Intersect(nextWords).Any();
                Assert.True(hasOverlap, $"Chunks {i} and {i + 1} should have overlapping content");
            }

            Assert.NotEmpty(string.Concat(chunks.Select(c => c.Content)));
        }
    }
}
```
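The expected chunks follow from the token arithmetic: the nine-word sentence encodes to nine tokens, the first chunk takes four ("The quick brown fox"), and each subsequent chunk begins with the one-token overlap carried over from the previous chunk (" fox", then " the"), producing " fox jumps over the" and finally " the lazy dog".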
Member: This doesn't appear to make any attempt to move the start/end of the chunk to a "good" location; e.g., it could be in the middle of a word?
Author: Correct. I don't think it is an issue, because any RAG system should be resilient enough not to be affected by this, assuming a reasonable overlap size. Similarly, this could take just a part of some table cell, etc.
Member: Maybe. But I see other chunking systems going to great lengths to try to find good boundaries for the chunks.
Member: @stephentoub, what other chunking systems are you referring to? LangChain's `TokenTextSplitter` could split any word in a similar fashion, and so could `TokenChunker` from Chonkie. I'd say this is true for token-count-based chunkers in general, not just this one.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest adding only a warning to documentation and keeping the current behavior.
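To illustrate the concern, a small sketch of how a pure token-count boundary can land inside a word; the printed prefix is hypothetical, since the exact split depends on the tokenizer's vocabulary:

```csharp
// Illustrative only: a multi-token word can be cut by a token-count boundary.
using System;
using Microsoft.ML.Tokenizers;

var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
string text = "Antidisestablishmentarianism is a long word.";

// Find the index covering at most 2 tokens; for a word that encodes to
// several tokens, that index can fall in the middle of the word.
int index = tokenizer.GetIndexByTokenCount(
    text: text.AsSpan(),
    maxTokenCount: 2,
    out string? _,
    out int _,
    considerNormalization: false);

Console.WriteLine(text.Substring(0, index)); // e.g. "Antidis", a partial word
```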
Author: @stephentoub, can we move on with this one? In my testing, this method actually performed the best in RAG tasks with the default settings on my test dataset.
Member:
I'll leave it up to @adamsitnik.
Member: Assuming that it's documented, works fine in some cases, and our competitors provide a similar feature, I am fine with merging it.

Just out of curiosity, have you tried the `HeaderChunker` I've implemented?