@@ -0,0 +1,125 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion.Chunkers
{
/// <summary>
/// Processes a document by tokenizing its content and dividing it into overlapping chunks of tokens.
/// </summary>
/// <remarks>
/// <para>This class uses a tokenizer to convert the document's content into tokens and then splits the
/// tokens into chunks of a specified size, with a configurable overlap between consecutive chunks.</para>
/// <para>Note that tables may be split mid-row.</para>
/// </remarks>
public sealed class DocumentTokenChunker : IngestionChunker<string>
{
private readonly Tokenizer _tokenizer;
private readonly int _maxTokensPerChunk;
private readonly int _chunkOverlap;

/// <summary>
/// Initializes a new instance of the <see cref="DocumentTokenChunker"/> class with the specified options.
/// </summary>
/// <param name="options">The options used to configure the chunker, including tokenizer and chunk sizes.</param>
public DocumentTokenChunker(IngestionChunkerOptions options)
{
_ = Throw.IfNull(options);

_tokenizer = options.Tokenizer;
_maxTokensPerChunk = options.MaxTokensPerChunk;
_chunkOverlap = options.OverlapTokens;
}

/// <inheritdoc/>
public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(document);

int stringBuilderTokenCount = 0;
StringBuilder stringBuilder = new();
foreach (IngestionDocumentElement element in document.EnumerateContent())
{
cancellationToken.ThrowIfCancellationRequested();
string? elementContent = element.GetSemanticContent();
if (string.IsNullOrEmpty(elementContent))
{
continue;
}

int contentToProcessTokenCount = _tokenizer.CountTokens(elementContent!, considerNormalization: false);
ReadOnlyMemory<char> contentToProcess = elementContent.AsMemory();
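// Emit full chunks for as long as the buffered tokens plus this element's remaining tokens reach the chunk limit.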
while (stringBuilderTokenCount + contentToProcessTokenCount >= _maxTokensPerChunk)
{
int index = _tokenizer.GetIndexByTokenCount(
Member:
This doesn't appear to be making any attempt to move the start/end of the chunk to a "good" location, e.g. this could be in the middle of a word?

Contributor Author (@KrystofS), Dec 10, 2025:
Correct. I don't think it is an issue, because any RAG system should be resilient enough not to be affected by this, assuming a reasonable overlap size. Similarly, this could take just a part of some table cell, etc.

Member:
Maybe. But I see other chunking systems going to great lengths to try to find good boundaries for the chunks.

Contributor Author (@KrystofS):
@stephentoub, what other chunking systems are you referring to? LangChain's TokenTextSplitter could split any word in a similar fashion, and so could TokenChunker from chonkie. I'd say that's true in general, but not for this type of token-count-based chunker.
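A minimal standalone sketch of the behavior under discussion, using the Microsoft.ML.Tokenizers API this PR already targets (the snippet is illustrative, not part of the change):

```csharp
// Hypothetical demo (not part of this PR): a pure token-count boundary can land inside a word.
using System;
using Microsoft.ML.Tokenizers;

Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
string text = "internationalization";

// Index of the cut after at most one token, mirroring the calls in this PR.
int index = tokenizer.GetIndexByTokenCount(
    text,
    maxTokenCount: 1,
    out string? _,
    out int _,
    considerNormalization: false);

// Likely prints a fragment of the word (e.g. "international"), not the whole word.
Console.WriteLine(text.Substring(0, index));
```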

Contributor Author (@KrystofS):
I suggest adding only a warning to documentation and keeping the current behavior.

Contributor Author (@KrystofS):
@stephentoub, can we move on with this one? In my testing, this method actually performed the best in RAG tasks with the default settings on my test dataset.

Member:
I'll leave it up to @adamsitnik.

Member:
Assuming that it's documented, works fine in some cases, and our competitors provide a similar feature, I am fine with merging it.

> In my testing, this method actually performed the best in RAG tasks with the default settings on my test dataset.

Just out of curiosity, have you tried the HeaderChunker I've implemented?

text: contentToProcess.Span,
maxTokenCount: _maxTokensPerChunk - stringBuilderTokenCount,
out string? _,
out int _,
considerNormalization: false);

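// Append the first 'index' characters without allocating an intermediate string.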
unsafe
{
fixed (char* ptr = &MemoryMarshal.GetReference(contentToProcess.Span))
{
_ = stringBuilder.Append(ptr, index);
}
}
Comment on lines +69 to +75
Member:
I suspect you are using unsafe to avoid string allocation on .NET Standard / Full Framework. I don't believe it's worth the struggle (we aim not to use unsafe at all when possible).

Please follow the pattern of passing the span to the builder on modern .NET and allocating otherwise:

#if NET
stringBuilder.Append(chars);
#else
stringBuilder.Append(chars.ToString());
#endif

Member:
I recommended it. This could be a ton of string allocation, entirely unnecessarily. The unsafe use is very small and scoped and easily audited. I don't see a problem with it.

Contributor Author (@KrystofS):
I'm with @stephentoub on this one. I agree that the unsafe use is contained to a very small portion of the code.

Member:
We had a debate about the same thing in some other PR, and I removed the unsafe.

We could at least limit the unsafe to !NET:

#if NET
        stringBuilder.Append(chars);
#else
        // unsafe goes here
#endif

Or introduce an extension method that takes care of that on !NET (a sketch follows below).

But I don't want to block @KrystofS; we can deal with it later.

cc @EgorBo, who is leading the effort of unsafe removal.
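A minimal sketch of that extension-method suggestion (the AppendSpan helper and its name are hypothetical, not part of this PR; the fallback mirrors the fixed/MemoryMarshal pattern the PR already uses):

```csharp
// Hypothetical helper: span-based Append on modern .NET, a scoped unsafe fallback elsewhere.
using System;
using System.Runtime.InteropServices;
using System.Text;

internal static class StringBuilderExtensions
{
    public static StringBuilder AppendSpan(this StringBuilder builder, ReadOnlySpan<char> chars)
    {
#if NET
        return builder.Append(chars);
#else
        if (chars.IsEmpty)
        {
            return builder;
        }

        unsafe
        {
            // Same fixed/MemoryMarshal pattern the PR already uses; requires AllowUnsafeBlocks.
            fixed (char* ptr = &MemoryMarshal.GetReference(chars))
            {
                return builder.Append(ptr, chars.Length);
            }
        }
#endif
    }
}
```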

yield return FinalizeChunk();

contentToProcess = contentToProcess.Slice(index);
contentToProcessTokenCount = _tokenizer.CountTokens(contentToProcess.Span, considerNormalization: false);
}

_ = stringBuilder.Append(contentToProcess);
stringBuilderTokenCount += contentToProcessTokenCount;
}

if (stringBuilder.Length > 0)
{
yield return FinalizeChunk();
}
yield break;

IngestionChunk<string> FinalizeChunk()
{
IngestionChunk<string> chunk = new IngestionChunk<string>(
content: stringBuilder.ToString(),
document: document,
context: string.Empty);
_ = stringBuilder.Clear();
stringBuilderTokenCount = 0;

if (_chunkOverlap > 0)
{
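// Seed the next chunk with the last _chunkOverlap tokens of the chunk just emitted.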
int index = _tokenizer.GetIndexByTokenCountFromEnd(
text: chunk.Content,
maxTokenCount: _chunkOverlap,
out string? _,
out stringBuilderTokenCount,
considerNormalization: false);

ReadOnlySpan<char> overlapContent = chunk.Content.AsSpan().Slice(index);
unsafe
{
fixed (char* ptr = &MemoryMarshal.GetReference(overlapContent))
{
_ = stringBuilder.Append(ptr, overlapContent.Length);
}
}
}

return chunk;
}
}

}
}
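For orientation, a minimal usage sketch of the new chunker (hypothetical setup, mirroring the tests added later in this PR):

```csharp
// Hypothetical usage sketch; the document contents and option values are made up.
using System;
using Microsoft.Extensions.DataIngestion;
using Microsoft.Extensions.DataIngestion.Chunkers;
using Microsoft.ML.Tokenizers;

var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
IngestionChunker<string> chunker = new DocumentTokenChunker(
    new(tokenizer) { MaxTokensPerChunk = 512, OverlapTokens = 64 });

var doc = new IngestionDocument("sample");
doc.Sections.Add(new IngestionDocumentSection
{
    Elements = { new IngestionDocumentParagraph("Some text to chunk.") }
});

await foreach (IngestionChunk<string> chunk in chunker.ProcessAsync(doc))
{
    Console.WriteLine(chunk.Content);
}
```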
@@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Extension methods for <see cref="IngestionDocumentElement"/>.
/// </summary>
internal static class IngestionDocumentElementExtensions
{
/// <summary>
/// Gets the semantic content of the element if available.
/// </summary>
/// <param name="element">The element to get semantic content from.</param>
/// <returns>The semantic content suitable for embedding generation.</returns>
internal static string? GetSemanticContent(this IngestionDocumentElement element)
{
return element switch
{
IngestionDocumentImage image => image.AlternativeText ?? image.Text,
_ => element.GetMarkdown()
};
}
}
@@ -13,6 +13,7 @@
<EnablePackageValidation>false</EnablePackageValidation>
<MinCodeCoverage>75</MinCodeCoverage>
<MinMutationScore>75</MinMutationScore>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>

<ItemGroup>
@@ -0,0 +1,33 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
public abstract class DocumentTokenChunkerTests : DocumentChunkerTests
{
[Fact]
public async Task SingleChunkText()
{
string text = "This is a short document that fits within a single chunk.";
IngestionDocument doc = new IngestionDocument("singleChunkDoc");
doc.Sections.Add(new IngestionDocumentSection
{
Elements =
{
new IngestionDocumentParagraph(text)
}
});

IngestionChunker<string> chunker = CreateDocumentChunker();
IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();

IngestionChunk<string> chunk = Assert.Single(chunks);
Assert.Equal(text, chunk.Content, ignoreLineEndingDifferences: true);
}
}
}
@@ -0,0 +1,64 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
public class NoOverlapTokenChunkerTests : DocumentTokenChunkerTests
{
protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
{
var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
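// Note: this suite pins OverlapTokens to 0, so the overlapTokens parameter is ignored here.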
return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 });
}

[Fact]
public async Task TwoChunks()
{
string text = string.Join(" ", Enumerable.Repeat("word", 600)); // each word is 1 token
IngestionDocument doc = new IngestionDocument("twoChunksNoOverlapDoc");
doc.Sections.Add(new IngestionDocumentSection
{
Elements =
{
new IngestionDocumentParagraph(text)
}
});
IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: 512);
IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
Assert.Equal(2, chunks.Count);
Assert.True(chunks[0].Content.Split(' ').Length <= 512);
Assert.True(chunks[1].Content.Split(' ').Length <= 512);
Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)));
}

[Fact]
public async Task ManyChunks()
{
string text = string.Join(" ", Enumerable.Repeat("word", 1500)); // each word is 1 token
IngestionDocument doc = new IngestionDocument("smallChunksNoOverlapDoc");
doc.Sections.Add(new IngestionDocumentSection
{
Elements =
{
new IngestionDocumentParagraph(text)
}
});

IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: 200, overlapTokens: 0);
IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
Assert.Equal(8, chunks.Count);
foreach (var chunk in chunks)
{
Assert.True(chunk.Content.Split(' ').Count(str => str.Contains("word")) <= 200);
}

Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)));
}
}
}
@@ -0,0 +1,62 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
public class OverlapTokenChunkerTests : DocumentTokenChunkerTests
{
protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
{
var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = overlapTokens });
}

[Fact]
public async Task TokenChunking_WithOverlap()
{
string text = "The quick brown fox jumps over the lazy dog";
var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
int chunkSize = 4; // Small chunk size to demonstrate overlap
int chunkOverlap = 1;

var chunker = new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = chunkSize, OverlapTokens = chunkOverlap });
IngestionDocument doc = new IngestionDocument("overlapExample");
doc.Sections.Add(new IngestionDocumentSection
{
Elements =
{
new IngestionDocumentParagraph(text)
}
});

IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
Assert.Equal(3, chunks.Count);
Assert.Equal("The quick brown fox", chunks[0].Content, ignoreLineEndingDifferences: true);
Assert.Equal(" fox jumps over the", chunks[1].Content, ignoreLineEndingDifferences: true);
Assert.Equal(" the lazy dog", chunks[2].Content, ignoreLineEndingDifferences: true);

Assert.True(tokenizer.CountTokens(chunks.Last().Content) <= chunkSize);

for (int i = 0; i < chunks.Count - 1; i++)
{
var currentChunk = chunks[i];
var nextChunk = chunks[i + 1];

var currentWords = currentChunk.Content.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
var nextWords = nextChunk.Content.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

bool hasOverlap = currentWords.Intersect(nextWords).Any();
Assert.True(hasOverlap, $"Chunks {i} and {i + 1} should have overlapping content");
}

Assert.NotEmpty(string.Concat(chunks.Select(c => c.Content)));
}
}
}