Add DocumentTokenChunker #7093
@@ -0,0 +1,125 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion.Chunkers
{
    /// <summary>
    /// Processes a document by tokenizing its content and dividing it into overlapping chunks of tokens.
    /// </summary>
    /// <remarks>
    /// <para>This class uses a tokenizer to convert the document's content into tokens and then splits the
    /// tokens into chunks of a specified size, with a configurable overlap between consecutive chunks.</para>
    /// <para>Note that tables may be split mid-row.</para>
    /// </remarks>
    public sealed class DocumentTokenChunker : IngestionChunker<string>
    {
        private readonly Tokenizer _tokenizer;
        private readonly int _maxTokensPerChunk;
        private readonly int _chunkOverlap;

        /// <summary>
        /// Initializes a new instance of the <see cref="DocumentTokenChunker"/> class with the specified options.
        /// </summary>
        /// <param name="options">The options used to configure the chunker, including tokenizer and chunk sizes.</param>
        public DocumentTokenChunker(IngestionChunkerOptions options)
        {
            _ = Throw.IfNull(options);

            _tokenizer = options.Tokenizer;
            _maxTokensPerChunk = options.MaxTokensPerChunk;
            _chunkOverlap = options.OverlapTokens;
        }

        /// <inheritdoc/>
        public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document, [EnumeratorCancellation] CancellationToken cancellationToken = default)
        {
            _ = Throw.IfNull(document);

            int stringBuilderTokenCount = 0;
            StringBuilder stringBuilder = new();
            foreach (IngestionDocumentElement element in document.EnumerateContent())
            {
                cancellationToken.ThrowIfCancellationRequested();
                string? elementContent = element.GetSemanticContent();
                if (string.IsNullOrEmpty(elementContent))
                {
                    continue;
                }

                int contentToProcessTokenCount = _tokenizer.CountTokens(elementContent!, considerNormalization: false);
                ReadOnlyMemory<char> contentToProcess = elementContent.AsMemory();
                while (stringBuilderTokenCount + contentToProcessTokenCount >= _maxTokensPerChunk)
                {
                    int index = _tokenizer.GetIndexByTokenCount(
                        text: contentToProcess.Span,
                        maxTokenCount: _maxTokensPerChunk - stringBuilderTokenCount,
                        out string? _,
                        out int _,
                        considerNormalization: false);

                    unsafe
                    {
                        fixed (char* ptr = &MemoryMarshal.GetReference(contentToProcess.Span))
                        {
                            _ = stringBuilder.Append(ptr, index);
                        }
                    }
```
Comment on lines +69 to +75:

Member: I suspect you are using `unsafe` here to avoid string allocations. Please follow the pattern of passing the span to the builder on modern .NET and allocating otherwise: extensions/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs, lines 229 to 233 in e801349.

Member: I recommended it. This could be a ton of string allocation, entirely unnecessarily. The unsafe use is very small and scoped and easily audited. I don't see a problem with it.

Contributor (Author): I'm with @stephentoub on this one. I agree that the unsafe use is contained to a very small portion of the code.

Member: We had a debate about the same thing in some other PR and I removed the unsafe. We could at least move the unsafe behind `#if NET`:

```csharp
#if NET
    stringBuilder.Append(chars);
#else
    // unsafe goes here
#endif
```

Or introduce an extension method that takes care of that. But I don't want to block @KrystofS; we can deal with it later. cc @EgorBo, who is leading the effort of unsafe removal.
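As an illustration of that extension-method idea, here is a minimal sketch; the helper name `AppendSpan` and its shape are hypothetical, not part of this PR:

```csharp
// Hypothetical helper sketching the suggestion above; not part of this PR.
using System;
using System.Runtime.InteropServices;
using System.Text;

internal static class StringBuilderExtensions
{
    public static StringBuilder AppendSpan(this StringBuilder builder, ReadOnlySpan<char> chars)
    {
#if NET
        // Modern .NET exposes Append(ReadOnlySpan<char>) directly.
        return builder.Append(chars);
#else
        // Older targets: pin the span and use the char* overload,
        // mirroring the unsafe block in this PR.
        unsafe
        {
            fixed (char* ptr = &MemoryMarshal.GetReference(chars))
            {
                return builder.Append(ptr, chars.Length);
            }
        }
#endif
    }
}
```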
```csharp
                    yield return FinalizeChunk();

                    contentToProcess = contentToProcess.Slice(index);
                    contentToProcessTokenCount = _tokenizer.CountTokens(contentToProcess.Span, considerNormalization: false);
                }

                _ = stringBuilder.Append(contentToProcess);
                stringBuilderTokenCount += contentToProcessTokenCount;
            }

            if (stringBuilder.Length > 0)
            {
                yield return FinalizeChunk();
            }

            yield break;

            IngestionChunk<string> FinalizeChunk()
            {
                IngestionChunk<string> chunk = new IngestionChunk<string>(
                    content: stringBuilder.ToString(),
                    document: document,
                    context: string.Empty);
                _ = stringBuilder.Clear();
                stringBuilderTokenCount = 0;

                if (_chunkOverlap > 0)
                {
                    int index = _tokenizer.GetIndexByTokenCountFromEnd(
                        text: chunk.Content,
                        maxTokenCount: _chunkOverlap,
                        out string? _,
                        out stringBuilderTokenCount,
                        considerNormalization: false);

                    ReadOnlySpan<char> overlapContent = chunk.Content.AsSpan().Slice(index);
                    unsafe
                    {
                        fixed (char* ptr = &MemoryMarshal.GetReference(overlapContent))
                        {
                            _ = stringBuilder.Append(ptr, overlapContent.Length);
                        }
                    }
                }

                return chunk;
            }
        }
    }
}
```
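For context, a minimal usage sketch of the new chunker; the `IngestionChunkerOptions` constructor shape is assumed from how the tests below construct it:

```csharp
// A minimal usage sketch, assuming the options shape used by the tests below.
using System;
using Microsoft.Extensions.DataIngestion;
using Microsoft.Extensions.DataIngestion.Chunkers;
using Microsoft.ML.Tokenizers;

var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
var chunker = new DocumentTokenChunker(
    new(tokenizer) { MaxTokensPerChunk = 512, OverlapTokens = 64 });

IngestionDocument doc = new("sample");
doc.Sections.Add(new IngestionDocumentSection
{
    Elements = { new IngestionDocumentParagraph("Some long text to ingest...") }
});

// Chunks are produced lazily as the document's elements are tokenized.
await foreach (IngestionChunk<string> chunk in chunker.ProcessAsync(doc))
{
    Console.WriteLine(chunk.Content);
}
```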
@@ -0,0 +1,24 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Extension methods for <see cref="IngestionDocumentElement"/>.
/// </summary>
internal static class IngestionDocumentElementExtensions
{
    /// <summary>
    /// Gets the semantic content of the element if available.
    /// </summary>
    /// <param name="element">The element to get semantic content from.</param>
    /// <returns>The semantic content suitable for embedding generation.</returns>
    internal static string? GetSemanticContent(this IngestionDocumentElement element)
    {
        return element switch
        {
            IngestionDocumentImage image => image.AlternativeText ?? image.Text,
            _ => element.GetMarkdown()
        };
    }
}
```
@@ -0,0 +1,33 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public abstract class DocumentTokenChunkerTests : DocumentChunkerTests
    {
        [Fact]
        public async Task SingleChunkText()
        {
            string text = "This is a short document that fits within a single chunk.";
            IngestionDocument doc = new IngestionDocument("singleChunkDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });

            IngestionChunker<string> chunker = CreateDocumentChunker();
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();

            IngestionChunk<string> chunk = Assert.Single(chunks);
            Assert.Equal(text, chunk.Content, ignoreLineEndingDifferences: true);
        }
    }
}
```
@@ -0,0 +1,64 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public class NoOverlapTokenChunkerTests : DocumentTokenChunkerTests
    {
        protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
        {
            var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = 0 });
        }

        [Fact]
        public async Task TwoChunks()
        {
            string text = string.Join(" ", Enumerable.Repeat("word", 600)); // each word is 1 token
            IngestionDocument doc = new IngestionDocument("twoChunksNoOverlapDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });
            IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: 512);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(2, chunks.Count);
            Assert.True(chunks[0].Content.Split(' ').Length <= 512);
            Assert.True(chunks[1].Content.Split(' ').Length <= 512);
            Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)));
        }

        [Fact]
        public async Task ManyChunks()
        {
            string text = string.Join(" ", Enumerable.Repeat("word", 1500)); // each word is 1 token
            IngestionDocument doc = new IngestionDocument("smallChunksNoOverlapDoc");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });

            IngestionChunker<string> chunker = CreateDocumentChunker(maxTokensPerChunk: 200, overlapTokens: 0);
            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(8, chunks.Count);
            foreach (var chunk in chunks)
            {
                Assert.True(chunk.Content.Split(' ').Count(str => str.Contains("word")) <= 200);
            }

            Assert.Equal(text, string.Join("", chunks.Select(c => c.Content)));
        }
    }
}
```
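The expected counts follow from simple token arithmetic, assuming (as the comments note) that the gpt-4o tokenizer encodes each repeated " word" as one token: 600 tokens at a 512-token maximum split into 512 + 88, i.e. two chunks, and 1500 tokens at 200 per chunk yield ceil(1500/200) = 8 chunks. With OverlapTokens = 0 the chunks partition the input exactly, which is why both tests can assert that concatenating the chunks reproduces the original text.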
@@ -0,0 +1,62 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Tokenizers;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests
{
    public class OverlapTokenChunkerTests : DocumentTokenChunkerTests
    {
        protected override IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500)
        {
            var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            return new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = maxTokensPerChunk, OverlapTokens = overlapTokens });
        }

        [Fact]
        public async Task TokenChunking_WithOverlap()
        {
            string text = "The quick brown fox jumps over the lazy dog";
            var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
            int chunkSize = 4; // Small chunk size to demonstrate overlap
            int chunkOverlap = 1;

            var chunker = new DocumentTokenChunker(new(tokenizer) { MaxTokensPerChunk = chunkSize, OverlapTokens = chunkOverlap });
            IngestionDocument doc = new IngestionDocument("overlapExample");
            doc.Sections.Add(new IngestionDocumentSection
            {
                Elements =
                {
                    new IngestionDocumentParagraph(text)
                }
            });

            IReadOnlyList<IngestionChunk<string>> chunks = await chunker.ProcessAsync(doc).ToListAsync();
            Assert.Equal(3, chunks.Count);
            Assert.Equal("The quick brown fox", chunks[0].Content, ignoreLineEndingDifferences: true);
            Assert.Equal(" fox jumps over the", chunks[1].Content, ignoreLineEndingDifferences: true);
            Assert.Equal(" the lazy dog", chunks[2].Content, ignoreLineEndingDifferences: true);

            Assert.True(tokenizer.CountTokens(chunks.Last().Content) <= chunkSize);

            for (int i = 0; i < chunks.Count - 1; i++)
            {
                var currentChunk = chunks[i];
                var nextChunk = chunks[i + 1];

                var currentWords = currentChunk.Content.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                var nextWords = nextChunk.Content.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                bool hasOverlap = currentWords.Intersect(nextWords).Any();
                Assert.True(hasOverlap, $"Chunks {i} and {i + 1} should have overlapping content");
            }

            Assert.NotEmpty(string.Concat(chunks.Select(c => c.Content)));
        }
    }
}
```
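The expected chunks follow from the token arithmetic: the nine-word sentence encodes to nine tokens, the first chunk takes four ("The quick brown fox"), and each subsequent chunk begins with the one-token overlap carried over from the previous chunk (" fox", then " the"), producing " fox jumps over the" and finally " the lazy dog".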
Member: This doesn't appear to make any attempt to move the start/end of the chunk to a "good" location; e.g., it could be in the middle of a word?
Author: Correct. I don't think it is an issue, because any RAG system should be resilient enough not to be affected by this, assuming a reasonable overlap size. Similarly, this could take just a part of some table cell, etc.
Member: Maybe. But I see other chunking systems going to great lengths to try to find good boundaries for the chunks.
Member: @stephentoub, what other chunking systems are you referring to? LangChain's `TokenTextSplitter` could split any word in a similar fashion, and so could `TokenChunker` from Chonkie. I'd say this is true for token-count-based chunkers in general, not just this one.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest adding only a warning to documentation and keeping the current behavior.
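To illustrate the concern, a small sketch of how a pure token-count boundary can land inside a word; the printed prefix is hypothetical, since the exact split depends on the tokenizer's vocabulary:

```csharp
// Illustrative only: a multi-token word can be cut by a token-count boundary.
using System;
using Microsoft.ML.Tokenizers;

var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
string text = "Antidisestablishmentarianism is a long word.";

// Find the index covering at most 2 tokens; for a word that encodes to
// several tokens, that index can fall in the middle of the word.
int index = tokenizer.GetIndexByTokenCount(
    text: text.AsSpan(),
    maxTokenCount: 2,
    out string? _,
    out int _,
    considerNormalization: false);

Console.WriteLine(text.Substring(0, index)); // e.g. "Antidis", a partial word
```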
Author: @stephentoub, can we move on with this one? In my testing, this method actually performed the best in RAG tasks with the default settings on my test dataset.
Member:
I'll leave it up to @adamsitnik.
Member: Assuming that it's documented, works fine in some cases, and our competitors provide a similar feature, I am fine with merging it.

Just out of curiosity, have you tried the `HeaderChunker` I've implemented?