diff --git a/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt b/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt
index 788cbe016..fda22cc8e 100644
--- a/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt
+++ b/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt
@@ -2,6 +2,7 @@ package com.xebia.functional.xef.textsplitters
 
 import com.xebia.functional.tokenizer.Encoding
 import com.xebia.functional.tokenizer.ModelType
+import kotlin.ranges.IntProgression
 
 fun TokenTextSplitter(modelType: ModelType, chunkSize: Int, chunkOverlap: Int): TextSplitter =
   TokenTextSplitterImpl(modelType.encoding, chunkSize, chunkOverlap)
@@ -14,13 +15,13 @@ private class TokenTextSplitterImpl(
 
   override suspend fun splitText(text: String): List<String> {
     val inputIds = tokenizer.encode(text)
-    val stepSize = chunkSize - chunkOverlap
-    return inputIds.indices
-      .asSequence()
-      .filter { it % stepSize == 0 }
-      .map { startIdx -> inputIds.subList(startIdx, minOf(startIdx + chunkSize, inputIds.size)) }
-      .map { chunkIds -> tokenizer.decode(chunkIds) }
-      .toList()
+    // Decode the chunk of up to `chunkSize` tokens beginning at `startIdx`.
+    fun decodeSegment(startIdx: Int): String {
+      val end = minOf(startIdx + chunkSize, inputIds.size)
+      return tokenizer.decode(inputIds.subList(startIdx, end))
+    }
+    val starts = IntProgression.fromClosedRange(0, inputIds.size - 1, chunkSize - chunkOverlap)
+    return starts.map { startIdx -> decodeSegment(startIdx) }
   }
 
   override suspend fun splitDocuments(documents: List<String>): List<String> =