From 3b3b27c07126b37839cfb5460ebd0690fd6bd770 Mon Sep 17 00:00:00 2001 From: "Diego E. Alonso" Date: Thu, 22 Jun 2023 11:52:23 +0100 Subject: [PATCH] TokenTextSplitter Ref: extract method, use range. --- .../xef/textsplitters/TokenTextSplitter.kt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt b/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt index 788cbe016..fda22cc8e 100644 --- a/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt +++ b/core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt @@ -2,6 +2,7 @@ package com.xebia.functional.xef.textsplitters import com.xebia.functional.tokenizer.Encoding import com.xebia.functional.tokenizer.ModelType +import kotlin.ranges.IntProgression fun TokenTextSplitter(modelType: ModelType, chunkSize: Int, chunkOverlap: Int): TextSplitter = TokenTextSplitterImpl(modelType.encoding, chunkSize, chunkOverlap) @@ -14,14 +15,13 @@ private class TokenTextSplitterImpl( override suspend fun splitText(text: String): List { val inputIds = tokenizer.encode(text) - val stepSize = chunkSize - chunkOverlap - return inputIds.indices - .asSequence() - .filter { it % stepSize == 0 } - .map { startIdx -> inputIds.subList(startIdx, minOf(startIdx + chunkSize, inputIds.size)) } - .map { chunkIds -> tokenizer.decode(chunkIds) } - .toList() + fun decodeSegment(startIdx: Int): String { + val end = minOf(startIdx + chunkSize, inputIds.size) + return tokenizer.decode(inputIds.subList(startIdx, end)) + } + val segments = IntProgression.fromClosedRange(0, inputIds.size - 1, chunkSize - chunkOverlap) + return segments.toList().map { seg -> decodeSegment(seg) } } override suspend fun splitDocuments(documents: List): List =