Skip to content

Commit 5f83cb8

Browse files
committed
TokenTextSplitter Ref: extract method, use range.
1 parent 3a7a08d commit 5f83cb8

File tree

1 file changed

+7
-8
lines changed

1 file changed

+7
-8
lines changed

core/src/commonMain/kotlin/com/xebia/functional/xef/textsplitters/TokenTextSplitter.kt

+7-8
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package com.xebia.functional.xef.textsplitters
22

33
import com.xebia.functional.tokenizer.Encoding
44
import com.xebia.functional.tokenizer.ModelType
5+
import kotlin.ranges.IntProgression
56

67
/**
 * Creates a token-based [TextSplitter] backed by `TokenTextSplitterImpl`.
 *
 * @param modelType model whose `encoding` is handed to the implementation.
 * @param chunkSize forwarded unchanged to the implementation.
 * @param chunkOverlap forwarded unchanged to the implementation.
 * @return the newly constructed splitter.
 */
fun TokenTextSplitter(modelType: ModelType, chunkSize: Int, chunkOverlap: Int): TextSplitter {
    val encoding = modelType.encoding
    return TokenTextSplitterImpl(encoding, chunkSize, chunkOverlap)
}
@@ -14,14 +15,12 @@ private class TokenTextSplitterImpl(
1415

1516
override suspend fun splitText(text: String): List<String> {
1617
val inputIds = tokenizer.encode(text)
17-
val stepSize = chunkSize - chunkOverlap
18-
19-
return inputIds.indices
20-
.asSequence()
21-
.filter { it % stepSize == 0 }
22-
.map { startIdx -> inputIds.subList(startIdx, minOf(startIdx + chunkSize, inputIds.size)) }
23-
.map { chunkIds -> tokenizer.decode(chunkIds) }
24-
.toList()
18+
// Decodes the window of token ids that begins at `startIdx` and holds at most
// `chunkSize` ids; the exclusive end is clamped to the size of `inputIds`.
fun decodeSegment(startIdx: Int): String {
    val endExclusive = (startIdx + chunkSize).coerceAtMost(inputIds.size)
    val window = inputIds.subList(startIdx, endExclusive)
    return tokenizer.decode(window)
}
22+
// Distance between the start indices of two consecutive chunks.
val stride = chunkSize - chunkOverlap
// A non-positive stride cannot walk forward through the tokens. Previously a negative
// stride produced an empty progression (the text was silently dropped) and a zero
// stride failed with a generic "step must be non-zero" error; fail fast and say why.
require(stride > 0) {
    "chunkOverlap ($chunkOverlap) must be smaller than chunkSize ($chunkSize)"
}
// Every stride-th index of inputIds starts a chunk; empty input yields no chunks.
return (inputIds.indices step stride).map { startIdx -> decodeSegment(startIdx) }
2524
}
2625

2726
override suspend fun splitDocuments(documents: List<String>): List<String> =

0 commit comments

Comments
 (0)