@@ -2,6 +2,7 @@ package com.xebia.functional.xef.textsplitters
2
2
3
3
import com.xebia.functional.tokenizer.Encoding
4
4
import com.xebia.functional.tokenizer.ModelType
5
+ import kotlin.ranges.IntProgression
5
6
6
7
/**
 * Builds a [TextSplitter] backed by `TokenTextSplitterImpl`.
 *
 * The tokenizer handed to the implementation is the `encoding` of [modelType];
 * [chunkSize] and [chunkOverlap] are forwarded unchanged.
 *
 * @param modelType model whose `encoding` is used as the tokenizer.
 * @param chunkSize forwarded as the implementation's chunk size.
 * @param chunkOverlap forwarded as the implementation's chunk overlap.
 * @return the newly constructed splitter.
 */
fun TokenTextSplitter(modelType: ModelType, chunkSize: Int, chunkOverlap: Int): TextSplitter {
    val encoding = modelType.encoding
    return TokenTextSplitterImpl(encoding, chunkSize, chunkOverlap)
}
@@ -14,14 +15,12 @@ private class TokenTextSplitterImpl(
14
15
15
16
/**
 * Splits [text] into token-based chunks and decodes each chunk back to a string.
 *
 * The text is encoded with `tokenizer`; chunk start offsets are `0, step, 2 * step, ...`
 * where `step = chunkSize - chunkOverlap`, and each chunk holds at most `chunkSize`
 * token ids (the last chunks are truncated at the end of the input).
 *
 * @param text the text to split.
 * @return the decoded chunks in input order; empty when [text] encodes to no tokens.
 * @throws IllegalArgumentException if `chunkSize` is not positive or is not strictly
 *   greater than `chunkOverlap`.
 */
override suspend fun splitText(text: String): List<String> {
    require(chunkSize > 0) { "chunkSize must be positive, but was $chunkSize" }
    val stepSize = chunkSize - chunkOverlap
    // A zero step cannot form a progression and a negative one would silently
    // yield no chunks at all, so reject both with an explicit message.
    require(stepSize > 0) {
        "chunkSize ($chunkSize) must be greater than chunkOverlap ($chunkOverlap)"
    }

    val inputIds = tokenizer.encode(text)
    return inputIds.indices.step(stepSize).map { startIdx ->
        // Clamp via the remaining length so `startIdx + chunkSize` cannot overflow Int.
        val endIdx = startIdx + minOf(chunkSize, inputIds.size - startIdx)
        tokenizer.decode(inputIds.subList(startIdx, endIdx))
    }
}
26
25
27
26
override suspend fun splitDocuments (documents : List <String >): List <String > =
0 commit comments