Skip to content

Commit

Permalink
TokenTextSplitter Ref: extract method, use range.
Browse files: browse the repository at this point in the history
  • Loading branch information
diesalbla committed Jun 22, 2023
1 parent 3a7a08d commit 3b3b27c
Showing 1 changed file with 7 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package com.xebia.functional.xef.textsplitters

import com.xebia.functional.tokenizer.Encoding
import com.xebia.functional.tokenizer.ModelType
import kotlin.ranges.IntProgression

/**
 * Builds a token-based [TextSplitter] for the given model.
 *
 * The `encoding` of [modelType] and the two sizing arguments are passed, in that order,
 * to `TokenTextSplitterImpl`.
 *
 * @param modelType model whose `encoding` is handed to the implementation.
 * @param chunkSize forwarded unchanged to the implementation
 *   (presumably the maximum number of tokens per chunk; confirm against the implementation).
 * @param chunkOverlap forwarded unchanged to the implementation
 *   (presumably the number of tokens shared by consecutive chunks; confirm against the implementation).
 * @return the splitter instance created by `TokenTextSplitterImpl`.
 */
fun TokenTextSplitter(modelType: ModelType, chunkSize: Int, chunkOverlap: Int): TextSplitter {
  val encoding = modelType.encoding
  return TokenTextSplitterImpl(encoding, chunkSize, chunkOverlap)
}
Expand All @@ -14,14 +15,13 @@ private class TokenTextSplitterImpl(

/**
 * Splits [text] into token-based chunks and returns each chunk decoded back to a string.
 *
 * The text is encoded with `tokenizer`; the resulting token list is then sliced into windows
 * of at most `chunkSize` tokens whose start positions are `chunkSize - chunkOverlap` tokens
 * apart (so consecutive windows share up to `chunkOverlap` tokens). The final window is
 * truncated at the end of the token list.
 *
 * @param text the text to split.
 * @return the decoded chunks, in order of their start position; empty when [text] encodes
 *   to no tokens.
 * @throws IllegalArgumentException if `chunkSize - chunkOverlap` is not positive, since the
 *   window could then never advance.
 */
override suspend fun splitText(text: String): List<String> {
  val stepSize = chunkSize - chunkOverlap
  // Fail fast with a descriptive message instead of a division-by-zero or a silently
  // empty result further down.
  require(stepSize > 0) {
    "chunkSize ($chunkSize) must be greater than chunkOverlap ($chunkOverlap)"
  }

  val inputIds = tokenizer.encode(text)

  // Decodes the window that starts at startIdx, clamped to the end of the token list.
  fun decodeSegment(startIdx: Int): String {
    val end = minOf(startIdx + chunkSize, inputIds.size)
    return tokenizer.decode(inputIds.subList(startIdx, end))
  }

  // Window starts: 0, stepSize, 2 * stepSize, ... up to the last valid index.
  return (inputIds.indices step stepSize).map { startIdx -> decodeSegment(startIdx) }
}

override suspend fun splitDocuments(documents: List<String>): List<String> =
Expand Down

0 comments on commit 3b3b27c

Please sign in to comment.