Skip to content

Commit

Permalink
TokenTextSplitter Ref: extract method, use range.
Browse files Browse the repository at this point in the history
  • Loading branch information
diesalbla committed Jun 22, 2023
1 parent 3a7a08d commit c4bf421
Showing 1 changed file with 9 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package com.xebia.functional.xef.textsplitters

import com.xebia.functional.tokenizer.Encoding
import com.xebia.functional.tokenizer.ModelType
import kotlin.ranges.IntProgression

/**
 * Factory for the token-based [TextSplitter] implementation.
 *
 * @param modelType model whose `encoding` is handed to the implementation.
 * @param chunkSize forwarded unchanged to the implementation.
 * @param chunkOverlap forwarded unchanged to the implementation.
 * @return a new `TokenTextSplitterImpl`, exposed through the [TextSplitter] type.
 */
fun TokenTextSplitter(modelType: ModelType, chunkSize: Int, chunkOverlap: Int): TextSplitter {
    val encoding = modelType.encoding
    return TokenTextSplitterImpl(encoding, chunkSize, chunkOverlap)
}
Expand All @@ -16,12 +17,14 @@ private class TokenTextSplitterImpl(
val inputIds = tokenizer.encode(text)
val stepSize = chunkSize - chunkOverlap

return inputIds.indices
.asSequence()
.filter { it % stepSize == 0 }
.map { startIdx -> inputIds.subList(startIdx, minOf(startIdx + chunkSize, inputIds.size)) }
.map { chunkIds -> tokenizer.decode(chunkIds) }
.toList()
fun decodeSegment(startIdx: Int): String {
val end = minOf(startIdx + chunkSize, inputIds.size)
return tokenizer.decode(inputIds.subList(startIdx, end))
}

return IntProgression.fromClosedRange(0, inputIds.size - 1, stepSize).toList().map { seg ->
decodeSegment(seg)
}
}

override suspend fun splitDocuments(documents: List<String>): List<String> =
Expand Down

0 comments on commit c4bf421

Please sign in to comment.