Skip to content

Commit

Permalink
TokenTextSplitter Ref: extract method, use range.
Browse files Browse the repository at this point in the history
Co-authored-by: Simon Vergauwen <[email protected]>
  • Loading branch information
diesalbla and nomisRev committed Jun 23, 2023
1 parent 3a7a08d commit 4ae7d77
Showing 1 changed file with 6 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@ private class TokenTextSplitterImpl(

/**
 * Splits [text] into overlapping chunks measured in tokens.
 *
 * The text is encoded with `tokenizer`; windows of at most `chunkSize` token ids are then
 * taken, each starting `chunkSize - chunkOverlap` ids after the previous one, and every
 * window is decoded back into a string. Windows near the end of the input may hold fewer
 * than `chunkSize` ids because the end index is clamped to the number of encoded ids.
 *
 * @param text the text to split.
 * @return the decoded chunks, ordered by start offset; empty when encoding yields no ids.
 * @throws IllegalArgumentException if `chunkOverlap` is not smaller than `chunkSize`.
 */
override suspend fun splitText(text: String): List<String> {
    // The stride between consecutive window starts must be positive, otherwise the
    // windows would never advance through the input.
    val stepSize = chunkSize - chunkOverlap
    require(stepSize > 0) {
        "chunkOverlap ($chunkOverlap) must be smaller than chunkSize ($chunkSize)"
    }

    val inputIds = tokenizer.encode(text)

    // Decodes the window that starts at startIdx, clamped to the end of the input.
    fun decodeSegment(startIdx: Int): String {
        val endIdx = minOf(startIdx + chunkSize, inputIds.size)
        return tokenizer.decode(inputIds.subList(startIdx, endIdx))
    }

    // Visit only the window start offsets (0, stepSize, 2 * stepSize, ...) instead of
    // scanning every index and filtering by modulo.
    val segmentStarts = inputIds.indices step stepSize
    return segmentStarts.map { decodeSegment(it) }
}

override suspend fun splitDocuments(documents: List<String>): List<String> =
Expand Down

0 comments on commit 4ae7d77

Please sign in to comment.