Add Encoding.truncateText (#84)

nomisRev · web-flow · commit 62070bc9ba4f · 2023-05-22T12:42:54.000+02:00
* Add Encoding.truncateText

* Add warning
diff --git a/tokenizer/src/commonMain/kotlin/com/xebia/functional/tokenizer/Encoding.kt b/tokenizer/src/commonMain/kotlin/com/xebia/functional/tokenizer/Encoding.kt
@@ -1,5 +1,7 @@
 package com.xebia.functional.tokenizer
 
+import kotlin.math.roundToInt
+
 /** The result of encoding operation. */
 data class EncodingResult(
   val tokens: List<Int>,
@@ -199,3 +201,34 @@ interface Encoding {
    */
   fun decodeBytes(tokens: List<Int>): ByteArray
 }
+
+/**
+ * Truncates the given [text] so that it fits within [maxTokens] tokens by cutting characters
+ * from the tail of the [text].
+ *
+ * The number of characters to keep is estimated from the ratio between [maxTokens] and the
+ * token count of the [text]. While the result still exceeds [maxTokens] it is truncated again;
+ * every attempt drops at least one character, so the function always terminates.
+ *
+ * The cut never leaves a dangling high surrogate at the end of the returned text.
+ *
+ * **WARNING:** This method might truncate crucial information from your prompt,
+ * and as a result might degrade reliability of your prompts.
+ *
+ * @param text the text to truncate.
+ * @param maxTokens the token limit; if zero or negative, a [text] that does not fit yields "".
+ * @return the [text] itself when it already fits, otherwise a prefix of it.
+ */
+tailrec fun Encoding.truncateText(text: String, maxTokens: Int): String {
+  val tokenCount = countTokens(text)
+  if (tokenCount <= maxTokens) return text
+  // No prefix can fit a non-positive limit, and an empty text cannot be shortened any further.
+  if (maxTokens <= 0 || text.isEmpty()) return ""
+  val percentage = maxTokens.toDouble() / tokenCount.toDouble()
+  // Drop at least one character per attempt, otherwise rounding may yield the same text forever.
+  val estimate = (text.length * percentage).roundToInt().coerceIn(0, text.length - 1)
+  // Step back when the cut would split a surrogate pair and leave a dangling high surrogate.
+  val end = if (estimate > 0 && text[estimate - 1].isHighSurrogate()) estimate - 1 else estimate
+  val result = text.substring(0, end)
+  return if (countTokens(result) > maxTokens) truncateText(result, maxTokens) else result
+}
diff --git a/tokenizer/src/commonTest/kotlin/com/xebia/functional/tokenizer/EncodingTest.kt b/tokenizer/src/commonTest/kotlin/com/xebia/functional/tokenizer/EncodingTest.kt
@@ -0,0 +1,20 @@
+package com.xebia.functional.tokenizer
+
+import com.goncalossilva.resources.Resource
+import io.kotest.assertions.withClue
+import io.kotest.matchers.ints.shouldBeLessThan
+import io.kotest.matchers.ints.shouldBeLessThanOrEqual
+import kotlin.test.Test
+
+/** Checks that [truncateText] keeps every input of the cl100k_base CSV resource within 10 tokens. */
+class EncodingTest {
+  private val resource = Resource("src/commonTest/resources/cl100k_base_encodings.csv")
+  private val encoding = EncodingType.CL100K_BASE.encoding
+  @Test
+  fun truncateText() {
+    resource.splitCSV().forEach { (input, _, _) ->
+      val result = encoding.truncateText(input, 10)
+      withClue("input: $input") { encoding.countTokens(result) shouldBeLessThanOrEqual 10 }
+    }
+  }
+}