From 6ee648aac81da352a01b1925bfde6d38da1f0c9d Mon Sep 17 00:00:00 2001 From: David Vega Lichacz <7826728+realdavidvega@users.noreply.github.com> Date: Mon, 24 Apr 2023 11:45:37 +0200 Subject: [PATCH] Port BaseLoader + BaseTextSplitter interfaces and TextLoader + CharacterTextSplitter impl. (#8) * feat: create basic text loader and splitter * feat: add JVM and native TextLoaders * test: add TextLoader simple test * test: add CharacterTextSplitter simple test * style: add new lines at end of some files * feat: apply suggestion of using nested use on Source and Buffer --- .../com/xebia/functional/domain/Document.kt | 3 + .../xebia/functional/loaders/BaseLoader.kt | 9 +++ .../xebia/functional/loaders/TextLoader.kt | 36 +++++++++++ .../xebia/functional/prompt/PromptTemplate.kt | 12 ++-- .../textsplitters/BaseTextSplitter.kt | 9 +++ .../textsplitters/CharacterTextSplitter.kt | 17 ++++++ .../functional/loaders/TextLoaderSpec.kt | 59 +++++++++++++++++++ .../CharacterTextSplitterSpec.kt | 52 ++++++++++++++++ .../xebia/functional/loaders/TextLoaderJVM.kt | 9 +++ .../xebia/functional/loaders/TextLoader.kt | 9 +++ 10 files changed, 207 insertions(+), 8 deletions(-) create mode 100644 src/commonMain/kotlin/com/xebia/functional/domain/Document.kt create mode 100644 src/commonMain/kotlin/com/xebia/functional/loaders/BaseLoader.kt create mode 100644 src/commonMain/kotlin/com/xebia/functional/loaders/TextLoader.kt create mode 100644 src/commonMain/kotlin/com/xebia/functional/textsplitters/BaseTextSplitter.kt create mode 100644 src/commonMain/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitter.kt create mode 100644 src/commonTest/kotlin/com/xebia/functional/loaders/TextLoaderSpec.kt create mode 100644 src/commonTest/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitterSpec.kt create mode 100644 src/jvmMain/kotlin/com/xebia/functional/loaders/TextLoaderJVM.kt create mode 100644 src/nativeMain/kotlin/com/xebia/functional/loaders/TextLoader.kt diff --git 
a/src/commonMain/kotlin/com/xebia/functional/domain/Document.kt b/src/commonMain/kotlin/com/xebia/functional/domain/Document.kt new file mode 100644 index 000000000..0eebbe86a --- /dev/null +++ b/src/commonMain/kotlin/com/xebia/functional/domain/Document.kt @@ -0,0 +1,3 @@ +package com.xebia.functional.domain + +data class Document(val content: String) diff --git a/src/commonMain/kotlin/com/xebia/functional/loaders/BaseLoader.kt b/src/commonMain/kotlin/com/xebia/functional/loaders/BaseLoader.kt new file mode 100644 index 000000000..f2b92d38d --- /dev/null +++ b/src/commonMain/kotlin/com/xebia/functional/loaders/BaseLoader.kt @@ -0,0 +1,9 @@ +package com.xebia.functional.loaders + +import com.xebia.functional.domain.Document +import com.xebia.functional.textsplitters.BaseTextSplitter + +interface BaseLoader { + suspend fun load(): List<Document> + suspend fun loadAndSplit(textSplitter: BaseTextSplitter): List<Document> +} diff --git a/src/commonMain/kotlin/com/xebia/functional/loaders/TextLoader.kt b/src/commonMain/kotlin/com/xebia/functional/loaders/TextLoader.kt new file mode 100644 index 000000000..2a55c47e9 --- /dev/null +++ b/src/commonMain/kotlin/com/xebia/functional/loaders/TextLoader.kt @@ -0,0 +1,36 @@ +package com.xebia.functional.loaders + +import com.xebia.functional.domain.Document +import com.xebia.functional.textsplitters.BaseTextSplitter +import okio.FileSystem +import okio.Path + +/** + * Creates a TextLoader based on a Path + * JVM & Native have overloads for FileSystem.SYSTEM, + * on NodeJs you need to manually pass FileSystem.SYSTEM. + * + * This function can currently not be used on the browser. 
+ * + * https://github.com/square/okio/issues/1070 + * https://youtrack.jetbrains.com/issue/KT-47038 + */ +suspend fun TextLoader( + filePath: Path, + fileSystem: FileSystem +): BaseLoader = object : BaseLoader { + + override suspend fun load(): List<Document> = + buildList { + fileSystem.read(filePath) { + while (true) { + val line = readUtf8Line() ?: break + val document = Document(line) + add(document) + } + } + } + + override suspend fun loadAndSplit(textSplitter: BaseTextSplitter): List<Document> = + textSplitter.splitDocuments(documents = load()) +} diff --git a/src/commonMain/kotlin/com/xebia/functional/prompt/PromptTemplate.kt b/src/commonMain/kotlin/com/xebia/functional/prompt/PromptTemplate.kt index fe1f408ba..1c6e03194 100644 --- a/src/commonMain/kotlin/com/xebia/functional/prompt/PromptTemplate.kt +++ b/src/commonMain/kotlin/com/xebia/functional/prompt/PromptTemplate.kt @@ -3,8 +3,6 @@ package com.xebia.functional.prompt import arrow.core.raise.Raise import okio.FileSystem import okio.Path -import okio.buffer -import okio.use fun Raise.PromptTemplate( examples: List, @@ -38,12 +36,10 @@ suspend fun Raise.PromptTemplate( variables: List, fileSystem: FileSystem ): PromptTemplate = - fileSystem.source(path).use { source -> - source.buffer().use { buffer -> - val template = buffer.readUtf8() - val config = Config(template, variables) - PromptTemplate(config) - } + fileSystem.read(path) { + val template = readUtf8() + val config = Config(template, variables) + PromptTemplate(config) } interface PromptTemplate { diff --git a/src/commonMain/kotlin/com/xebia/functional/textsplitters/BaseTextSplitter.kt b/src/commonMain/kotlin/com/xebia/functional/textsplitters/BaseTextSplitter.kt new file mode 100644 index 000000000..a428139c4 --- /dev/null +++ b/src/commonMain/kotlin/com/xebia/functional/textsplitters/BaseTextSplitter.kt @@ -0,0 +1,9 @@ +package com.xebia.functional.textsplitters + +import com.xebia.functional.domain.Document + +interface BaseTextSplitter { + suspend fun 
splitText(text: String): List<String> + suspend fun splitDocuments(documents: List<Document>): List<Document> + suspend fun splitTextInDocuments(text: String): List<Document> +} diff --git a/src/commonMain/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitter.kt b/src/commonMain/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitter.kt new file mode 100644 index 000000000..cd9c19866 --- /dev/null +++ b/src/commonMain/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitter.kt @@ -0,0 +1,17 @@ +package com.xebia.functional.textsplitters + +import com.xebia.functional.domain.Document + +suspend fun CharacterTextSplitter( + separator: String +): BaseTextSplitter = object : BaseTextSplitter { + + override suspend fun splitText(text: String): List<String> = + text.split(separator) + + override suspend fun splitDocuments(documents: List<Document>): List<Document> = + documents.flatMap { doc -> doc.content.split(separator) }.map(::Document) + + override suspend fun splitTextInDocuments(text: String): List<Document> = + text.split(separator).map(::Document) +} diff --git a/src/commonTest/kotlin/com/xebia/functional/loaders/TextLoaderSpec.kt b/src/commonTest/kotlin/com/xebia/functional/loaders/TextLoaderSpec.kt new file mode 100644 index 000000000..99f7e9a9c --- /dev/null +++ b/src/commonTest/kotlin/com/xebia/functional/loaders/TextLoaderSpec.kt @@ -0,0 +1,59 @@ +package com.xebia.functional.loaders + +import com.xebia.functional.domain.Document +import com.xebia.functional.textsplitters.CharacterTextSplitter +import io.kotest.core.spec.style.StringSpec +import io.kotest.matchers.shouldBe +import okio.Path.Companion.toPath +import okio.fakefilesystem.FakeFileSystem + +class TextLoaderSpec : StringSpec({ + "should return a list of documents with the contents of each line of the specified file" { + val fileSystem = FakeFileSystem().apply { + val templates = "templates".toPath() + createDirectory(templates) + val example = templates / "example.txt" + write(example) { + writeUtf8( + """ + |Lorem ipsum dolor sit amet, 
consectetur adipiscing elit. + |Sed do eiusmod tempor incididunt, ut labore et dolore magna aliqua. + """.trimMargin() + ) + } + } + val textLoader = TextLoader("templates/example.txt".toPath(), fileSystem) + val documentList = textLoader.load() + + documentList shouldBe listOf( + Document("Lorem ipsum dolor sit amet, consectetur adipiscing elit."), + Document("Sed do eiusmod tempor incididunt, ut labore et dolore magna aliqua.") + ) + } + + "should return a list of documents with the contents of each trim of the specified file" { + val fileSystem = FakeFileSystem().apply { + val templates = "templates".toPath() + createDirectory(templates) + val example = templates / "example.txt" + write(example) { + writeUtf8( + """ + |Lorem ipsum dolor sit amet, consectetur adipiscing elit. + |Sed do eiusmod tempor incididunt, ut labore et dolore magna aliqua. + """.trimMargin() + ) + } + } + val textLoader = TextLoader("templates/example.txt".toPath(), fileSystem) + val textSplitter = CharacterTextSplitter(", ") + val documentList = textLoader.loadAndSplit(textSplitter) + + documentList shouldBe listOf( + Document("Lorem ipsum dolor sit amet"), + Document("consectetur adipiscing elit."), + Document("Sed do eiusmod tempor incididunt"), + Document("ut labore et dolore magna aliqua.") + ) + } +}) diff --git a/src/commonTest/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitterSpec.kt b/src/commonTest/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitterSpec.kt new file mode 100644 index 000000000..307ccee27 --- /dev/null +++ b/src/commonTest/kotlin/com/xebia/functional/textsplitters/CharacterTextSplitterSpec.kt @@ -0,0 +1,52 @@ +package com.xebia.functional.textsplitters + +import com.xebia.functional.domain.Document +import io.kotest.core.spec.style.StringSpec +import io.kotest.matchers.shouldBe + +class CharacterTextSplitterSpec : StringSpec({ + "should return a list of strings after split with a given separator" { + + val text = "Lorem ipsum dolor sit amet, 
consectetur adipiscing elit." + + val separator = ", " + val textSplitter = CharacterTextSplitter(separator) + + textSplitter.splitText(text) shouldBe listOf( + "Lorem ipsum dolor sit amet", + "consectetur adipiscing elit." + ) + } + + "should return a list of documents after split on a list of documents with a given separator" { + + val documents = listOf( + Document("Lorem ipsum dolor sit amet, consectetur adipiscing elit."), + Document("Sed do eiusmod tempor incididunt, ut labore et dolore magna aliqua.") + ) + + val separator = ", " + val textSplitter = CharacterTextSplitter(separator) + + textSplitter.splitDocuments(documents) shouldBe listOf( + Document("Lorem ipsum dolor sit amet"), + Document("consectetur adipiscing elit."), + Document("Sed do eiusmod tempor incididunt"), + Document("ut labore et dolore magna aliqua.") + ) + } + + "should return a list of documents after split on a text with a given separator" { + + val text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, ut labore et dolore magna aliqua." 
+ + val separator = ", " + val textSplitter = CharacterTextSplitter(separator) + + textSplitter.splitTextInDocuments(text) shouldBe listOf( + Document("Lorem ipsum dolor sit amet"), + Document("consectetur adipiscing elit"), + Document("ut labore et dolore magna aliqua.") + ) + } +}) diff --git a/src/jvmMain/kotlin/com/xebia/functional/loaders/TextLoaderJVM.kt b/src/jvmMain/kotlin/com/xebia/functional/loaders/TextLoaderJVM.kt new file mode 100644 index 000000000..718a92e67 --- /dev/null +++ b/src/jvmMain/kotlin/com/xebia/functional/loaders/TextLoaderJVM.kt @@ -0,0 +1,9 @@ +package com.xebia.functional.loaders + +import okio.FileSystem +import okio.Path + +suspend fun TextLoader( + filePath: Path +): BaseLoader = + TextLoader(filePath, FileSystem.SYSTEM) diff --git a/src/nativeMain/kotlin/com/xebia/functional/loaders/TextLoader.kt b/src/nativeMain/kotlin/com/xebia/functional/loaders/TextLoader.kt new file mode 100644 index 000000000..2bc87c6f1 --- /dev/null +++ b/src/nativeMain/kotlin/com/xebia/functional/loaders/TextLoader.kt @@ -0,0 +1,9 @@ +package com.xebia.functional.loaders + +import okio.FileSystem +import okio.Path + +suspend fun TextLoader( + filePath: Path +): BaseLoader = + TextLoader(filePath, FileSystem.SYSTEM)