From f3006ec4dc9e36549d27498813306e9238c2d19a Mon Sep 17 00:00:00 2001 From: Alejandro Serrano Date: Mon, 15 May 2023 10:41:30 +0200 Subject: [PATCH 1/3] Lucene as vector store --- core/build.gradle.kts | 1 + .../xebia/functional/vectorstores/Lucene.kt | 79 +++++++++++++++++++ gradle/libs.versions.toml | 2 + 3 files changed, 82 insertions(+) create mode 100644 core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 777a765b8..85a37bdc4 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -81,6 +81,7 @@ kotlin { implementation(libs.jtokk.it) implementation(libs.scrape.it) implementation(libs.rss.reader) + implementation(libs.apache.lucene) } } diff --git a/core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt b/core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt new file mode 100644 index 000000000..faeb15b70 --- /dev/null +++ b/core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt @@ -0,0 +1,79 @@ +package com.xebia.functional.vectorstores + +import com.xebia.functional.embeddings.Embedding +import com.xebia.functional.embeddings.Embeddings +import com.xebia.functional.llm.openai.EmbeddingModel +import com.xebia.functional.llm.openai.RequestConfig +import org.apache.lucene.document.Document +import org.apache.lucene.document.Field +import org.apache.lucene.document.KnnFloatVectorField +import org.apache.lucene.document.TextField +import org.apache.lucene.index.* +import org.apache.lucene.search.FuzzyQuery +import org.apache.lucene.search.IndexSearcher +import org.apache.lucene.search.KnnFloatVectorQuery +import org.apache.lucene.search.Query +import org.apache.lucene.store.Directory +import org.apache.lucene.store.MMapDirectory +import java.nio.file.Path + +open class Lucene( + private val writer: IndexWriter, + private val searcher: IndexSearcher, + private val embeddings: Embeddings, + private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN +): VectorStore, AutoCloseable { + + constructor( + writer: IndexWriter, + embeddings: Embeddings, + similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN + ): this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity) + + private val requestConfig = + RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user")) + + override suspend fun addTexts(texts: List) = texts.forEach { + val embedding = embeddings.embedQuery(it, requestConfig) + val doc = Document().apply { + add(TextField("contents", it, Field.Store.YES)) + add(KnnFloatVectorField("embedding", embedding.toFloatArray(), similarity)) + } + writer.addDocument(doc) + } + + override suspend fun similaritySearch(query: String, limit: Int): List = + search(FuzzyQuery(Term("contents", query)), limit) + + override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List = + search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit) + + private fun search(q: Query, limit: Int): List = searcher.search(q, limit).scoreDocs.map { + searcher.storedFields().document(it.doc).get("contents") + } + + override fun close() { + writer.close() + } +} + +class DirectoryLucene( + private val directory: Directory, + writerConfig: IndexWriterConfig = IndexWriterConfig(), + embeddings: Embeddings, + similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN +): Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) { + override fun close() { + super.close() + directory.close() + } +} + +fun InMemoryLucene( + path: Path, + writerConfig: IndexWriterConfig = IndexWriterConfig(), + embeddings: Embeddings, + similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN +): DirectoryLucene = DirectoryLucene(MMapDirectory(path), writerConfig, embeddings, similarity) + +fun List.toFloatArray(): FloatArray = flatMap { it.data }.toFloatArray() diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index dfbd90e91..1c58feab4 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,6 +33,7 @@ munitCatsEffect = "1.0.5" scrapeit = "1.1.5" jtokkit = "0.4.0" rssreader = "3.4.4" +lucene = "9.6.0" [libraries] arrow-core = { module = "io.arrow-kt:arrow-core", version.ref = "arrow" } @@ -89,6 +90,7 @@ scrape-it-browser-fetcher = { module = "it.skrape:skrapeit-browser-fetcher", ver scrape-it-async-fetcher = { module = "it.skrape:skrapeit-asyn-fetcher", version.ref = "scrapeit" } jtokk-it = { module = "com.knuddels:jtokkit", version.ref = "jtokkit" } rss-reader = { module = "com.apptasticsoftware:rssreader", version.ref = "rssreader" } +apache-lucene = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" } [bundles] arrow = [ From ee525f5be0ec7a295b0c5a9762c7afdfa6c0d752 Mon Sep 17 00:00:00 2001 From: Alejandro Serrano Date: Mon, 15 May 2023 11:40:33 +0200 Subject: [PATCH 2/3] Split into an integration module --- core/build.gradle.kts | 1 - .../xebia/functional/vectorstores/Lucene.kt | 79 ------------------ integrations/lucene/build.gradle.kts | 21 +++++ .../xebia/functional/vectorstores/Lucene.kt | 82 +++++++++++++++++++ settings.gradle.kts | 5 +- 5 files changed, 107 insertions(+), 81 deletions(-) delete mode 100644 core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt create mode 100644 integrations/lucene/build.gradle.kts create mode 100644 integrations/lucene/src/main/kotlin/com/xebia/functional/vectorstores/Lucene.kt diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 85a37bdc4..777a765b8 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -81,7 +81,6 @@ kotlin { implementation(libs.jtokk.it) implementation(libs.scrape.it) implementation(libs.rss.reader) - implementation(libs.apache.lucene) } } diff --git a/core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt b/core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt deleted file mode 100644 index faeb15b70..000000000 --- a/core/src/jvmMain/kotlin/com/xebia/functional/vectorstores/Lucene.kt +++ /dev/null @@ -1,79 +0,0 @@ -package com.xebia.functional.vectorstores - -import com.xebia.functional.embeddings.Embedding -import com.xebia.functional.embeddings.Embeddings -import com.xebia.functional.llm.openai.EmbeddingModel -import com.xebia.functional.llm.openai.RequestConfig -import org.apache.lucene.document.Document -import org.apache.lucene.document.Field -import org.apache.lucene.document.KnnFloatVectorField -import org.apache.lucene.document.TextField -import org.apache.lucene.index.* -import org.apache.lucene.search.FuzzyQuery -import org.apache.lucene.search.IndexSearcher -import org.apache.lucene.search.KnnFloatVectorQuery -import org.apache.lucene.search.Query -import org.apache.lucene.store.Directory -import org.apache.lucene.store.MMapDirectory -import java.nio.file.Path - -open class Lucene( - private val writer: IndexWriter, - private val searcher: IndexSearcher, - private val embeddings: Embeddings, - private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN -): VectorStore, AutoCloseable { - - constructor( - writer: IndexWriter, - embeddings: Embeddings, - similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN - ): this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity) - - private val requestConfig = - RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user")) - - override suspend fun addTexts(texts: List) = texts.forEach { - val embedding = embeddings.embedQuery(it, requestConfig) - val doc = Document().apply { - add(TextField("contents", it, Field.Store.YES)) - add(KnnFloatVectorField("embedding", embedding.toFloatArray(), similarity)) - } - writer.addDocument(doc) - } - - override suspend fun similaritySearch(query: String, limit: Int): List = - search(FuzzyQuery(Term("contents", query)), limit) - - override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List = - search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit) - - private fun search(q: Query, limit: Int): List = searcher.search(q, limit).scoreDocs.map { - searcher.storedFields().document(it.doc).get("contents") - } - - override fun close() { - writer.close() - } -} - -class DirectoryLucene( - private val directory: Directory, - writerConfig: IndexWriterConfig = IndexWriterConfig(), - embeddings: Embeddings, - similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN -): Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) { - override fun close() { - super.close() - directory.close() - } -} - -fun InMemoryLucene( - path: Path, - writerConfig: IndexWriterConfig = IndexWriterConfig(), - embeddings: Embeddings, - similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN -): DirectoryLucene = DirectoryLucene(MMapDirectory(path), writerConfig, embeddings, similarity) - -fun List.toFloatArray(): FloatArray = flatMap { it.data }.toFloatArray() diff --git a/integrations/lucene/build.gradle.kts b/integrations/lucene/build.gradle.kts new file mode 100644 index 000000000..1cfd3877e --- /dev/null +++ b/integrations/lucene/build.gradle.kts @@ -0,0 +1,21 @@ +plugins { + id(libs.plugins.kotlin.jvm.get().pluginId) + id(libs.plugins.kotlinx.serialization.get().pluginId) +} + +repositories { + mavenCentral() +} + +java { + sourceCompatibility = JavaVersion.VERSION_11 + targetCompatibility = JavaVersion.VERSION_11 + toolchain { + languageVersion = JavaLanguageVersion.of(11) + } +} + +dependencies { + implementation(projects.langchain4kCore) + implementation(libs.apache.lucene) +} diff --git a/integrations/lucene/src/main/kotlin/com/xebia/functional/vectorstores/Lucene.kt b/integrations/lucene/src/main/kotlin/com/xebia/functional/vectorstores/Lucene.kt new file mode 100644 index 000000000..040f40f4e --- /dev/null +++ b/integrations/lucene/src/main/kotlin/com/xebia/functional/vectorstores/Lucene.kt @@ -0,0 +1,82 @@ +package com.xebia.functional.vectorstores + +import com.xebia.functional.embeddings.Embedding +import com.xebia.functional.embeddings.Embeddings +import com.xebia.functional.llm.openai.EmbeddingModel +import com.xebia.functional.llm.openai.RequestConfig +import java.nio.file.Path +import org.apache.lucene.document.Document +import org.apache.lucene.document.Field +import org.apache.lucene.document.KnnFloatVectorField +import org.apache.lucene.document.TextField +import org.apache.lucene.index.* +import org.apache.lucene.search.FuzzyQuery +import org.apache.lucene.search.IndexSearcher +import org.apache.lucene.search.KnnFloatVectorQuery +import org.apache.lucene.search.Query +import org.apache.lucene.store.Directory +import org.apache.lucene.store.MMapDirectory + +open class Lucene( + private val writer: IndexWriter, + private val searcher: IndexSearcher, + private val embeddings: Embeddings, + private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN +) : VectorStore, AutoCloseable { + + constructor( + writer: IndexWriter, + embeddings: Embeddings, + similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN + ) : this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity) + + private val requestConfig = + RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user")) + + override suspend fun addTexts(texts: List) = + texts.forEach { + val embedding = embeddings.embedQuery(it, requestConfig) + val doc = + Document().apply { + add(TextField("contents", it, Field.Store.YES)) + add(KnnFloatVectorField("embedding", embedding.toFloatArray(), similarity)) + } + writer.addDocument(doc) + } + + override suspend fun similaritySearch(query: String, limit: Int): List = + search(FuzzyQuery(Term("contents", query)), limit) + + override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List = + search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit) + + private fun search(q: Query, limit: Int): List = + searcher.search(q, limit).scoreDocs.map { + searcher.storedFields().document(it.doc).get("contents") + } + + override fun close() { + writer.close() + } +} + +class DirectoryLucene( + private val directory: Directory, + writerConfig: IndexWriterConfig = IndexWriterConfig(), + embeddings: Embeddings, + similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN +) : Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) { + override fun close() { + super.close() + directory.close() + } +} + +fun InMemoryLucene( + path: Path, + writerConfig: IndexWriterConfig = IndexWriterConfig(), + embeddings: Embeddings, + similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN +): DirectoryLucene = DirectoryLucene(MMapDirectory(path), writerConfig, embeddings, similarity) + +fun List.toFloatArray(): FloatArray = flatMap { it.data }.toFloatArray() diff --git a/settings.gradle.kts b/settings.gradle.kts index aa4358aca..77f5a46a5 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -24,4 +24,7 @@ include("langchain4k-scala") project(":langchain4k-scala").projectDir = file("scala") include("langchain4k-core") -project(":langchain4k-core").projectDir = file("core") \ No newline at end of file +project(":langchain4k-core").projectDir = file("core") + +include("langchain4k-lucene") +project(":langchain4k-lucene").projectDir = file("integrations/lucene") From 84200f6f394d5d169aa39a46daa624acd3c36bf7 Mon Sep 17 00:00:00 2001 From: Alejandro Serrano Date: Mon, 15 May 2023 14:05:23 +0200 Subject: [PATCH 3/3] More time for build --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0ccec99f4..460c7839e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,7 +13,7 @@ concurrency: jobs: check: runs-on: ubuntu-latest - timeout-minutes: 10 + timeout-minutes: 15 steps: - uses: actions/checkout@v3