diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0ccec99f4..460c7839e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -13,7 +13,7 @@ concurrency:
 jobs:
   check:
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@v3
 
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index dfbd90e91..1c58feab4 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -33,6 +33,7 @@ munitCatsEffect = "1.0.5"
 scrapeit = "1.1.5"
 jtokkit = "0.4.0"
 rssreader = "3.4.4"
+lucene = "9.6.0"
 
 [libraries]
 arrow-core = { module = "io.arrow-kt:arrow-core", version.ref = "arrow" }
@@ -89,6 +90,7 @@ scrape-it-browser-fetcher = { module = "it.skrape:skrapeit-browser-fetcher", ver
 scrape-it-async-fetcher = { module = "it.skrape:skrapeit-asyn-fetcher", version.ref = "scrapeit" }
 jtokk-it = { module = "com.knuddels:jtokkit", version.ref = "jtokkit" }
 rss-reader = { module = "com.apptasticsoftware:rssreader", version.ref = "rssreader" }
+apache-lucene = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" }
 
 [bundles]
 arrow = [
diff --git a/integrations/lucene/build.gradle.kts b/integrations/lucene/build.gradle.kts
new file mode 100644
index 000000000..ac40c7f4a
--- /dev/null
+++ b/integrations/lucene/build.gradle.kts
@@ -0,0 +1,21 @@
+plugins {
+  id(libs.plugins.kotlin.jvm.get().pluginId)
+  id(libs.plugins.kotlinx.serialization.get().pluginId)
+}
+
+repositories {
+  mavenCentral()
+}
+
+java {
+  sourceCompatibility = JavaVersion.VERSION_11
+  targetCompatibility = JavaVersion.VERSION_11
+  toolchain {
+    languageVersion = JavaLanguageVersion.of(11)
+  }
+}
+
+dependencies {
+  implementation(projects.xefCore)
+  implementation(libs.apache.lucene)
+}
diff --git a/integrations/lucene/src/main/kotlin/com/xebia/functional/xef/vectorstores/Lucene.kt b/integrations/lucene/src/main/kotlin/com/xebia/functional/xef/vectorstores/Lucene.kt
new file mode 100644
index 000000000..225a3d003
--- /dev/null
+++ b/integrations/lucene/src/main/kotlin/com/xebia/functional/xef/vectorstores/Lucene.kt
@@ -0,0 +1,148 @@
+package com.xebia.functional.xef.vectorstores
+
+import com.xebia.functional.xef.embeddings.Embedding
+import com.xebia.functional.xef.embeddings.Embeddings
+import com.xebia.functional.xef.llm.openai.EmbeddingModel
+import com.xebia.functional.xef.llm.openai.RequestConfig
+import java.nio.file.Path
+import org.apache.lucene.document.Document
+import org.apache.lucene.document.Field
+import org.apache.lucene.document.KnnFloatVectorField
+import org.apache.lucene.document.TextField
+import org.apache.lucene.index.*
+import org.apache.lucene.search.FuzzyQuery
+import org.apache.lucene.search.IndexSearcher
+import org.apache.lucene.search.KnnFloatVectorQuery
+import org.apache.lucene.search.Query
+import org.apache.lucene.store.Directory
+import org.apache.lucene.store.MMapDirectory
+
+/**
+ * [VectorStore] that keeps texts and their embedding vectors in an Apache Lucene index.
+ *
+ * Every text is stored as one document with a `contents` text field and an `embedding` KNN
+ * vector field. The store takes ownership of [writer] and of the reader behind the supplied
+ * searcher: both are closed by [close].
+ *
+ * @param writer index writer that receives new documents.
+ * @param searcher initial searcher; it is replaced by a plain [IndexSearcher] on a fresh reader
+ *   whenever [writer] has changes its reader cannot see (only when it is a [DirectoryReader]).
+ * @param embeddings service used to turn texts into vectors.
+ * @param similarity similarity function stored with every vector field.
+ */
+open class Lucene(
+  private val writer: IndexWriter,
+  searcher: IndexSearcher,
+  private val embeddings: Embeddings,
+  private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
+) : VectorStore, AutoCloseable {
+
+  /** Builds the store with a searcher whose reader is opened directly on [writer]. */
+  constructor(
+    writer: IndexWriter,
+    embeddings: Embeddings,
+    similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
+  ) : this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity)
+
+  private val requestConfig =
+    RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user"))
+
+  // A reader is a point-in-time snapshot, so the searcher must be swapped after writes;
+  // otherwise documents added through addTexts would never show up in search results.
+  private var currentSearcher: IndexSearcher = searcher
+
+  /** Embeds each text and adds it to the index as a new document. */
+  override suspend fun addTexts(texts: List<String>) =
+    texts.forEach { text ->
+      val embedding = embeddings.embedQuery(text, requestConfig)
+      val doc =
+        Document().apply {
+          add(TextField("contents", text, Field.Store.YES))
+          add(KnnFloatVectorField("embedding", embedding.toFloatArray(), similarity))
+        }
+      writer.addDocument(doc)
+    }
+
+  /** Runs a fuzzy term query for [query] on `contents`; returns up to [limit] stored texts. */
+  override suspend fun similaritySearch(query: String, limit: Int): List<String> =
+    search(FuzzyQuery(Term("contents", query)), limit)
+
+  /** Returns the stored texts of up to [limit] nearest neighbours of [embedding]. */
+  override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List<String> =
+    search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit)
+
+  private fun search(q: Query, limit: Int): List<String> {
+    val searcher = acquireSearcher()
+    try {
+      val storedFields = searcher.storedFields()
+      return searcher.search(q, limit).scoreDocs.map {
+        storedFields.document(it.doc).get("contents")
+      }
+    } finally {
+      // Release the reference taken in acquireSearcher so replaced readers can be closed.
+      searcher.indexReader.decRef()
+    }
+  }
+
+  /**
+   * Returns a searcher that sees the writer's latest changes, with one extra reference held on
+   * its reader; the caller must release that reference with `decRef`.
+   */
+  @Synchronized
+  private fun acquireSearcher(): IndexSearcher {
+    val stale = currentSearcher.indexReader
+    if (stale is DirectoryReader) {
+      // openIfChanged returns null when the reader is already up to date with the writer.
+      DirectoryReader.openIfChanged(stale, writer)?.let { fresh ->
+        currentSearcher = IndexSearcher(fresh)
+        // Drop the store's own reference; the old reader closes once in-flight searches finish.
+        stale.decRef()
+      }
+    }
+    return currentSearcher.also { it.indexReader.incRef() }
+  }
+
+  /** Closes the current index reader and then the writer, even if closing the reader fails. */
+  @Synchronized
+  override fun close() {
+    try {
+      currentSearcher.indexReader.close()
+    } finally {
+      writer.close()
+    }
+  }
+}
+
+/**
+ * [Lucene] store that creates its own [IndexWriter] on [directory] and also closes the directory
+ * when the store is closed.
+ */
+class DirectoryLucene(
+  private val directory: Directory,
+  writerConfig: IndexWriterConfig = IndexWriterConfig(),
+  embeddings: Embeddings,
+  similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
+) : Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) {
+  override fun close() {
+    try {
+      super.close()
+    } finally {
+      // Release the directory even when closing the writer or reader throws.
+      directory.close()
+    }
+  }
+}
+
+/**
+ * Creates a [DirectoryLucene] on a memory-mapped directory at [path]. Despite the name, the
+ * index is kept in files under [path] ([MMapDirectory]); it is not a purely in-memory store.
+ */
+fun InMemoryLucene(
+  path: Path,
+  writerConfig: IndexWriterConfig = IndexWriterConfig(),
+  embeddings: Embeddings,
+  similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
+): DirectoryLucene = DirectoryLucene(MMapDirectory(path), writerConfig, embeddings, similarity)
+
+/** Concatenates the values of all embeddings in the list into a single [FloatArray]. */
+fun List<Embedding>.toFloatArray(): FloatArray = flatMap { it.data }.toFloatArray()
diff --git a/settings.gradle.kts b/settings.gradle.kts
index 0c7898b44..4ec8b4671 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -26,4 +26,7 @@ include("xef-scala")
 project(":xef-scala").projectDir = file("scala")
 
 include("xef-core")
-project(":xef-core").projectDir = file("core")
\ No newline at end of file
+project(":xef-core").projectDir = file("core")
+
+include("xef-lucene")
+project(":xef-lucene").projectDir = file("integrations/lucene")