Skip to content

Commit 2937da5

Browse files
authored
Lucene as vector store (#59)
1 parent d2afde1 commit 2937da5

File tree

5 files changed

+110
-2
lines changed

5 files changed

+110
-2
lines changed

.github/workflows/build.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ concurrency:
1313
jobs:
1414
check:
1515
runs-on: ubuntu-latest
16-
timeout-minutes: 10
16+
timeout-minutes: 15
1717

1818
steps:
1919
- uses: actions/checkout@v3

gradle/libs.versions.toml

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ munitCatsEffect = "1.0.5"
3333
scrapeit = "1.1.5"
3434
jtokkit = "0.4.0"
3535
rssreader = "3.4.4"
36+
lucene = "9.6.0"
3637

3738
[libraries]
3839
arrow-core = { module = "io.arrow-kt:arrow-core", version.ref = "arrow" }
@@ -89,6 +90,7 @@ scrape-it-browser-fetcher = { module = "it.skrape:skrapeit-browser-fetcher", ver
8990
scrape-it-async-fetcher = { module = "it.skrape:skrapeit-asyn-fetcher", version.ref = "scrapeit" }
9091
jtokk-it = { module = "com.knuddels:jtokkit", version.ref = "jtokkit" }
9192
rss-reader = { module = "com.apptasticsoftware:rssreader", version.ref = "rssreader" }
93+
apache-lucene = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" }
9294

9395
[bundles]
9496
arrow = [

integrations/lucene/build.gradle.kts

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Build script for the Lucene integration module (registered as ":xef-lucene" in settings.gradle.kts).
// Plugins are applied by id, with the ids looked up from the `libs` version catalog.
plugins {
2+
id(libs.plugins.kotlin.jvm.get().pluginId)
3+
id(libs.plugins.kotlinx.serialization.get().pluginId)
4+
}
5+
6+
repositories {
7+
mavenCentral()
8+
}
9+
10+
// Java 11 is set three times: source level, target level and the toolchain used to compile.
java {
11+
sourceCompatibility = JavaVersion.VERSION_11
12+
targetCompatibility = JavaVersion.VERSION_11
13+
toolchain {
14+
languageVersion = JavaLanguageVersion.of(11)
15+
}
16+
}
17+
18+
// Both dependencies are `implementation` scoped: the xef core project and the
// `apache-lucene` catalog entry (org.apache.lucene:lucene-core).
dependencies {
19+
implementation(projects.xefCore)
20+
implementation(libs.apache.lucene)
21+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
package com.xebia.functional.xef.vectorstores
2+
3+
import com.xebia.functional.xef.embeddings.Embedding
4+
import com.xebia.functional.xef.embeddings.Embeddings
5+
import com.xebia.functional.xef.llm.openai.EmbeddingModel
6+
import com.xebia.functional.xef.llm.openai.RequestConfig
7+
import java.nio.file.Path
8+
import org.apache.lucene.document.Document
9+
import org.apache.lucene.document.Field
10+
import org.apache.lucene.document.KnnFloatVectorField
11+
import org.apache.lucene.document.TextField
12+
import org.apache.lucene.index.*
13+
import org.apache.lucene.search.FuzzyQuery
14+
import org.apache.lucene.search.IndexSearcher
15+
import org.apache.lucene.search.KnnFloatVectorQuery
16+
import org.apache.lucene.search.Query
17+
import org.apache.lucene.store.Directory
18+
import org.apache.lucene.store.MMapDirectory
19+
20+
/**
 * [VectorStore] backed by an Apache Lucene index.
 *
 * Every text is indexed as one document with two fields: `contents` (the analyzed and stored
 * text) and `embedding` (a KNN float vector built from the result of [Embeddings.embedQuery]).
 *
 * The searcher is refreshed from [writer] before every search. A near-real-time reader is a
 * point-in-time snapshot, so without the refresh documents added through [addTexts] after
 * construction would never be returned by the search methods.
 *
 * @param writer index writer that receives new documents; closed by [close].
 * @param searcher initial searcher. Its reader is closed by [close], or earlier when it is
 *   replaced by a refreshed reader.
 * @param embeddings client used to embed the texts being indexed.
 * @param similarity similarity function stored with every indexed vector.
 */
open class Lucene(
  private val writer: IndexWriter,
  searcher: IndexSearcher,
  private val embeddings: Embeddings,
  private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : VectorStore, AutoCloseable {

  /** Creates a store whose initial searcher reads a near-real-time reader opened on [writer]. */
  constructor(
    writer: IndexWriter,
    embeddings: Embeddings,
    similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
  ) : this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity)

  // Searcher over the most recent reader; replaced whenever the writer has newer changes.
  private var activeSearcher: IndexSearcher = searcher

  private val requestConfig =
    RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user"))

  /** Embeds each text and adds it to the index as a document with `contents` and `embedding`. */
  override suspend fun addTexts(texts: List<String>) =
    texts.forEach { text ->
      val embedding = embeddings.embedQuery(text, requestConfig)
      val doc =
        Document().apply {
          add(TextField("contents", text, Field.Store.YES))
          add(KnnFloatVectorField("embedding", embedding.toFloatArray(), similarity))
        }
      writer.addDocument(doc)
    }

  /**
   * Returns up to [limit] stored texts matching a fuzzy term query on the `contents` field.
   *
   * NOTE(review): the whole [query] string is used as a single term and no embedding is
   * computed here, so this is a lexical match rather than a vector search — confirm that this
   * is the intended contract of [VectorStore.similaritySearch].
   */
  override suspend fun similaritySearch(query: String, limit: Int): List<String> =
    search(FuzzyQuery(Term("contents", query)), limit)

  /** Returns the stored texts of the [limit] nearest neighbours of [embedding]. */
  override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List<String> =
    search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit)

  // Synchronized because a refresh closes the previous reader, which must not happen while
  // another search is still reading from it.
  @Synchronized
  private fun search(q: Query, limit: Int): List<String> {
    val current = refreshedSearcher()
    // One stored-fields accessor for all hits instead of a new one per hit.
    val storedFields = current.storedFields()
    return current.search(q, limit).scoreDocs.map { storedFields.document(it.doc).get("contents") }
  }

  /**
   * Returns a searcher that sees every document added to [writer] so far, reopening the
   * underlying reader only when the writer has changes the current reader does not include.
   */
  private fun refreshedSearcher(): IndexSearcher {
    val stale = activeSearcher
    // Only directory readers can be reopened against the writer; anything else is used as is.
    val staleReader = stale.indexReader as? DirectoryReader ?: return stale
    // openIfChanged returns null when the reader is already up to date.
    val freshReader = DirectoryReader.openIfChanged(staleReader, writer) ?: return stale
    val fresh = IndexSearcher(freshReader)
    // Carry over the scoring configuration of the searcher being replaced.
    fresh.similarity = stale.similarity
    activeSearcher = fresh
    staleReader.close()
    return fresh
  }

  /** Closes the current index reader and then the writer, even if closing the reader fails. */
  @Synchronized
  override fun close() {
    try {
      activeSearcher.indexReader.close()
    } finally {
      writer.close()
    }
  }
}
62+
63+
/**
 * [Lucene] store that creates its own [IndexWriter] on [directory] and owns that directory.
 *
 * @param directory Lucene directory holding the index; closed by [close].
 * @param writerConfig configuration for the writer created on [directory].
 * @param embeddings client used to embed the texts being indexed.
 * @param similarity similarity function stored with every indexed vector.
 */
class DirectoryLucene(
  private val directory: Directory,
  writerConfig: IndexWriterConfig = IndexWriterConfig(),
  embeddings: Embeddings,
  similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) {

  /** Closes the store and then the directory; the directory is closed even if the store fails to close. */
  override fun close() {
    try {
      super.close()
    } finally {
      // Without the finally, an exception from super.close() would leak the directory.
      directory.close()
    }
  }
}
74+
75+
/**
 * Builds a [DirectoryLucene] whose index lives in an [MMapDirectory] opened at [path].
 *
 * @param path location handed to [MMapDirectory].
 * @param writerConfig configuration for the index writer.
 * @param embeddings client used to embed the texts being indexed.
 * @param similarity similarity function stored with every indexed vector.
 */
fun InMemoryLucene(
  path: Path,
  writerConfig: IndexWriterConfig = IndexWriterConfig(),
  embeddings: Embeddings,
  similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
): DirectoryLucene {
  val mappedDirectory = MMapDirectory(path)
  return DirectoryLucene(
    directory = mappedDirectory,
    writerConfig = writerConfig,
    embeddings = embeddings,
    similarity = similarity
  )
}
81+
82+
/** Concatenates the `data` of every embedding, in list order, into one [FloatArray]. */
fun List<Embedding>.toFloatArray(): FloatArray {
  val values = mutableListOf<Float>()
  for (embedding in this) {
    values.addAll(embedding.data)
  }
  return values.toFloatArray()
}

settings.gradle.kts

+4-1
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,7 @@ include("xef-scala")
2626
project(":xef-scala").projectDir = file("scala")
2727

2828
include("xef-core")
29-
project(":xef-core").projectDir = file("core")
29+
project(":xef-core").projectDir = file("core")
30+
31+
include("xef-lucene")
32+
project(":xef-lucene").projectDir = file("integrations/lucene")

0 commit comments

Comments
 (0)