Skip to content

Commit 8118c14

Browse files
authored
Improve Lucene similarity search (#108)
1 parent 0970f12 commit 8118c14

File tree

3 files changed

+25
-22
lines changed

3 files changed

+25
-22
lines changed

gradle/libs.versions.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ skrape = { module = "it.skrape:skrapeit", version.ref = "scrapeit" }
9696
skrape-browser-fetcher = { module = "it.skrape:skrapeit-browser-fetcher", version.ref = "scrapeit" }
9797
skrape-async-fetcher = { module = "it.skrape:skrapeit-asyn-fetcher", version.ref = "scrapeit" }
9898
rss-reader = { module = "com.apptasticsoftware:rssreader", version.ref = "rssreader" }
99-
apache-lucene = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" }
99+
lucene-core = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" }
100+
lucene-queries = { module = "org.apache.lucene:lucene-queries", version.ref = "lucene" }
100101
assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" }
101102
apache-pdf-box = { module = "org.apache.pdfbox:pdfbox", version.ref = "pdfbox" }
102103
jdbc-mysql-connector = { module = "mysql:mysql-connector-java", version.ref = "mysql" }

integrations/lucene/build.gradle.kts

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ java {
1919

2020
// Dependency block of the Lucene integration module as shown in this diff hunk.
dependencies {
2121
implementation(projects.xefCore)
22-
// Removed line: the single `libs.apache.lucene` catalog alias.
api(libs.apache.lucene)
22+
// Added lines: the alias is split into `lucene.core` and `lucene.queries`, matching the
// `lucene-core` / `lucene-queries` entries added to gradle/libs.versions.toml in this commit.
api(libs.lucene.core)
23+
// NOTE(review): presumably this artifact supplies org.apache.lucene.queries.mlt.MoreLikeThis,
// which Lucene.kt starts importing in the same commit — confirm.
api(libs.lucene.queries)
2324
}
2425

2526
tasks.withType<AbstractPublishToMaven> {

integrations/lucene/src/main/kotlin/com/xebia/functional/xef/vectorstores/Lucene.kt

+21-20
Original file line numberDiff line numberDiff line change
@@ -6,32 +6,22 @@ import com.xebia.functional.xef.embeddings.Embedding
66
import com.xebia.functional.xef.embeddings.Embeddings
77
import com.xebia.functional.xef.llm.openai.EmbeddingModel
88
import com.xebia.functional.xef.llm.openai.RequestConfig
9+
import org.apache.lucene.analysis.standard.StandardAnalyzer
10+
import org.apache.lucene.document.*
911
import java.nio.file.Path
10-
import org.apache.lucene.document.Document
11-
import org.apache.lucene.document.Field
12-
import org.apache.lucene.document.KnnFloatVectorField
13-
import org.apache.lucene.document.TextField
1412
import org.apache.lucene.index.*
15-
import org.apache.lucene.search.FuzzyQuery
16-
import org.apache.lucene.search.IndexSearcher
17-
import org.apache.lucene.search.KnnFloatVectorQuery
18-
import org.apache.lucene.search.Query
13+
import org.apache.lucene.queries.mlt.MoreLikeThis
14+
import org.apache.lucene.search.*
1915
import org.apache.lucene.store.Directory
2016
import org.apache.lucene.store.MMapDirectory
17+
import java.io.StringReader
2118

2219
open class Lucene(
2320
private val writer: IndexWriter,
24-
private val searcher: IndexSearcher,
2521
private val embeddings: Embeddings?,
2622
private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
2723
) : VectorStore, AutoCloseable {
2824

29-
constructor(
30-
writer: IndexWriter,
31-
embeddings: Embeddings?,
32-
similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
33-
) : this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity)
34-
3525
private val requestConfig =
3626
RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user"))
3727

@@ -48,16 +38,27 @@ open class Lucene(
4838
writer.commit()
4939
}
5040

51-
override suspend fun similaritySearch(query: String, limit: Int): List<String> =
52-
search(FuzzyQuery(Term("contents", query)), limit)
41+
/**
 * Text similarity search as added by this commit (replaces the FuzzyQuery-based version).
 *
 * Builds a MoreLikeThis query from [query] against the "contents" field, runs it with
 * [limit] as the maximum number of hits, and returns what `extract` yields for those hits.
 */
override suspend fun similaritySearch(query: String, limit: Int): List<String> {
42+
// A reader is opened from the writer on every call.
// NOTE(review): nothing in this method closes the reader — confirm whether that is intended.
val reader = DirectoryReader.open(writer)
43+
val mlt = MoreLikeThis(reader)
44+
// Analyzer is set on the MoreLikeThis instance before like(...) is called with raw text.
mlt.analyzer = StandardAnalyzer()
45+
// Minimum term frequency, document frequency and word length are set explicitly.
mlt.minTermFreq = 1
46+
mlt.minDocFreq = 1
47+
mlt.minWordLen = 3
48+
val luceneQuery = mlt.like("contents", StringReader(query))
49+
val searcher = IndexSearcher(reader)
50+
// NOTE(review): a second IndexSearcher(reader) is constructed here for the search, while the
// hits are resolved through `searcher` declared above; both wrap the same reader, so
// `searcher.search(...)` looks like what was meant — confirm.
return IndexSearcher(reader).search(luceneQuery, limit).extract(searcher)
51+
}
5352

5453
/**
 * Vector similarity search: runs a KnnFloatVectorQuery on the "embedding" field using
 * [embedding]'s data converted to a float array, with [limit] passed both to the query and
 * to the search call, and returns what `extract` yields for the hits.
 *
 * Fails via requireNotNull (IllegalArgumentException) when the store was created without
 * `embeddings`.
 */
override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List<String> {
5554
requireNotNull(embeddings) { "no embeddings were computed for this model" }
56-
// Removed line: previously delegated to the private search(...) helper and the shared searcher.
return search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit)
55+
// Added lines: the query and the searcher are now built locally on each call.
val luceneQuery = KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit)
56+
// NOTE(review): the DirectoryReader opened here is not closed anywhere in the visible code —
// confirm whether that is intended.
val searcher = IndexSearcher(DirectoryReader.open(writer))
57+
return searcher.search(luceneQuery, limit).extract(searcher)
5758
}
5859

59-
private fun search(q: Query, limit: Int): List<String> =
60-
searcher.search(q, limit).scoreDocs.map {
60+
/**
 * Maps every hit in this [TopDocs] to the value of the stored "contents" field of its
 * document, looked up through [searcher].
 *
 * NOTE(review): declared without a visibility modifier, whereas the `search` helper it
 * replaces in this diff was `private` — confirm the wider visibility is intended.
 */
fun TopDocs.extract(searcher: IndexSearcher): List<String> =
61+
scoreDocs.map {
6162
// `it.doc` is the hit's document id; the stored document is loaded and its "contents" read.
searcher.storedFields().document(it.doc).get("contents")
6263
}
6364

0 commit comments

Comments
 (0)