@@ -6,32 +6,22 @@ import com.xebia.functional.xef.embeddings.Embedding
6
6
import com.xebia.functional.xef.embeddings.Embeddings
7
7
import com.xebia.functional.xef.llm.openai.EmbeddingModel
8
8
import com.xebia.functional.xef.llm.openai.RequestConfig
9
+ import org.apache.lucene.analysis.standard.StandardAnalyzer
10
+ import org.apache.lucene.document.*
9
11
import java.nio.file.Path
10
- import org.apache.lucene.document.Document
11
- import org.apache.lucene.document.Field
12
- import org.apache.lucene.document.KnnFloatVectorField
13
- import org.apache.lucene.document.TextField
14
12
import org.apache.lucene.index.*
15
- import org.apache.lucene.search.FuzzyQuery
16
- import org.apache.lucene.search.IndexSearcher
17
- import org.apache.lucene.search.KnnFloatVectorQuery
18
- import org.apache.lucene.search.Query
13
+ import org.apache.lucene.queries.mlt.MoreLikeThis
14
+ import org.apache.lucene.search.*
19
15
import org.apache.lucene.store.Directory
20
16
import org.apache.lucene.store.MMapDirectory
17
+ import java.io.StringReader
21
18
22
19
open class Lucene (
23
20
private val writer : IndexWriter ,
24
- private val searcher : IndexSearcher ,
25
21
private val embeddings : Embeddings ? ,
26
22
private val similarity : VectorSimilarityFunction = VectorSimilarityFunction .EUCLIDEAN
27
23
) : VectorStore, AutoCloseable {
28
24
29
- constructor (
30
- writer: IndexWriter ,
31
- embeddings: Embeddings ? ,
32
- similarity: VectorSimilarityFunction = VectorSimilarityFunction .EUCLIDEAN
33
- ) : this (writer, IndexSearcher (DirectoryReader .open(writer)), embeddings, similarity)
34
-
35
25
// Request configuration built once per instance: the TextEmbeddingAda002 embedding model
// together with a fixed user value.
private val requestConfig =
36
26
RequestConfig (EmbeddingModel .TextEmbeddingAda002 , RequestConfig .Companion .User (" user" ))
37
27
@@ -48,16 +38,27 @@ open class Lucene(
48
38
writer.commit()
49
39
}
50
40
51
/**
 * Returns the stored contents of up to [limit] documents that Lucene's MoreLikeThis
 * considers similar to [query].
 *
 * The query text is analyzed with a [StandardAnalyzer]; terms shorter than 3 characters
 * are ignored, and a term only needs to occur once in the query and in one document.
 *
 * @param query free text to find similar documents for
 * @param limit maximum number of hits to return
 * @return the matched documents' contents, in the order Lucene ranked them
 */
override suspend fun similaritySearch(query: String, limit: Int): List<String> =
  // A fresh reader is opened from the writer on every call; `use` guarantees it is closed
  // again, which the previous version never did (one leaked reader per query).
  DirectoryReader.open(writer).use { reader ->
    StandardAnalyzer().use { standardAnalyzer ->
      val mlt = MoreLikeThis(reader)
      mlt.analyzer = standardAnalyzer
      mlt.minTermFreq = 1
      mlt.minDocFreq = 1
      mlt.minWordLen = 3
      val luceneQuery = mlt.like(" contents", StringReader(query))
      // One searcher serves both the search and the stored-field lookup.
      val searcher = IndexSearcher(reader)
      // `extract` copies the field values into a List before the reader is closed.
      searcher.search(luceneQuery, limit).extract(searcher)
    }
  }
53
52
54
53
/**
 * Returns the stored contents of up to [limit] documents whose vector field is nearest
 * to [embedding], using a k-nearest-neighbour float vector query.
 *
 * @param embedding query vector; its data is converted to a float array for Lucene
 * @param limit number of neighbours requested and maximum number of hits returned
 * @return the matched documents' contents, in the order Lucene ranked them
 * @throws IllegalArgumentException if this store was created without embeddings
 */
override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List<String> {
  requireNotNull(embeddings) { " no embeddings were computed for this model" }
  val luceneQuery = KnnFloatVectorQuery(" embedding", embedding.data.toFloatArray(), limit)
  // The reader opened from the writer must be closed after the search; previously it leaked.
  // `extract` materialises the result list before `use` closes the reader.
  return DirectoryReader.open(writer).use { reader ->
    val searcher = IndexSearcher(reader)
    searcher.search(luceneQuery, limit).extract(searcher)
  }
}
58
59
59
/**
 * Resolves every hit in this [TopDocs] to the value of its stored contents field,
 * keeping the order of `scoreDocs`.
 *
 * @param searcher the searcher that produced these hits; used to load stored fields
 * @return one entry per hit
 */
fun TopDocs.extract(searcher: IndexSearcher): List<String> {
  // Acquire the stored-fields accessor once instead of once per hit.
  val storedFields = searcher.storedFields()
  return scoreDocs.map { hit -> storedFields.document(hit.doc).get(" contents") }
}
63
64
0 commit comments