|
| 1 | +package com.xebia.functional.xef.vectorstores |
| 2 | + |
| 3 | +import com.xebia.functional.xef.embeddings.Embedding |
| 4 | +import com.xebia.functional.xef.embeddings.Embeddings |
| 5 | +import com.xebia.functional.xef.llm.openai.EmbeddingModel |
| 6 | +import com.xebia.functional.xef.llm.openai.RequestConfig |
| 7 | +import java.nio.file.Path |
| 8 | +import org.apache.lucene.document.Document |
| 9 | +import org.apache.lucene.document.Field |
| 10 | +import org.apache.lucene.document.KnnFloatVectorField |
| 11 | +import org.apache.lucene.document.TextField |
| 12 | +import org.apache.lucene.index.* |
| 13 | +import org.apache.lucene.search.FuzzyQuery |
| 14 | +import org.apache.lucene.search.IndexSearcher |
| 15 | +import org.apache.lucene.search.KnnFloatVectorQuery |
| 16 | +import org.apache.lucene.search.Query |
| 17 | +import org.apache.lucene.store.Directory |
| 18 | +import org.apache.lucene.store.MMapDirectory |
| 19 | + |
/**
 * [VectorStore] backed by a Lucene index.
 *
 * Every text is indexed as one document with two fields: `contents` (the stored text) and
 * `embedding` (a KNN float vector built from what [embeddings] returns for that text).
 *
 * Before each search the current searcher is reopened from [writer] when the writer holds changes
 * the searcher does not see, so documents added through [addTexts] are searchable without
 * re-creating the store. Reopening only happens when the searcher's reader is a
 * [DirectoryReader]; any other reader is used as-is.
 *
 * @param writer index writer that receives the documents; it is closed by [close].
 * @param searcher initial searcher. Its reader is never closed by this class when supplied through
 *   this constructor; readers opened by this class itself are released on reopen and on [close].
 * @param embeddings produces the vectors stored alongside each text.
 * @param similarity similarity function recorded on the `embedding` vector field.
 */
open class Lucene(
  private val writer: IndexWriter,
  searcher: IndexSearcher,
  private val embeddings: Embeddings,
  private val similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : VectorStore, AutoCloseable {

  /**
   * Creates the store with a searcher over a reader opened from [writer]. That reader is owned by
   * this instance and released by [close].
   */
  constructor(
    writer: IndexWriter,
    embeddings: Embeddings,
    similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
  ) : this(writer, IndexSearcher(DirectoryReader.open(writer)), embeddings, similarity) {
    // The reader above was opened here, so this instance must release it.
    ownsReader = true
  }

  private val requestConfig =
    RequestConfig(EmbeddingModel.TextEmbeddingAda002, RequestConfig.Companion.User("user"))

  /** Guards [currentSearcher] and [ownsReader] while the searcher is swapped or acquired. */
  private val searcherLock = Any()

  /** Searcher used for the next query; replaced whenever [writer] has newer content. */
  private var currentSearcher: IndexSearcher = searcher

  /** True when the reader behind [currentSearcher] was opened by this class. */
  private var ownsReader: Boolean = false

  /** Embeds each text and adds it to the index as a `contents` + `embedding` document. */
  override suspend fun addTexts(texts: List<String>) {
    for (text in texts) {
      val vector = embeddings.embedQuery(text, requestConfig).toFloatArray()
      val doc =
        Document().apply {
          add(TextField("contents", text, Field.Store.YES))
          add(KnnFloatVectorField("embedding", vector, similarity))
        }
      writer.addDocument(doc)
    }
  }

  /**
   * Returns up to [limit] stored texts matching a [FuzzyQuery] built from [query] as a single
   * term on the `contents` field.
   *
   * NOTE(review): the whole query string is used as one term; if `contents` is tokenized by the
   * writer's analyzer, multi-word queries may not match anything — confirm this is intended.
   */
  override suspend fun similaritySearch(query: String, limit: Int): List<String> =
    search(FuzzyQuery(Term("contents", query)), limit)

  /** Returns up to [limit] stored texts whose `embedding` vectors are nearest to [embedding]. */
  override suspend fun similaritySearchByVector(embedding: Embedding, limit: Int): List<String> =
    search(KnnFloatVectorQuery("embedding", embedding.data.toFloatArray(), limit), limit)

  /**
   * Runs [q] on an up-to-date searcher and returns the stored `contents` of each hit. Hits whose
   * document has no stored `contents` value are skipped.
   */
  private fun search(q: Query, limit: Int): List<String> {
    val active = acquireSearcher()
    try {
      val stored = active.storedFields()
      return active.search(q, limit).scoreDocs.mapNotNull { hit ->
        stored.document(hit.doc).get("contents")
      }
    } finally {
      // Pairs with the incRef in acquireSearcher; lets a superseded reader close once idle.
      active.indexReader.decRef()
    }
  }

  /**
   * Returns the searcher to use for one query, reopening it first when [writer] has changes. The
   * returned searcher's reader has had its reference count incremented; the caller must decRef it.
   */
  private fun acquireSearcher(): IndexSearcher =
    synchronized(searcherLock) {
      val previous = currentSearcher
      val reader = previous.indexReader
      if (reader is DirectoryReader) {
        // openIfChanged returns null when the reader already reflects the writer's state.
        val reopened = DirectoryReader.openIfChanged(reader, writer)
        if (reopened != null) {
          // Only the scoring similarity is carried over to the replacement searcher.
          currentSearcher = IndexSearcher(reopened).also { it.similarity = previous.similarity }
          // Drop this class's reference to a reader it opened; caller-supplied readers are left alone.
          if (ownsReader) reader.decRef()
          ownsReader = true
        }
      }
      currentSearcher.also { it.indexReader.incRef() }
    }

  /** Releases the reader opened by this class (if any) and then closes [writer]. */
  override fun close() {
    try {
      synchronized(searcherLock) {
        if (ownsReader) {
          ownsReader = false
          currentSearcher.indexReader.decRef()
        }
      }
    } finally {
      // The writer is closed even if releasing the reader failed.
      writer.close()
    }
  }
}
| 62 | + |
/**
 * [Lucene] store that creates its own [IndexWriter] on [directory] and closes that directory
 * together with the store.
 *
 * @param directory Lucene directory holding the index; closed by [close].
 * @param writerConfig configuration passed to the [IndexWriter] created on [directory].
 * @param embeddings forwarded to [Lucene].
 * @param similarity forwarded to [Lucene].
 */
class DirectoryLucene(
  private val directory: Directory,
  writerConfig: IndexWriterConfig = IndexWriterConfig(),
  embeddings: Embeddings,
  similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
) : Lucene(IndexWriter(directory, writerConfig), embeddings, similarity) {
  /** Closes the underlying store first and then [directory], even if the former throws. */
  override fun close() {
    try {
      super.close()
    } finally {
      directory.close()
    }
  }
}
| 74 | + |
/**
 * Builds a [DirectoryLucene] on top of an [MMapDirectory] opened at [path].
 *
 * NOTE(review): despite the name, the directory is created from a file-system [path] — confirm
 * the naming matches the intended storage.
 *
 * @param path location handed to [MMapDirectory].
 * @param writerConfig configuration for the index writer created by [DirectoryLucene].
 * @param embeddings forwarded to [DirectoryLucene].
 * @param similarity forwarded to [DirectoryLucene].
 */
fun InMemoryLucene(
  path: Path,
  writerConfig: IndexWriterConfig = IndexWriterConfig(),
  embeddings: Embeddings,
  similarity: VectorSimilarityFunction = VectorSimilarityFunction.EUCLIDEAN
): DirectoryLucene {
  val mappedDirectory = MMapDirectory(path)
  return DirectoryLucene(
    directory = mappedDirectory,
    writerConfig = writerConfig,
    embeddings = embeddings,
    similarity = similarity
  )
}
| 81 | + |
/**
 * Concatenates the `data` of every [Embedding] in this list, in list order, into a single
 * [FloatArray].
 */
fun List<Embedding>.toFloatArray(): FloatArray {
  val merged = ArrayList<Float>()
  for (embedding in this) {
    merged.addAll(embedding.data)
  }
  return merged.toFloatArray()
}
0 commit comments