diff --git a/docs/changelog/144938.yaml b/docs/changelog/144938.yaml new file mode 100644 index 0000000000000..972f6b0846748 --- /dev/null +++ b/docs/changelog/144938.yaml @@ -0,0 +1,5 @@ +area: Vector Search +issues: [] +pr: 144938 +summary: "[DiskBBQ] Fix index sorting on flush" +type: bug diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java index b4034875fe15f..a48aa63a4d6f7 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java @@ -197,7 +197,8 @@ public final void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { fieldWriter.fieldInfo, fieldWriter.delegate, maxDoc, - preconditionVectors(preconditioner) + preconditionVectors(preconditioner), + sortMap ); // build centroids @@ -257,17 +258,18 @@ private static KMeansFloatVectorValues getKMeansFloatVectorValues( FieldInfo fieldInfo, FlatFieldVectorsWriter fieldVectorsWriter, int maxDoc, - Consumer> vectorTransform + Consumer> vectorTransform, + Sorter.DocMap sortMap ) throws IOException { List vectors = fieldVectorsWriter.getVectors(); vectorTransform.accept(vectors); - if (vectors.size() == maxDoc) { + if (vectors.size() == maxDoc && sortMap == null) { return KMeansFloatVectorValues.build(vectors, null, fieldInfo.getVectorDimension()); } final DocIdSetIterator iterator = fieldVectorsWriter.getDocsWithFieldSet().iterator(); final int[] docIds = new int[vectors.size()]; for (int i = 0; i < docIds.length; i++) { - docIds[i] = iterator.nextDoc(); + docIds[i] = sortMap == null ? iterator.nextDoc() : sortMap.oldToNew(iterator.nextDoc()); } assert iterator.nextDoc() == NO_MORE_DOCS; return KMeansFloatVectorValues.build(vectors, docIds, fieldInfo.getVectorDimension()); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java index a69db9c38f79c..15a2a13deed40 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java @@ -17,6 +17,7 @@ import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FloatVectorValues; @@ -26,15 +27,22 @@ import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.AcceptDocs; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.index.codec.vectors.diskbbq.next.ESNextDiskBBQVectorsFormat; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.junit.AssumptionViolatedException; import org.junit.Before; @@ -296,6 +304,42 @@ public void testOneRepeatedVector() throws IOException { } } + public void testIndexSortOnFlush() throws IOException { + format = new ESNextDiskBBQVectorsFormat( + ESNextDiskBBQVectorsFormat.MIN_VECTORS_PER_CLUSTER, + ESNextDiskBBQVectorsFormat.MIN_CENTROIDS_PER_PARENT_CLUSTER + ); + IndexWriterConfig config = newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(format)) + .setIndexSort(new Sort(new SortField("sort", SortField.Type.STRING))) + .setMergePolicy(NoMergePolicy.INSTANCE); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, config)) { + float[] vectorA = new float[] { 0f, 3f }; + float[] vectorB = new float[] { 0f, 2f }; + float[] vectorC = new float[] { 0f, 1f }; + addSortedVectorDoc(w, "c", vectorC); + addSortedVectorDoc(w, "a", vectorA); + addSortedVectorDoc(w, "b", vectorB); + w.commit(); + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + + // we might collect the same document twice because of soar assignments + KnnCollector collector = new TopKnnCollector(3, Integer.MAX_VALUE); + leafReader.searchNearestVectors( + "f", + vectorA, + collector, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()) + ); + TopDocs topDocs = collector.topDocs(); + assertEquals(3, topDocs.scoreDocs.length); + assertEquals(0, topDocs.scoreDocs[0].doc); + assertEquals(1, topDocs.scoreDocs[1].doc); + assertEquals(2, topDocs.scoreDocs[2].doc); + } + } + } + // this is a modified version of lucene's TestSearchWithThreads test case public void testWithThreads() throws Exception { final int numThreads = random().nextInt(2, 5); @@ -346,4 +390,11 @@ public void testWithThreads() throws Exception { } } } + + private static void addSortedVectorDoc(IndexWriter writer, String id, float[] vector) throws IOException { + Document doc = new Document(); + doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new SortedDocValuesField("sort", new BytesRef(id))); + writer.addDocument(doc); + } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java index 018d9a97946a7..0e10196b40d63 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java @@ -19,18 +19,23 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; @@ -38,6 +43,7 @@ import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.index.codec.vectors.diskbbq.next.ESNextDiskBBQVectorsFormat; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.search.vectors.IVFKnnSearchStrategy; import org.junit.Before; @@ -356,6 +362,42 @@ public void testRestrictiveFilterSparse() throws IOException { doRestrictiveFilter(false); } + public void testIndexSortOnFlush() throws IOException { + format = new ESNextDiskBBQVectorsFormat( + ESNextDiskBBQVectorsFormat.MIN_VECTORS_PER_CLUSTER, + ESNextDiskBBQVectorsFormat.MIN_CENTROIDS_PER_PARENT_CLUSTER + ); + IndexWriterConfig config = newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(format)) + .setIndexSort(new Sort(new SortField("sort", SortField.Type.STRING))) + .setMergePolicy(NoMergePolicy.INSTANCE); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, config)) { + float[] vectorA = new float[] { 0f, 3f }; + float[] vectorB = new float[] { 0f, 2f }; + float[] vectorC = new float[] { 0f, 1f }; + addSortedVectorDoc(w, "c", vectorC); + addSortedVectorDoc(w, "a", vectorA); + addSortedVectorDoc(w, "b", vectorB); + w.commit(); + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + + // we might collect the same document twice because of soar assignments + KnnCollector collector = new TopKnnCollector(3, Integer.MAX_VALUE); + leafReader.searchNearestVectors( + "f", + vectorA, + collector, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()) + ); + TopDocs topDocs = collector.topDocs(); + assertEquals(3, topDocs.scoreDocs.length); + assertEquals(0, topDocs.scoreDocs[0].doc); + assertEquals(1, topDocs.scoreDocs[1].doc); + assertEquals(2, topDocs.scoreDocs[2].doc); + } + } + } + private void doRestrictiveFilter(boolean dense) throws IOException { int dimensions = random().nextInt(12, 500); int maxMatchingDocs = random().nextInt(1, 10); @@ -435,4 +477,11 @@ private void doRestrictiveFilter(boolean dense) throws IOException { } } } + + private static void addSortedVectorDoc(IndexWriter writer, String id, float[] vector) throws IOException { + Document doc = new Document(); + doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new SortedDocValuesField("sort", new BytesRef(id))); + writer.addDocument(doc); + } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java index c7b4dcb299ddf..b3ba1c6105b11 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java @@ -19,18 +19,23 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; @@ -356,6 +361,39 @@ public void testRestrictiveFilterSparse() throws IOException { doRestrictiveFilter(false); } + public void testIndexSortOnFlush() throws IOException { + format = new ESNextDiskBBQVectorsFormat(MIN_VECTORS_PER_CLUSTER, MIN_CENTROIDS_PER_PARENT_CLUSTER); + IndexWriterConfig config = newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(format)) + .setIndexSort(new Sort(new SortField("sort", SortField.Type.STRING))) + .setMergePolicy(NoMergePolicy.INSTANCE); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, config)) { + float[] vectorA = new float[] { 0f, 3f }; + float[] vectorB = new float[] { 0f, 2f }; + float[] vectorC = new float[] { 0f, 1f }; + addSortedVectorDoc(w, "c", vectorC); + addSortedVectorDoc(w, "a", vectorA); + addSortedVectorDoc(w, "b", vectorB); + w.commit(); + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + + // we might collect the same document twice because of soar assignments + KnnCollector collector = new TopKnnCollector(3, Integer.MAX_VALUE); + leafReader.searchNearestVectors( + "f", + vectorA, + collector, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()) + ); + TopDocs topDocs = collector.topDocs(); + assertEquals(3, topDocs.scoreDocs.length); + assertEquals(0, topDocs.scoreDocs[0].doc); + assertEquals(1, topDocs.scoreDocs[1].doc); + assertEquals(2, topDocs.scoreDocs[2].doc); + } + } + } + private void doRestrictiveFilter(boolean dense) throws IOException { int dimensions = random().nextInt(12, 500); int maxMatchingDocs = random().nextInt(1, 10); @@ -435,4 +473,11 @@ private void doRestrictiveFilter(boolean dense) throws IOException { } } } + + private static void addSortedVectorDoc(IndexWriter writer, String id, float[] vector) throws IOException { + Document doc = new Document(); + doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new SortedDocValuesField("sort", new BytesRef(id))); + writer.addDocument(doc); + } }