From 1a4bca072b8d1fea1f2536c323e2fdad50b4514e Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 26 Mar 2026 07:48:12 +0000 Subject: [PATCH] [9.2][DiskBBQ] Fix index sorting on flush --- docs/changelog/144938.yaml | 5 ++ .../vectors/diskbbq/IVFVectorsWriter.java | 9 ++-- .../ES920DiskBBQVectorsFormatTests.java | 47 +++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 docs/changelog/144938.yaml diff --git a/docs/changelog/144938.yaml b/docs/changelog/144938.yaml new file mode 100644 index 0000000000000..972f6b0846748 --- /dev/null +++ b/docs/changelog/144938.yaml @@ -0,0 +1,5 @@ +area: Vector Search +issues: [] +pr: 144938 +summary: "[DiskBBQ] Fix index sorting on flush" +type: bug diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java index b01011bc6f8e3..fa4b774d03ac7 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java @@ -179,7 +179,7 @@ public final void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { } final float[] globalCentroid = new float[fieldWriter.fieldInfo.getVectorDimension()]; // build a float vector values with random access - final FloatVectorValues floatVectorValues = getFloatVectorValues(fieldWriter.fieldInfo, fieldWriter.delegate, maxDoc); + final FloatVectorValues floatVectorValues = getFloatVectorValues(fieldWriter.fieldInfo, fieldWriter.delegate, maxDoc, sortMap); // build centroids final CentroidAssignments centroidAssignments = calculateCentroids(fieldWriter.fieldInfo, floatVectorValues, globalCentroid); // wrap centroids with a supplier @@ -216,16 +216,17 @@ public final void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { private static FloatVectorValues getFloatVectorValues( FieldInfo fieldInfo, FlatFieldVectorsWriter fieldVectorsWriter, - int maxDoc + int maxDoc, + Sorter.DocMap sortMap ) throws IOException { List vectors = fieldVectorsWriter.getVectors(); - if (vectors.size() == maxDoc) { + if (vectors.size() == maxDoc && sortMap == null) { return FloatVectorValues.fromFloats(vectors, fieldInfo.getVectorDimension()); } final DocIdSetIterator iterator = fieldVectorsWriter.getDocsWithFieldSet().iterator(); final int[] docIds = new int[vectors.size()]; for (int i = 0; i < docIds.length; i++) { - docIds[i] = iterator.nextDoc(); + docIds[i] = sortMap == null ? iterator.nextDoc() : sortMap.oldToNew(iterator.nextDoc()); } assert iterator.nextDoc() == NO_MORE_DOCS; return new FloatVectorValues() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java index 0507b49968fcb..b8db04a6c628a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java @@ -17,19 +17,27 @@ import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.AcceptDocs; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.LogConfigurator; import org.junit.Before; @@ -290,4 +298,43 @@ public void testWithThreads() throws Exception { } } } + + public void testIndexSortOnFlush() throws IOException { + IndexWriterConfig config = newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(format)) + .setIndexSort(new Sort(new SortField("sort", SortField.Type.STRING))) + .setMergePolicy(NoMergePolicy.INSTANCE); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, config)) { + float[] vectorA = new float[] { 0f, 3f }; + float[] vectorB = new float[] { 0f, 2f }; + float[] vectorC = new float[] { 0f, 1f }; + addSortedVectorDoc(w, "c", vectorC); + addSortedVectorDoc(w, "a", vectorA); + addSortedVectorDoc(w, "b", vectorB); + w.commit(); + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + + // we might collect the same document twice because of soar assignments + KnnCollector collector = new TopKnnCollector(3, Integer.MAX_VALUE); + leafReader.searchNearestVectors( + "f", + vectorA, + collector, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()) + ); + TopDocs topDocs = collector.topDocs(); + assertEquals(3, topDocs.scoreDocs.length); + assertEquals(0, topDocs.scoreDocs[0].doc); + assertEquals(1, topDocs.scoreDocs[1].doc); + assertEquals(2, topDocs.scoreDocs[2].doc); + } + } + } + + private static void addSortedVectorDoc(IndexWriter writer, String id, float[] vector) throws IOException { + Document doc = new Document(); + doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new SortedDocValuesField("sort", new BytesRef(id))); + writer.addDocument(doc); + } }