diff --git a/docs/changelog/145076.yaml b/docs/changelog/145076.yaml new file mode 100644 index 0000000000000..cbd416e06d4e5 --- /dev/null +++ b/docs/changelog/145076.yaml @@ -0,0 +1,5 @@ +area: Vector Search +issues: [] +pr: 145076 +summary: "[DiskBBQ] Fix index sorting on flush (2nd attempt)" +type: bug diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java index a48aa63a4d6f7..6dbf8a3a502e9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java @@ -14,6 +14,7 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; @@ -37,6 +38,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.util.AbstractList; import java.util.ArrayList; import java.util.List; import java.util.function.Consumer; @@ -265,14 +267,38 @@ private static KMeansFloatVectorValues getKMeansFloatVectorValues( vectorTransform.accept(vectors); if (vectors.size() == maxDoc && sortMap == null) { return KMeansFloatVectorValues.build(vectors, null, fieldInfo.getVectorDimension()); + } else if (sortMap == null) { + final DocIdSetIterator iterator = fieldVectorsWriter.getDocsWithFieldSet().iterator(); + final int[] docIds = new int[vectors.size()]; + for (int i = 0; i < docIds.length; i++) { + docIds[i] = iterator.nextDoc(); + } + assert iterator.nextDoc() == NO_MORE_DOCS; + return KMeansFloatVectorValues.build(vectors, docIds, fieldInfo.getVectorDimension()); + } else { + DocsWithFieldSet newDocsWithField = new DocsWithFieldSet(); + final int[] ordMap = new int[fieldVectorsWriter.getDocsWithFieldSet().cardinality()]; // new ord to old ord + KnnVectorsWriter.mapOldOrdToNewOrd(fieldVectorsWriter.getDocsWithFieldSet(), sortMap, null, ordMap, newDocsWithField); + final DocIdSetIterator iterator = newDocsWithField.iterator(); + final int[] docIds = new int[vectors.size()]; + for (int i = 0; i < docIds.length; i++) { + docIds[i] = iterator.nextDoc(); + } + assert iterator.nextDoc() == NO_MORE_DOCS; + List orderedVectors = new AbstractList<>() { + + @Override + public int size() { + return vectors.size(); + } + + @Override + public float[] get(int index) { + return vectors.get(ordMap[index]); + } + }; + return KMeansFloatVectorValues.build(orderedVectors, docIds, fieldInfo.getVectorDimension()); } - final DocIdSetIterator iterator = fieldVectorsWriter.getDocsWithFieldSet().iterator(); - final int[] docIds = new int[vectors.size()]; - for (int i = 0; i < docIds.length; i++) { - docIds[i] = sortMap == null ? iterator.nextDoc() : sortMap.oldToNew(iterator.nextDoc()); - } - assert iterator.nextDoc() == NO_MORE_DOCS; - return KMeansFloatVectorValues.build(vectors, docIds, fieldInfo.getVectorDimension()); } /** diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java index b5962c642c31c..09778cb78c04a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/es94/ES940DiskBBQVectorsFormatTests.java @@ -17,9 +17,9 @@ import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StringField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -34,9 +34,11 @@ import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; @@ -398,25 +400,37 @@ private void doRestrictiveFilter(boolean dense) throws IOException { int maxMatchingDocs = random().nextInt(1, 10); int matchingDocs = 0; int numDocs = random().nextInt(100, 3_000); - try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + IndexWriterConfig iwc = newIndexWriterConfig(); + if (random().nextBoolean()) { + iwc.setIndexSort(new Sort(new SortField("k", SortField.Type.STRING))); + } + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, iwc)) { for (int i = 0; i < numDocs; i++) { Document doc = new Document(); if (dense || rarely() == false) { float[] vector = randomVector(dimensions); doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); } - doc.add(new KeywordField("k", new BytesRef("B"), Field.Store.NO)); + if (random().nextBoolean()) { + doc.add(new StringField("k", new BytesRef("A"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("A"))); + } else { + doc.add(new StringField("k", new BytesRef("C"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("C"))); + } w.addDocument(doc); if (matchingDocs < maxMatchingDocs && rarely()) { matchingDocs++; doc = new Document(); doc.add(new KnnFloatVectorField("f", randomVector(dimensions), VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new KeywordField("k", new BytesRef("A"), Field.Store.NO)); + doc.add(new StringField("k", new BytesRef("B"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("B"))); w.addDocument(doc); } if (dense == false && rarely()) { doc = new Document(); - doc.add(new KeywordField("k", new BytesRef("A"), Field.Store.NO)); + doc.add(new StringField("k", new BytesRef("B"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("B"))); w.addDocument(doc); } } @@ -426,49 +440,69 @@ private void doRestrictiveFilter(boolean dense) throws IOException { float[] vector = randomVector(dimensions); Document doc = new Document(); doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new KeywordField("k", new BytesRef("A"), Field.Store.NO)); + doc.add(new StringField("k", new BytesRef("B"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("B"))); w.addDocument(doc); } w.commit(); - // force one leave - w.forceMerge(1); + if (random().nextBoolean()) { + // force one leave + w.forceMerge(1); + } try (IndexReader reader = DirectoryReader.open(w)) { - LeafReader leafReader = getOnlyLeafReader(reader); - float[] vector = randomVector(dimensions); - // we might collect the same document twice because of soar assignments - KnnCollector collector; - if (random().nextBoolean()) { - collector = new TopKnnCollector(random().nextInt(2 * matchingDocs, 3 * matchingDocs), Integer.MAX_VALUE); - } else { - collector = new TopKnnCollector( - random().nextInt(2 * matchingDocs, 3 * matchingDocs), - Integer.MAX_VALUE, - new IVFKnnSearchStrategy(0.25f, null) + TopDocs[] topDocsArray = new TopDocs[reader.leaves().size()]; + for (int i = 0; i < reader.leaves().size(); i++) { + LeafReaderContext context = reader.leaves().get(i); + LeafReader leafReader = context.reader(); + float[] vector = randomVector(dimensions); + // we might collect the same document twice because of soar assignments + KnnCollector collector; + if (random().nextBoolean()) { + collector = new TopKnnCollector(random().nextInt(2 * matchingDocs, 3 * matchingDocs), Integer.MAX_VALUE); + } else { + collector = new TopKnnCollector( + random().nextInt(2 * matchingDocs, 3 * matchingDocs), + Integer.MAX_VALUE, + new IVFKnnSearchStrategy(0.25f, null) + ); + } + if (leafReader.postings(new Term("k", new BytesRef("B"))) == null) { + topDocsArray[i] = TopDocsCollector.EMPTY_TOPDOCS; + continue; + } + leafReader.searchNearestVectors( + "f", + vector, + collector, + AcceptDocs.fromIteratorSupplier( + () -> leafReader.postings(new Term("k", new BytesRef("B"))), + leafReader.getLiveDocs(), + leafReader.maxDoc() + ) + ); + TopDocs leafTopDocs = collector.topDocs(); + ScoreDoc[] adjusted = new ScoreDoc[leafTopDocs.scoreDocs.length]; + for (int docIndex = 0; docIndex < leafTopDocs.scoreDocs.length; docIndex++) { + ScoreDoc scoreDoc = leafTopDocs.scoreDocs[docIndex]; + adjusted[docIndex] = new ScoreDoc(scoreDoc.doc + context.docBase, scoreDoc.score); + } + topDocsArray[i] = new TopDocs(leafTopDocs.totalHits, adjusted); + // match no docs + leafReader.searchNearestVectors( + "f", + vector, + new TopKnnCollector(2, Integer.MAX_VALUE), + AcceptDocs.fromIteratorSupplier(DocIdSetIterator::empty, leafReader.getLiveDocs(), leafReader.maxDoc()) ); } - leafReader.searchNearestVectors( - "f", - vector, - collector, - AcceptDocs.fromIteratorSupplier( - () -> leafReader.postings(new Term("k", new BytesRef("A"))), - leafReader.getLiveDocs(), - leafReader.maxDoc() - ) - ); - TopDocs topDocs = collector.topDocs(); + TopDocs topDocs = TopDocs.merge(2 * maxMatchingDocs, topDocsArray); Set uniqueDocIds = new HashSet<>(); for (int i = 0; i < topDocs.scoreDocs.length; i++) { uniqueDocIds.add(topDocs.scoreDocs[i].doc); + Document document = reader.storedFields().document(topDocs.scoreDocs[i].doc); + assertThat(document.getField("k").binaryValue().utf8ToString(), equalTo("B")); } assertEquals(matchingDocs, uniqueDocIds.size()); - // match no docs - leafReader.searchNearestVectors( - "f", - vector, - new TopKnnCollector(2, Integer.MAX_VALUE), - AcceptDocs.fromIteratorSupplier(DocIdSetIterator::empty, leafReader.getLiveDocs(), leafReader.maxDoc()) - ); } } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java index 548444047556a..6c3668177c0b9 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/next/ESNextDiskBBQVectorsFormatTests.java @@ -17,9 +17,9 @@ import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StringField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -34,9 +34,11 @@ import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; @@ -398,25 +400,37 @@ private void doRestrictiveFilter(boolean dense) throws IOException { int maxMatchingDocs = random().nextInt(1, 10); int matchingDocs = 0; int numDocs = random().nextInt(100, 3_000); - try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + IndexWriterConfig iwc = newIndexWriterConfig(); + if (random().nextBoolean()) { + iwc.setIndexSort(new Sort(new SortField("k", SortField.Type.STRING))); + } + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, iwc)) { for (int i = 0; i < numDocs; i++) { Document doc = new Document(); if (dense || rarely() == false) { float[] vector = randomVector(dimensions); doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); } - doc.add(new KeywordField("k", new BytesRef("B"), Field.Store.NO)); + if (random().nextBoolean()) { + doc.add(new StringField("k", new BytesRef("A"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("A"))); + } else { + doc.add(new StringField("k", new BytesRef("C"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("C"))); + } w.addDocument(doc); if (matchingDocs < maxMatchingDocs && rarely()) { matchingDocs++; doc = new Document(); doc.add(new KnnFloatVectorField("f", randomVector(dimensions), VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new KeywordField("k", new BytesRef("A"), Field.Store.NO)); + doc.add(new StringField("k", new BytesRef("B"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("B"))); w.addDocument(doc); } if (dense == false && rarely()) { doc = new Document(); - doc.add(new KeywordField("k", new BytesRef("A"), Field.Store.NO)); + doc.add(new StringField("k", new BytesRef("B"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("B"))); w.addDocument(doc); } } @@ -426,49 +440,69 @@ private void doRestrictiveFilter(boolean dense) throws IOException { float[] vector = randomVector(dimensions); Document doc = new Document(); doc.add(new KnnFloatVectorField("f", vector, VectorSimilarityFunction.EUCLIDEAN)); - doc.add(new KeywordField("k", new BytesRef("A"), Field.Store.NO)); + doc.add(new StringField("k", new BytesRef("B"), Field.Store.YES)); + doc.add(new SortedDocValuesField("k", new BytesRef("B"))); w.addDocument(doc); } w.commit(); - // force one leave - w.forceMerge(1); + if (random().nextBoolean()) { + // force one leave + w.forceMerge(1); + } try (IndexReader reader = DirectoryReader.open(w)) { - LeafReader leafReader = getOnlyLeafReader(reader); - float[] vector = randomVector(dimensions); - // we might collect the same document twice because of soar assignments - KnnCollector collector; - if (random().nextBoolean()) { - collector = new TopKnnCollector(random().nextInt(2 * matchingDocs, 3 * matchingDocs), Integer.MAX_VALUE); - } else { - collector = new TopKnnCollector( - random().nextInt(2 * matchingDocs, 3 * matchingDocs), - Integer.MAX_VALUE, - new IVFKnnSearchStrategy(0.25f, null) + TopDocs[] topDocsArray = new TopDocs[reader.leaves().size()]; + for (int i = 0; i < reader.leaves().size(); i++) { + LeafReaderContext context = reader.leaves().get(i); + LeafReader leafReader = context.reader(); + float[] vector = randomVector(dimensions); + // we might collect the same document twice because of soar assignments + KnnCollector collector; + if (random().nextBoolean()) { + collector = new TopKnnCollector(random().nextInt(2 * matchingDocs, 3 * matchingDocs), Integer.MAX_VALUE); + } else { + collector = new TopKnnCollector( + random().nextInt(2 * matchingDocs, 3 * matchingDocs), + Integer.MAX_VALUE, + new IVFKnnSearchStrategy(0.25f, null) + ); + } + if (leafReader.postings(new Term("k", new BytesRef("B"))) == null) { + topDocsArray[i] = TopDocsCollector.EMPTY_TOPDOCS; + continue; + } + leafReader.searchNearestVectors( + "f", + vector, + collector, + AcceptDocs.fromIteratorSupplier( + () -> leafReader.postings(new Term("k", new BytesRef("B"))), + leafReader.getLiveDocs(), + leafReader.maxDoc() + ) + ); + TopDocs leafTopDocs = collector.topDocs(); + ScoreDoc[] adjusted = new ScoreDoc[leafTopDocs.scoreDocs.length]; + for (int docIndex = 0; docIndex < leafTopDocs.scoreDocs.length; docIndex++) { + ScoreDoc scoreDoc = leafTopDocs.scoreDocs[docIndex]; + adjusted[docIndex] = new ScoreDoc(scoreDoc.doc + context.docBase, scoreDoc.score); + } + topDocsArray[i] = new TopDocs(leafTopDocs.totalHits, adjusted); + // match no docs + leafReader.searchNearestVectors( + "f", + vector, + new TopKnnCollector(2, Integer.MAX_VALUE), + AcceptDocs.fromIteratorSupplier(DocIdSetIterator::empty, leafReader.getLiveDocs(), leafReader.maxDoc()) ); } - leafReader.searchNearestVectors( - "f", - vector, - collector, - AcceptDocs.fromIteratorSupplier( - () -> leafReader.postings(new Term("k", new BytesRef("A"))), - leafReader.getLiveDocs(), - leafReader.maxDoc() - ) - ); - TopDocs topDocs = collector.topDocs(); + TopDocs topDocs = TopDocs.merge(2 * maxMatchingDocs, topDocsArray); Set uniqueDocIds = new HashSet<>(); for (int i = 0; i < topDocs.scoreDocs.length; i++) { uniqueDocIds.add(topDocs.scoreDocs[i].doc); + Document document = reader.storedFields().document(topDocs.scoreDocs[i].doc); + assertThat(document.getField("k").binaryValue().utf8ToString(), equalTo("B")); } assertEquals(matchingDocs, uniqueDocIds.size()); - // match no docs - leafReader.searchNearestVectors( - "f", - vector, - new TopKnnCollector(2, Integer.MAX_VALUE), - AcceptDocs.fromIteratorSupplier(DocIdSetIterator::empty, leafReader.getLiveDocs(), leafReader.maxDoc()) - ); } } }