-
Notifications
You must be signed in to change notification settings - Fork 1.4k
GroupVarInt Encoding Implementation for HNSW Graphs #14932
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
723dcf6
0857c05
db86abf
ea9612b
4fc0f24
c6fc06d
ad58abf
a40b5d4
a47dd20
02d7a35
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -87,7 +87,8 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { | |
| static final String VECTOR_INDEX_EXTENSION = "vex"; | ||
|
|
||
| public static final int VERSION_START = 0; | ||
| public static final int VERSION_CURRENT = VERSION_START; | ||
| public static final int VERSION_GROUPVARINT = 1; | ||
| public static final int VERSION_CURRENT = VERSION_GROUPVARINT; | ||
|
|
||
| /** | ||
| * A maximum configurable maximum max conn. | ||
|
|
@@ -137,11 +138,22 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { | |
| private final int numMergeWorkers; | ||
| private final TaskExecutor mergeExec; | ||
|
|
||
| private final int writeVersion; | ||
|
|
||
| /** Constructs a format using default graph construction parameters */ | ||
| public Lucene99HnswVectorsFormat() { | ||
| this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null); | ||
| } | ||
|
|
||
| /** | ||
| * Constructs a format using the default parameters and the specific writer version. | ||
| * | ||
| * @param writeVersion the version used for the writer to encode docID's (VarInt=0, GroupVarInt=1) | ||
| */ | ||
| Lucene99HnswVectorsFormat(int writeVersion) { | ||
| this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null, writeVersion); | ||
| } | ||
|
|
||
| /** | ||
| * Constructs a format using the given graph construction parameters. | ||
| * | ||
|
|
@@ -165,6 +177,27 @@ public Lucene99HnswVectorsFormat(int maxConn, int beamWidth) { | |
| */ | ||
| public Lucene99HnswVectorsFormat( | ||
| int maxConn, int beamWidth, int numMergeWorkers, ExecutorService mergeExec) { | ||
| this(maxConn, beamWidth, numMergeWorkers, mergeExec, 1); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should use |
||
| } | ||
|
|
||
| /** | ||
| * Constructs a format using the given graph construction parameters and scalar quantization. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| * | ||
| * @param maxConn the maximum number of connections to a node in the HNSW graph | ||
| * @param beamWidth the size of the queue maintained during graph construction. | ||
| * @param numMergeWorkers number of workers (threads) that will be used when doing merge. If | ||
| * larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec | ||
| * @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are | ||
| * generated by this format to do the merge. If null, the configured {@link | ||
| * MergeScheduler#getIntraMergeExecutor(MergePolicy.OneMerge)} is used. | ||
| * @param writeVersion the version used for the writer to encode docID's (VarInt=0, GroupVarInt=1) | ||
| */ | ||
| Lucene99HnswVectorsFormat( | ||
| int maxConn, | ||
| int beamWidth, | ||
| int numMergeWorkers, | ||
| ExecutorService mergeExec, | ||
| int writeVersion) { | ||
| super("Lucene99HnswVectorsFormat"); | ||
| if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) { | ||
| throw new IllegalArgumentException( | ||
|
|
@@ -182,6 +215,7 @@ public Lucene99HnswVectorsFormat( | |
| } | ||
| this.maxConn = maxConn; | ||
| this.beamWidth = beamWidth; | ||
| this.writeVersion = writeVersion; | ||
| if (numMergeWorkers == 1 && mergeExec != null) { | ||
| throw new IllegalArgumentException( | ||
| "No executor service is needed as we'll use single thread to merge"); | ||
|
|
@@ -196,13 +230,14 @@ public Lucene99HnswVectorsFormat( | |
|
|
||
| @Override | ||
| public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { | ||
| return new Lucene99HnswVectorsWriter( | ||
| state, | ||
| maxConn, | ||
| beamWidth, | ||
| flatVectorsFormat.fieldsWriter(state), | ||
| numMergeWorkers, | ||
| mergeExec); | ||
| return new Lucene99HnswVectorsWriter( | ||
| state, | ||
| maxConn, | ||
| beamWidth, | ||
| flatVectorsFormat.fieldsWriter(state), | ||
| numMergeWorkers, | ||
| mergeExec, | ||
| writeVersion); | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,6 +76,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { | |
| private final FlatVectorsWriter flatVectorWriter; | ||
| private final int numMergeWorkers; | ||
| private final TaskExecutor mergeExec; | ||
| private final int version; | ||
|
|
||
| private final List<FieldWriter<?>> fields = new ArrayList<>(); | ||
| private boolean finished; | ||
|
|
@@ -88,11 +89,24 @@ public Lucene99HnswVectorsWriter( | |
| int numMergeWorkers, | ||
| TaskExecutor mergeExec) | ||
| throws IOException { | ||
| this(state, M, beamWidth, flatVectorWriter, numMergeWorkers, mergeExec, 1); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry I missed this earlier -- this should be |
||
| } | ||
|
|
||
| Lucene99HnswVectorsWriter( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add a "test-only" comment here too? |
||
| SegmentWriteState state, | ||
| int M, | ||
| int beamWidth, | ||
| FlatVectorsWriter flatVectorWriter, | ||
| int numMergeWorkers, | ||
| TaskExecutor mergeExec, | ||
| int version) | ||
| throws IOException { | ||
| this.M = M; | ||
| this.flatVectorWriter = flatVectorWriter; | ||
| this.beamWidth = beamWidth; | ||
| this.numMergeWorkers = numMergeWorkers; | ||
| this.mergeExec = mergeExec; | ||
| this.version = version; | ||
| segmentWriteState = state; | ||
| String metaFileName = | ||
| IndexFileNames.segmentFileName( | ||
|
|
@@ -111,13 +125,13 @@ public Lucene99HnswVectorsWriter( | |
| CodecUtil.writeIndexHeader( | ||
| meta, | ||
| Lucene99HnswVectorsFormat.META_CODEC_NAME, | ||
| Lucene99HnswVectorsFormat.VERSION_CURRENT, | ||
| version, | ||
| state.segmentInfo.getId(), | ||
| state.segmentSuffix); | ||
| CodecUtil.writeIndexHeader( | ||
| vectorIndex, | ||
| Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME, | ||
| Lucene99HnswVectorsFormat.VERSION_CURRENT, | ||
| version, | ||
| state.segmentInfo.getId(), | ||
| state.segmentSuffix); | ||
| } catch (Throwable t) { | ||
|
|
@@ -342,8 +356,12 @@ private void reconstructAndWriteNeighbours( | |
| } | ||
| // Write the size after duplicates are removed | ||
| vectorIndex.writeVInt(actualSize); | ||
| for (int i = 0; i < actualSize; i++) { | ||
| vectorIndex.writeVInt(scratch[i]); | ||
| if (version >= Lucene99HnswVectorsFormat.VERSION_GROUPVARINT) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: import as static constant for parity? |
||
| vectorIndex.writeGroupVInts(scratch, actualSize); | ||
| } else { | ||
| for (int i = 0; i < actualSize; i++) { | ||
| vectorIndex.writeVInt(scratch[i]); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -444,9 +462,14 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { | |
| } | ||
| // Write the size after duplicates are removed | ||
| vectorIndex.writeVInt(actualSize); | ||
| for (int i = 0; i < actualSize; i++) { | ||
| vectorIndex.writeVInt(scratch[i]); | ||
| if (version >= Lucene99HnswVectorsFormat.VERSION_GROUPVARINT) { | ||
| vectorIndex.writeGroupVInts(scratch, actualSize); | ||
| } else { | ||
| for (int i = 0; i < actualSize; i++) { | ||
| vectorIndex.writeVInt(scratch[i]); | ||
| } | ||
| } | ||
|
|
||
| offsets[level][nodeOffsetId++] = | ||
| Math.toIntExact(vectorIndex.getFilePointer() - offsetStart); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.lucene.codecs.lucene99; | ||
|
|
||
| import org.apache.lucene.codecs.Codec; | ||
| import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; | ||
| import org.apache.lucene.tests.util.TestUtil; | ||
|
|
||
| public class TestLucene99HnswVectorsFormatV2 extends BaseKnnVectorsFormatTestCase { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Rename as |
||
|
|
||
| @Override | ||
| protected Codec getCodec() { | ||
| return TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat(0)); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can avoid this constructor and use the larger one below from tests? (all defaults are
public)If you do so, we should add a "test-only" comment to that constructor.
If you still want to keep this, we should add the "test-only" comment here and make that
private? (since it isn't used outside this class).