2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
@@ -122,6 +122,8 @@ New Features
Improvements
---------------------

* LUCENE-9211: Add compression for Binary doc value fields. (Mark Harwood)

* LUCENE-9149: Increase data dimension limit in BKD. (Nick Knize)

* LUCENE-9102: Add maxQueryLength option to DirectSpellchecker. (Andy Webb via Bruno Roustant)
lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
@@ -29,6 +29,7 @@
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.EmptyDocValuesProducer;
import org.apache.lucene.index.FieldInfo;
@@ -42,13 +43,17 @@
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.compress.LZ4.FastCompressionHashTable;
import org.apache.lucene.util.packed.DirectMonotonicWriter;
import org.apache.lucene.util.packed.DirectWriter;

@@ -61,11 +66,13 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {

IndexOutput data, meta;
final int maxDoc;
private final SegmentWriteState state;

/** expert: Creates a new writer */
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
boolean success = false;
try {
this.state = state;
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.createOutput(dataName, state.context);
CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
@@ -353,67 +360,191 @@ private void writeBlock(long[] values, int length, long gcd, ByteBuffersDataOutput
}
}

@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
meta.writeByte(Lucene80DocValuesFormat.BINARY);

BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
meta.writeLong(start); // dataOffset
int numDocsWithField = 0;
int minLength = Integer.MAX_VALUE;
int maxLength = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
numDocsWithField++;
BytesRef v = values.binaryValue();
int length = v.length;
data.writeBytes(v.bytes, v.offset, v.length);
minLength = Math.min(length, minLength);
maxLength = Math.max(length, maxLength);
class CompressedBinaryBlockWriter implements Closeable {
final FastCompressionHashTable ht = new LZ4.FastCompressionHashTable();
int uncompressedBlockLength = 0;
int maxUncompressedBlockLength = 0;
int numDocsInCurrentBlock = 0;
final int[] docLengths = new int[Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK];
byte[] block = BytesRef.EMPTY_BYTES;
int totalChunks = 0;
long maxPointer = 0;
final long blockAddressesStart;

private final IndexOutput tempBinaryOffsets;


public CompressedBinaryBlockWriter() throws IOException {
tempBinaryOffsets = state.directory.createTempOutput(state.segmentInfo.name, "binary_pointers", state.context);
boolean success = false;
try {
CodecUtil.writeHeader(tempBinaryOffsets, Lucene80DocValuesFormat.META_CODEC + "FilePointers", Lucene80DocValuesFormat.VERSION_CURRENT);
blockAddressesStart = data.getFilePointer();
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(this); //self-close because constructor caller can't
}
}
Contributor: we usually do this like that instead, which helps avoid catching Throwable:

boolean success = false;
try {
  // write header
} finally {
  if (success == false) {
    // close
  }
}

Contributor (author): What was the "+1" comment for line 407 about? I've seen encodings elsewhere that use n+1 offsets to record the start of each value, where the last offset is effectively the end of the last value. In this scenario I'm writing n value lengths.

Contributor: It was about optimizing for the case where all values have the same length. In that case we could still steal one bit of the first length to mean that all values have the same length, for instance?
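
For reference, a minimal sketch of the decode side this encoding implies (hypothetical — the actual reader lives in Lucene80DocValuesProducer, and `in` here is assumed to be a DataInput positioned at the start of a block's length prefix):

// If the low bit of the first vInt is set, every doc in the block shares one length;
// otherwise the first length is stored shifted and the remaining lengths follow as plain vInts.
int first = in.readVInt();
int[] lengths = new int[Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK];
if ((first & 1) == 1) {
  Arrays.fill(lengths, first >>> 1);
} else {
  lengths[0] = first >>> 1;
  for (int i = 1; i < lengths.length; i++) {
    lengths[i] = in.readVInt();
  }
}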

}
assert numDocsWithField <= maxDoc;
meta.writeLong(data.getFilePointer() - start); // dataLength

if (numDocsWithField == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getBinary(field);
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
void addDoc(int doc, BytesRef v) throws IOException {
docLengths[numDocsInCurrentBlock] = v.length;
block = ArrayUtil.grow(block, uncompressedBlockLength + v.length);
System.arraycopy(v.bytes, v.offset, block, uncompressedBlockLength, v.length);
uncompressedBlockLength += v.length;
numDocsInCurrentBlock++;
if (numDocsInCurrentBlock == Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK) {
flushData();
}
}

meta.writeInt(numDocsWithField);
meta.writeInt(minLength);
meta.writeInt(maxLength);
if (maxLength > minLength) {
start = data.getFilePointer();
meta.writeLong(start);
private void flushData() throws IOException {
if (numDocsInCurrentBlock > 0) {
// Write offset to this block to temporary offsets file
totalChunks++;
long thisBlockStartPointer = data.getFilePointer();

// Optimisation - check if all lengths are same
boolean allLengthsSame = true;
for (int i = 1; i < Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK; i++) {
if (docLengths[i] != docLengths[i-1]) {
allLengthsSame = false;
break;
}
}
if (allLengthsSame) {
// Only write one value shifted. Steal a bit to indicate all other lengths are the same
int onlyOneLength = (docLengths[0] <<1) | 1;
data.writeVInt(onlyOneLength);
} else {
for (int i = 0; i < Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK; i++) {
if (i == 0) {
// Write first value shifted and steal a bit to indicate other lengths are to follow
int multipleLengths = (docLengths[0] <<1);
data.writeVInt(multipleLengths);
} else {
data.writeVInt(docLengths[i]);
}
}
}
maxUncompressedBlockLength = Math.max(maxUncompressedBlockLength, uncompressedBlockLength);
LZ4.compress(block, 0, uncompressedBlockLength, data, ht);
numDocsInCurrentBlock = 0;
// Ensure initialized with zeroes because full array is always written
Arrays.fill(docLengths, 0);
uncompressedBlockLength = 0;
maxPointer = data.getFilePointer();
tempBinaryOffsets.writeVLong(maxPointer - thisBlockStartPointer);
}
}

void writeMetaData() throws IOException {
if (totalChunks == 0) {
return;
}

long startDMW = data.getFilePointer();
meta.writeLong(startDMW);

meta.writeVInt(totalChunks);
meta.writeVInt(Lucene80DocValuesFormat.BINARY_BLOCK_SHIFT);
meta.writeVInt(maxUncompressedBlockLength);
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);


CodecUtil.writeFooter(tempBinaryOffsets);
IOUtils.close(tempBinaryOffsets);
//write the compressed block offsets info to the meta file by reading from temp file
try (ChecksumIndexInput filePointersIn = state.directory.openChecksumInput(tempBinaryOffsets.getName(), IOContext.READONCE)) {
CodecUtil.checkHeader(filePointersIn, Lucene80DocValuesFormat.META_CODEC + "FilePointers", Lucene80DocValuesFormat.VERSION_CURRENT,
Lucene80DocValuesFormat.VERSION_CURRENT);
Throwable priorE = null;
try {
final DirectMonotonicWriter filePointers = DirectMonotonicWriter.getInstance(meta, data, totalChunks, DIRECT_MONOTONIC_BLOCK_SHIFT);
long fp = blockAddressesStart;
for (int i = 0; i < totalChunks; ++i) {
filePointers.add(fp);
fp += filePointersIn.readVLong();
}
if (maxPointer < fp) {
throw new CorruptIndexException("File pointers don't add up ("+fp+" vs expected "+maxPointer+")", filePointersIn);
}
filePointers.finish();
} catch (Throwable e) {
priorE = e;
} finally {
CodecUtil.checkFooter(filePointersIn, priorE);
}
}
// Write the length of the DMW block in the data
meta.writeLong(data.getFilePointer() - startDMW);
}

final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
long addr = 0;
writer.add(addr);
values = valuesProducer.getBinary(field);
@Override
public void close() throws IOException {
if (tempBinaryOffsets != null) {
IOUtils.close(tempBinaryOffsets);
state.directory.deleteFile(tempBinaryOffsets.getName());
}
}

}


@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
meta.writeByte(Lucene80DocValuesFormat.BINARY);

try (CompressedBinaryBlockWriter blockWriter = new CompressedBinaryBlockWriter()){
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
meta.writeLong(start); // dataOffset

Comment: I think we should use the BinaryEntry object here, and then just make the object "Writable" to a given DataOutput and "Readable" from a DataInput (which is already the case: readBinaryEntry). This will avoid the comments in the code (-2 == docsWithFieldOffset, etc.).

Contributor: I like the idea but would prefer doing it in a separate PR.
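
As a rough illustration of that suggestion (hypothetical sketch — the field set and write order mirror the metadata written in this method, but the write/read pairing shown is an assumption, not part of this patch):

class BinaryEntry {
  long dataOffset, dataLength;
  long docsWithFieldOffset, docsWithFieldLength; // -2 = no docs, -1 = all docs, else bit-set offset
  short jumpTableEntryCount;
  byte denseRankPower;
  int numDocsWithField, minLength, maxLength;

  void write(DataOutput meta) throws IOException {
    meta.writeLong(dataOffset);
    meta.writeLong(dataLength);
    meta.writeLong(docsWithFieldOffset);
    meta.writeLong(docsWithFieldLength);
    meta.writeShort(jumpTableEntryCount);
    meta.writeByte(denseRankPower);
    meta.writeInt(numDocsWithField);
    meta.writeInt(minLength);
    meta.writeInt(maxLength);
  }

  static BinaryEntry read(DataInput meta) throws IOException {
    BinaryEntry e = new BinaryEntry();
    e.dataOffset = meta.readLong();
    e.dataLength = meta.readLong();
    e.docsWithFieldOffset = meta.readLong();
    e.docsWithFieldLength = meta.readLong();
    e.jumpTableEntryCount = meta.readShort();
    e.denseRankPower = meta.readByte();
    e.numDocsWithField = meta.readInt();
    e.minLength = meta.readInt();
    e.maxLength = meta.readInt();
    return e;
  }
}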

int numDocsWithField = 0;
int minLength = Integer.MAX_VALUE;
int maxLength = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
addr += values.binaryValue().length;
writer.add(addr);
numDocsWithField++;
BytesRef v = values.binaryValue();
blockWriter.addDoc(doc, v);
int length = v.length;
minLength = Math.min(length, minLength);
maxLength = Math.max(length, maxLength);
}
writer.finish();
meta.writeLong(data.getFilePointer() - start);
blockWriter.flushData();

assert numDocsWithField <= maxDoc;
meta.writeLong(data.getFilePointer() - start); // dataLength

if (numDocsWithField == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getBinary(field);
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
Comment on lines +520 to +538

Contributor: I'm currently working on a refactor of this code, introducing a doc-id-set iterator serializer that can provide the correct instance based on the stored metadata. As you can see, this pattern is quite repetitive across the other field types.
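
A rough sketch of what such a serializer could factor out (hypothetical helper — the body simply lifts the repeated empty/dense/sparse cases from this method):

static void writeDocIdSetMeta(IndexOutput meta, IndexOutput data, DocIdSetIterator values,
    int numDocsWithField, int maxDoc) throws IOException {
  if (numDocsWithField == 0) {
    meta.writeLong(-2);          // docsWithFieldOffset: no doc has a value
    meta.writeLong(0L);          // docsWithFieldLength
    meta.writeShort((short) -1); // jumpTableEntryCount
    meta.writeByte((byte) -1);   // denseRankPower
  } else if (numDocsWithField == maxDoc) {
    meta.writeLong(-1);          // docsWithFieldOffset: every doc has a value
    meta.writeLong(0L);
    meta.writeShort((short) -1);
    meta.writeByte((byte) -1);
  } else {
    long offset = data.getFilePointer();
    meta.writeLong(offset);      // docsWithFieldOffset: position of the sparse bit set
    short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
    meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
    meta.writeShort(jumpTableEntryCount);
    meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
  }
}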


meta.writeInt(numDocsWithField);
meta.writeInt(minLength);
meta.writeInt(maxLength);

blockWriter.writeMetaData();

}

}

@Override
lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java
@@ -151,7 +151,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException
static final String META_CODEC = "Lucene80DocValuesMetadata";
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
static final int VERSION_BIN_COMPRESSED = 1;

Comment: This could potentially be in the BinaryDocValuesFormat class.

static final int VERSION_CURRENT = VERSION_BIN_COMPRESSED;

// indicates docvalues type
static final byte NUMERIC = 0;
@@ -165,6 +166,9 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException
static final int NUMERIC_BLOCK_SHIFT = 14;
static final int NUMERIC_BLOCK_SIZE = 1 << NUMERIC_BLOCK_SHIFT;

static final int BINARY_BLOCK_SHIFT = 5;
static final int BINARY_DOCS_PER_COMPRESSED_BLOCK = 1 << BINARY_BLOCK_SHIFT;
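// Illustrative note (not part of the patch): with BINARY_BLOCK_SHIFT = 5 each
// compressed block holds 32 documents, so a reader can locate a doc's block
// with cheap bit math:
//   int blockId = doc >>> BINARY_BLOCK_SHIFT;                      // which block
//   int docInBlock = doc & (BINARY_DOCS_PER_COMPRESSED_BLOCK - 1); // slot within it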

static final int TERMS_DICT_BLOCK_SHIFT = 4;
static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;