-
Notifications
You must be signed in to change notification settings - Fork 2.6k
LUCENE-9211 Add compression for Binary doc value fields #1234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1dcbb2e
1b0c227
1b08f9b
6dd00af
f69f990
289d549
7d0db34
3a3c30f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ | |
| import org.apache.lucene.codecs.DocValuesConsumer; | ||
| import org.apache.lucene.codecs.DocValuesProducer; | ||
| import org.apache.lucene.index.BinaryDocValues; | ||
| import org.apache.lucene.index.CorruptIndexException; | ||
| import org.apache.lucene.index.DocValues; | ||
| import org.apache.lucene.index.EmptyDocValuesProducer; | ||
| import org.apache.lucene.index.FieldInfo; | ||
|
|
@@ -42,13 +43,17 @@ | |
| import org.apache.lucene.search.SortedSetSelector; | ||
| import org.apache.lucene.store.ByteBuffersDataOutput; | ||
| import org.apache.lucene.store.ByteBuffersIndexOutput; | ||
| import org.apache.lucene.store.ChecksumIndexInput; | ||
| import org.apache.lucene.store.IOContext; | ||
| import org.apache.lucene.store.IndexOutput; | ||
| import org.apache.lucene.util.ArrayUtil; | ||
| import org.apache.lucene.util.BytesRef; | ||
| import org.apache.lucene.util.BytesRefBuilder; | ||
| import org.apache.lucene.util.IOUtils; | ||
| import org.apache.lucene.util.MathUtil; | ||
| import org.apache.lucene.util.StringHelper; | ||
| import org.apache.lucene.util.compress.LZ4; | ||
| import org.apache.lucene.util.compress.LZ4.FastCompressionHashTable; | ||
| import org.apache.lucene.util.packed.DirectMonotonicWriter; | ||
| import org.apache.lucene.util.packed.DirectWriter; | ||
|
|
||
|
|
@@ -61,11 +66,13 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close | |
|
|
||
| IndexOutput data, meta; | ||
| final int maxDoc; | ||
| private final SegmentWriteState state; | ||
|
|
||
| /** expert: Creates a new writer */ | ||
| public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { | ||
| boolean success = false; | ||
| try { | ||
| this.state = state; | ||
| String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); | ||
| data = state.directory.createOutput(dataName, state.context); | ||
| CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); | ||
|
|
@@ -353,67 +360,191 @@ private void writeBlock(long[] values, int length, long gcd, ByteBuffersDataOutp | |
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { | ||
| meta.writeInt(field.number); | ||
| meta.writeByte(Lucene80DocValuesFormat.BINARY); | ||
|
|
||
| BinaryDocValues values = valuesProducer.getBinary(field); | ||
| long start = data.getFilePointer(); | ||
| meta.writeLong(start); // dataOffset | ||
| int numDocsWithField = 0; | ||
| int minLength = Integer.MAX_VALUE; | ||
| int maxLength = 0; | ||
| for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { | ||
| numDocsWithField++; | ||
| BytesRef v = values.binaryValue(); | ||
| int length = v.length; | ||
| data.writeBytes(v.bytes, v.offset, v.length); | ||
| minLength = Math.min(length, minLength); | ||
| maxLength = Math.max(length, maxLength); | ||
| class CompressedBinaryBlockWriter implements Closeable { | ||
| final FastCompressionHashTable ht = new LZ4.FastCompressionHashTable(); | ||
| int uncompressedBlockLength = 0; | ||
| int maxUncompressedBlockLength = 0; | ||
| int numDocsInCurrentBlock = 0; | ||
| final int[] docLengths = new int[Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK]; | ||
| byte[] block = BytesRef.EMPTY_BYTES; | ||
| int totalChunks = 0; | ||
| long maxPointer = 0; | ||
| final long blockAddressesStart; | ||
|
|
||
| private final IndexOutput tempBinaryOffsets; | ||
|
|
||
|
|
||
| public CompressedBinaryBlockWriter() throws IOException { | ||
| tempBinaryOffsets = state.directory.createTempOutput(state.segmentInfo.name, "binary_pointers", state.context); | ||
| boolean success = false; | ||
| try { | ||
| CodecUtil.writeHeader(tempBinaryOffsets, Lucene80DocValuesFormat.META_CODEC + "FilePointers", Lucene80DocValuesFormat.VERSION_CURRENT); | ||
| blockAddressesStart = data.getFilePointer(); | ||
| success = true; | ||
| } finally { | ||
| if (success == false) { | ||
| IOUtils.closeWhileHandlingException(this); //self-close because constructor caller can't | ||
| } | ||
| } | ||
| } | ||
| assert numDocsWithField <= maxDoc; | ||
| meta.writeLong(data.getFilePointer() - start); // dataLength | ||
|
|
||
| if (numDocsWithField == 0) { | ||
| meta.writeLong(-2); // docsWithFieldOffset | ||
| meta.writeLong(0L); // docsWithFieldLength | ||
| meta.writeShort((short) -1); // jumpTableEntryCount | ||
| meta.writeByte((byte) -1); // denseRankPower | ||
| } else if (numDocsWithField == maxDoc) { | ||
| meta.writeLong(-1); // docsWithFieldOffset | ||
| meta.writeLong(0L); // docsWithFieldLength | ||
| meta.writeShort((short) -1); // jumpTableEntryCount | ||
| meta.writeByte((byte) -1); // denseRankPower | ||
| } else { | ||
| long offset = data.getFilePointer(); | ||
| meta.writeLong(offset); // docsWithFieldOffset | ||
| values = valuesProducer.getBinary(field); | ||
| final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); | ||
| meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength | ||
| meta.writeShort(jumpTableEntryCount); | ||
| meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); | ||
| void addDoc(int doc, BytesRef v) throws IOException { | ||
| docLengths[numDocsInCurrentBlock] = v.length; | ||
| block = ArrayUtil.grow(block, uncompressedBlockLength + v.length); | ||
| System.arraycopy(v.bytes, v.offset, block, uncompressedBlockLength, v.length); | ||
| uncompressedBlockLength += v.length; | ||
| numDocsInCurrentBlock++; | ||
| if (numDocsInCurrentBlock == Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK) { | ||
| flushData(); | ||
| } | ||
| } | ||
|
|
||
| meta.writeInt(numDocsWithField); | ||
| meta.writeInt(minLength); | ||
| meta.writeInt(maxLength); | ||
| if (maxLength > minLength) { | ||
| start = data.getFilePointer(); | ||
| meta.writeLong(start); | ||
| private void flushData() throws IOException { | ||
| if (numDocsInCurrentBlock > 0) { | ||
| // Write offset to this block to temporary offsets file | ||
| totalChunks++; | ||
| long thisBlockStartPointer = data.getFilePointer(); | ||
|
|
||
| // Optimisation - check if all lengths are same | ||
| boolean allLengthsSame = true; | ||
| for (int i = 1; i < Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK; i++) { | ||
| if (docLengths[i] != docLengths[i-1]) { | ||
| allLengthsSame = false; | ||
| break; | ||
| } | ||
| } | ||
| if (allLengthsSame) { | ||
| // Only write one value shifted. Steal a bit to indicate all other lengths are the same | ||
| int onlyOneLength = (docLengths[0] <<1) | 1; | ||
| data.writeVInt(onlyOneLength); | ||
| } else { | ||
| for (int i = 0; i < Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK; i++) { | ||
| if (i == 0) { | ||
| // Write first value shifted and steal a bit to indicate other lengths are to follow | ||
| int multipleLengths = (docLengths[0] <<1); | ||
| data.writeVInt(multipleLengths); | ||
| } else { | ||
| data.writeVInt(docLengths[i]); | ||
| } | ||
| } | ||
| } | ||
| maxUncompressedBlockLength = Math.max(maxUncompressedBlockLength, uncompressedBlockLength); | ||
| LZ4.compress(block, 0, uncompressedBlockLength, data, ht); | ||
| numDocsInCurrentBlock = 0; | ||
| // Ensure initialized with zeroes because full array is always written | ||
| Arrays.fill(docLengths, 0); | ||
| uncompressedBlockLength = 0; | ||
| maxPointer = data.getFilePointer(); | ||
| tempBinaryOffsets.writeVLong(maxPointer - thisBlockStartPointer); | ||
| } | ||
| } | ||
|
|
||
| void writeMetaData() throws IOException { | ||
| if (totalChunks == 0) { | ||
| return; | ||
| } | ||
|
|
||
| long startDMW = data.getFilePointer(); | ||
| meta.writeLong(startDMW); | ||
|
|
||
| meta.writeVInt(totalChunks); | ||
| meta.writeVInt(Lucene80DocValuesFormat.BINARY_BLOCK_SHIFT); | ||
| meta.writeVInt(maxUncompressedBlockLength); | ||
| meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); | ||
|
|
||
|
|
||
| CodecUtil.writeFooter(tempBinaryOffsets); | ||
| IOUtils.close(tempBinaryOffsets); | ||
| //write the compressed block offsets info to the meta file by reading from temp file | ||
| try (ChecksumIndexInput filePointersIn = state.directory.openChecksumInput(tempBinaryOffsets.getName(), IOContext.READONCE)) { | ||
| CodecUtil.checkHeader(filePointersIn, Lucene80DocValuesFormat.META_CODEC + "FilePointers", Lucene80DocValuesFormat.VERSION_CURRENT, | ||
| Lucene80DocValuesFormat.VERSION_CURRENT); | ||
| Throwable priorE = null; | ||
| try { | ||
| final DirectMonotonicWriter filePointers = DirectMonotonicWriter.getInstance(meta, data, totalChunks, DIRECT_MONOTONIC_BLOCK_SHIFT); | ||
| long fp = blockAddressesStart; | ||
| for (int i = 0; i < totalChunks; ++i) { | ||
| filePointers.add(fp); | ||
| fp += filePointersIn.readVLong(); | ||
| } | ||
| if (maxPointer < fp) { | ||
| throw new CorruptIndexException("File pointers don't add up ("+fp+" vs expected "+maxPointer+")", filePointersIn); | ||
| } | ||
| filePointers.finish(); | ||
| } catch (Throwable e) { | ||
| priorE = e; | ||
| } finally { | ||
| CodecUtil.checkFooter(filePointersIn, priorE); | ||
| } | ||
| } | ||
| // Write the length of the DMW block in the data | ||
| meta.writeLong(data.getFilePointer() - startDMW); | ||
| } | ||
|
|
||
| final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT); | ||
| long addr = 0; | ||
| writer.add(addr); | ||
| values = valuesProducer.getBinary(field); | ||
| @Override | ||
| public void close() throws IOException { | ||
| if (tempBinaryOffsets != null) { | ||
| IOUtils.close(tempBinaryOffsets); | ||
| state.directory.deleteFile(tempBinaryOffsets.getName()); | ||
| } | ||
| } | ||
|
|
||
| } | ||
|
|
||
|
|
||
| @Override | ||
| public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { | ||
| meta.writeInt(field.number); | ||
| meta.writeByte(Lucene80DocValuesFormat.BINARY); | ||
|
|
||
| try (CompressedBinaryBlockWriter blockWriter = new CompressedBinaryBlockWriter()){ | ||
| BinaryDocValues values = valuesProducer.getBinary(field); | ||
| long start = data.getFilePointer(); | ||
| meta.writeLong(start); // dataOffset | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should use the BinaryEntry object here, and then just make the object "Writable" to a given DataOutput and "Readable" from a DataInput (which is already the case: readBinaryEntry). This would avoid needing explanatory comments in the code such as -2 == docsWithFieldOffset, etc.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea but would prefer doing it in a separate PR. |
||
| int numDocsWithField = 0; | ||
| int minLength = Integer.MAX_VALUE; | ||
| int maxLength = 0; | ||
| for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { | ||
| addr += values.binaryValue().length; | ||
| writer.add(addr); | ||
| numDocsWithField++; | ||
| BytesRef v = values.binaryValue(); | ||
| blockWriter.addDoc(doc, v); | ||
| int length = v.length; | ||
| minLength = Math.min(length, minLength); | ||
| maxLength = Math.max(length, maxLength); | ||
| } | ||
| writer.finish(); | ||
| meta.writeLong(data.getFilePointer() - start); | ||
| blockWriter.flushData(); | ||
|
|
||
| assert numDocsWithField <= maxDoc; | ||
| meta.writeLong(data.getFilePointer() - start); // dataLength | ||
|
|
||
| if (numDocsWithField == 0) { | ||
| meta.writeLong(-2); // docsWithFieldOffset | ||
| meta.writeLong(0L); // docsWithFieldLength | ||
| meta.writeShort((short) -1); // jumpTableEntryCount | ||
| meta.writeByte((byte) -1); // denseRankPower | ||
| } else if (numDocsWithField == maxDoc) { | ||
| meta.writeLong(-1); // docsWithFieldOffset | ||
| meta.writeLong(0L); // docsWithFieldLength | ||
| meta.writeShort((short) -1); // jumpTableEntryCount | ||
| meta.writeByte((byte) -1); // denseRankPower | ||
| } else { | ||
| long offset = data.getFilePointer(); | ||
| meta.writeLong(offset); // docsWithFieldOffset | ||
| values = valuesProducer.getBinary(field); | ||
| final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); | ||
| meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength | ||
| meta.writeShort(jumpTableEntryCount); | ||
| meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); | ||
| } | ||
|
Comment on lines
+520
to
+538
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently I'm working on a refactor of this code by having a doc id set iterator serializer capable of providing the correct instance based on the stored metadata. As you can see, this is quite repetitive across the other fields |
||
|
|
||
| meta.writeInt(numDocsWithField); | ||
| meta.writeInt(minLength); | ||
| meta.writeInt(maxLength); | ||
|
|
||
| blockWriter.writeMetaData(); | ||
|
|
||
| } | ||
|
|
||
| } | ||
|
|
||
| @Override | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -151,7 +151,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti | |
| static final String META_CODEC = "Lucene80DocValuesMetadata"; | ||
| static final String META_EXTENSION = "dvm"; | ||
| static final int VERSION_START = 0; | ||
| static final int VERSION_CURRENT = VERSION_START; | ||
| static final int VERSION_BIN_COMPRESSED = 1; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be potentially in the BinaryDocValuesFormat class |
||
| static final int VERSION_CURRENT = VERSION_BIN_COMPRESSED; | ||
|
|
||
| // indicates docvalues type | ||
| static final byte NUMERIC = 0; | ||
|
|
@@ -165,6 +166,9 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti | |
| static final int NUMERIC_BLOCK_SHIFT = 14; | ||
| static final int NUMERIC_BLOCK_SIZE = 1 << NUMERIC_BLOCK_SHIFT; | ||
|
|
||
| static final int BINARY_BLOCK_SHIFT = 5; | ||
| static final int BINARY_DOCS_PER_COMPRESSED_BLOCK = 1 << BINARY_BLOCK_SHIFT; | ||
|
|
||
| static final int TERMS_DICT_BLOCK_SHIFT = 4; | ||
| static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT; | ||
| static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we usually do this like that instead, which helps avoid catching Throwable
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What was the "+1" comment for line 407 about?
I've seen encodings elsewhere that have n+1 offsets to record the start of each value, where the last offset is effectively the end of the last value. In this scenario I'm writing n value lengths.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was about optimizing for the case where all values have the same length. In that case we could steal one bit of the first length to mean that all values have the same length, for instance?