HBASE-27264 Add options to consider compressed size when delimiting blocks during hfile writes #4675

HFileBlock.java

@@ -41,6 +41,7 @@
 import org.apache.hadoop.hbase.io.ByteBufferWriterDataOutputStream;
 import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
 import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
+import org.apache.hadoop.hbase.io.encoding.EncodedDataBlock;
 import org.apache.hadoop.hbase.io.encoding.EncodingState;
 import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
 import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultDecodingContext;

@@ -239,6 +240,10 @@ static class Header {
   static final byte[] DUMMY_HEADER_NO_CHECKSUM =
     new byte[HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM];

+  public static final String BLOCK_DELIMIT_COMPRESSED = "hbase.block.delimit.compressed";
+
+  public static final String MAX_BLOCK_SIZE_UNCOMPRESSED = "hbase.block.max.size.uncompressed";
+
   /**
    * Used deserializing blocks from Cache. <code>
    * ++++++++++++++
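
The two new keys are plain Configuration settings; their defaults (disabled, and ten times the block size) come from how HFileWriterImpl reads them further down in this diff. A minimal, hypothetical client-side sketch of turning the feature on could look like this (class name and values are illustrative only):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class CompressedDelimitConfigSketch {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // Off by default: block boundaries are decided by uncompressed size only.
    conf.setBoolean("hbase.block.delimit.compressed", true);
    // Safety cap on uncompressed bytes per block; defaults to 10x the block size.
    conf.setInt("hbase.block.max.size.uncompressed", 10 * 64 * 1024);
  }
}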

@@ -454,7 +459,7 @@ int getOnDiskSizeWithoutHeader() {
   }

   /** Returns the uncompressed size of data part (header and checksum excluded). */
-  int getUncompressedSizeWithoutHeader() {
+  public int getUncompressedSizeWithoutHeader() {
     return uncompressedSizeWithoutHeader;
   }

@@ -729,6 +734,16 @@ private enum State {
     BLOCK_READY
   }

+  public boolean isDelimitByCompressedSize() {
+    return delimitByCompressedSize;
+  }
+
+  private boolean delimitByCompressedSize;
+
+  private int maxSizeUnCompressed;
+
+  private int adjustedBlockSize;
+
   /** Writer state. Used to ensure the correct usage protocol. */
   private State state = State.INIT;

@@ -807,11 +822,13 @@ EncodingState getEncodingState() {
    */
   public Writer(Configuration conf, HFileDataBlockEncoder dataBlockEncoder,
     HFileContext fileContext) {
-    this(conf, dataBlockEncoder, fileContext, ByteBuffAllocator.HEAP);
+    this(conf, dataBlockEncoder, fileContext, ByteBuffAllocator.HEAP, false,
+      fileContext.getBlocksize());
   }

   public Writer(Configuration conf, HFileDataBlockEncoder dataBlockEncoder,
-    HFileContext fileContext, ByteBuffAllocator allocator) {
+    HFileContext fileContext, ByteBuffAllocator allocator, boolean sizeLimitcompleted,
+    int maxSizeUnCompressed) {
     if (fileContext.getBytesPerChecksum() < HConstants.HFILEBLOCK_HEADER_SIZE) {
       throw new RuntimeException("Unsupported value of bytesPerChecksum. " + " Minimum is "
         + HConstants.HFILEBLOCK_HEADER_SIZE + " but the configured value is "

@@ -834,6 +851,8 @@ public Writer(Configuration conf, HFileDataBlockEncoder dataBlockEncoder,
     // TODO: Why fileContext saved away when we have dataBlockEncoder and/or
     // defaultDataBlockEncoder?
     this.fileContext = fileContext;
+    this.delimitByCompressedSize = sizeLimitcompleted;
+    this.maxSizeUnCompressed = maxSizeUnCompressed;
   }

   /**

@@ -886,6 +905,27 @@ void ensureBlockReady() throws IOException {
     finishBlock();
   }

+  public boolean shouldFinishBlock() throws IOException {
+    int uncompressedBlockSize = blockSizeWritten();
+    if (uncompressedBlockSize >= fileContext.getBlocksize()) {
+      if (delimitByCompressedSize && uncompressedBlockSize < maxSizeUnCompressed) {
+        // In order to avoid excessive compression size calculations, we do it only once when
+        // the uncompressed size has reached BLOCKSIZE. We then use this compression size to
+        // calculate the compression rate, and adjust the block size limit by this ratio.
+        if (adjustedBlockSize == 0 || uncompressedBlockSize >= adjustedBlockSize) {
+          int compressedSize = EncodedDataBlock.getCompressedSize(fileContext.getCompression(),
+            fileContext.getCompression().getCompressor(), baosInMemory.getBuffer(), 0,
+            baosInMemory.size());
+          adjustedBlockSize = uncompressedBlockSize / compressedSize;
+          adjustedBlockSize *= fileContext.getBlocksize();
+        }
+        return uncompressedBlockSize >= adjustedBlockSize;
+      }
+      return true;
+    }
+    return false;
+  }
+
   /**
    * Finish up writing of the block. Flushes the compressing stream (if using compression), fills
    * out the header, does any compression/encryption of bytes to flush out to disk, and manages
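
To make the arithmetic in shouldFinishBlock() above concrete, here is a hypothetical walk-through with made-up numbers (64 KB block size, data compressing roughly 4:1); the variable names mirror the patch, but the values are illustrative only:

public class AdjustedBlockSizeWalkthrough {
  public static void main(String[] args) {
    // Illustrative values only; not part of the patch.
    int blocksize = 64 * 1024;                  // fileContext.getBlocksize()
    int maxSizeUnCompressed = 10 * blocksize;   // 640 KB, the default cap wired up in HFileWriterImpl
    int uncompressedBlockSize = 64 * 1024;      // raw bytes written when the check first fires
    int compressedSize = 16 * 1024;             // one-off measurement, here assuming ~4:1 compression

    int adjustedBlockSize = uncompressedBlockSize / compressedSize; // 4 (integer compression ratio)
    adjustedBlockSize *= blocksize;                                 // 256 KB compression-adjusted limit

    boolean finish = uncompressedBlockSize >= adjustedBlockSize;    // false: the block keeps growing
    System.out.println("adjusted limit = " + adjustedBlockSize + ", finish now = " + finish);
    // The block closes once roughly 256 KB of raw data is written (about one 64 KB block on disk),
    // or unconditionally once the 640 KB uncompressed cap (maxSizeUnCompressed) is exceeded.
  }
}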

@@ -1066,7 +1106,7 @@ int getUncompressedSizeWithoutHeader() {
   /**
    * The uncompressed size of the block data, including header size.
    */
-  int getUncompressedSizeWithHeader() {
+  public int getUncompressedSizeWithHeader() {
     expectState(State.BLOCK_READY);
     return baosInMemory.size();
   }

HFileWriterImpl.java

@@ -17,6 +17,9 @@
  */
 package org.apache.hadoop.hbase.io.hfile;

+import static org.apache.hadoop.hbase.io.hfile.HFileBlock.BLOCK_DELIMIT_COMPRESSED;
+import static org.apache.hadoop.hbase.io.hfile.HFileBlock.MAX_BLOCK_SIZE_UNCOMPRESSED;
+
 import java.io.DataOutput;
 import java.io.DataOutputStream;
 import java.io.IOException;

@@ -291,8 +294,9 @@ protected void finishInit(final Configuration conf) {
     if (blockWriter != null) {
       throw new IllegalStateException("finishInit called twice");
     }
-    blockWriter =
-      new HFileBlock.Writer(conf, blockEncoder, hFileContext, cacheConf.getByteBuffAllocator());
+    blockWriter = new HFileBlock.Writer(conf, blockEncoder, hFileContext,
+      cacheConf.getByteBuffAllocator(), conf.getBoolean(BLOCK_DELIMIT_COMPRESSED, false),
+      conf.getInt(MAX_BLOCK_SIZE_UNCOMPRESSED, hFileContext.getBlocksize() * 10));
     // Data block index writer
     boolean cacheIndexesOnWrite = cacheConf.shouldCacheIndexesOnWrite();
     dataBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter(blockWriter,

@@ -319,6 +323,9 @@ protected void checkBlockBoundary() throws IOException {
       shouldFinishBlock = blockWriter.encodedBlockSizeWritten() >= hFileContext.getBlocksize()
         || blockWriter.blockSizeWritten() >= hFileContext.getBlocksize();
     }
+    if (blockWriter.isDelimitByCompressedSize()) {
+      shouldFinishBlock &= blockWriter.shouldFinishBlock();
+    }
     if (shouldFinishBlock) {
       finishBlock();
       writeInlineBlocks(false);

Review comment (on the checkBlockBoundary change above):
Can we move this and its logic to the predicate itself?
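
One way to read that suggestion (purely a sketch, not code from this PR) is to fold the flag check into shouldFinishBlock() itself, so checkBlockBoundary() could apply shouldFinishBlock &= blockWriter.shouldFinishBlock() unconditionally and drop the isDelimitByCompressedSize() guard:

// Hypothetical refactor of HFileBlock.Writer.shouldFinishBlock(); field and method names
// mirror the patch, but this body only illustrates the reviewer's question.
public boolean shouldFinishBlock() throws IOException {
  if (!delimitByCompressedSize) {
    return true; // feature off: impose no extra constraint on the caller's size check
  }
  int uncompressedBlockSize = blockSizeWritten();
  if (uncompressedBlockSize < fileContext.getBlocksize()) {
    return false; // not enough raw data written yet
  }
  if (uncompressedBlockSize >= maxSizeUnCompressed) {
    return true; // uncompressed safety cap reached
  }
  // Compression-adjusted check, unchanged from the patch.
  if (adjustedBlockSize == 0 || uncompressedBlockSize >= adjustedBlockSize) {
    int compressedSize = EncodedDataBlock.getCompressedSize(fileContext.getCompression(),
      fileContext.getCompression().getCompressor(), baosInMemory.getBuffer(), 0,
      baosInMemory.size());
    adjustedBlockSize = uncompressedBlockSize / compressedSize;
    adjustedBlockSize *= fileContext.getBlocksize();
  }
  return uncompressedBlockSize >= adjustedBlockSize;
}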