Skip to content

Commit 40d1d26

Browse files
committed
ORC-1577: Use ZSTD as the default compression
### What changes were proposed in this pull request?

This PR aims to use `ZSTD` as the default compression starting with Apache ORC 2.0.0.

### Why are the changes needed?

Apache ORC has supported ZStandard since 1.6.0. ZStandard is known to be better than Gzip in terms of both size and speed.

- _The Rise of ZStandard: Apache Spark/Parquet/ORC/Avro_
  - [Slides](https://www.slideshare.net/databricks/the-rise-of-zstandard-apache-sparkparquetorcavro)
  - [YouTube](https://youtu.be/dTGxhHwjONY)

### How was this patch tested?

Pass the CIs.

Closes #1733 from dongjoon-hyun/ORC-1577.

Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit baf4c23)
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 73ea5b8 commit 40d1d26

File tree

7 files changed

+11
-10
lines changed

7 files changed

+11
-10
lines changed

c++/src/Writer.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ namespace orc {
5151
stripeSize = 64 * 1024 * 1024; // 64M
5252
compressionBlockSize = 64 * 1024; // 64K
5353
rowIndexStride = 10000;
54-
compression = CompressionKind_ZLIB;
54+
compression = CompressionKind_ZSTD;
5555
compressionStrategy = CompressionStrategy_SPEED;
5656
memoryPool = getDefaultPool();
5757
paddingTolerance = 0.0;

java/core/src/java/org/apache/orc/OrcConf.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public enum OrcConf {
5252
BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
5353
true,
5454
"Define whether stripes should be padded to the HDFS block boundaries."),
55-
COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
55+
COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZSTD",
5656
"Define the default compression codec for ORC file"),
5757
WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
5858
"Define the version of the file to write. Possible values are 0.11 and\n"+

java/core/src/test/org/apache/orc/TestVectorOrcFile.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ public void testStringAndBinaryStatistics(Version fileFormat) throws Exception {
538538

539539
assertEquals(3, stats[1].getNumberOfValues());
540540
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
541-
assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString());
541+
assertEquals("count: 3 hasNull: true bytesOnDisk: 30 sum: 15", stats[1].toString());
542542

543543
assertEquals(3, stats[2].getNumberOfValues());
544544
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
@@ -1255,7 +1255,7 @@ public void test1(Version fileFormat) throws Exception {
12551255
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
12561256
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
12571257
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
1258-
assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0",
1258+
assertEquals("count: 2 hasNull: false bytesOnDisk: 19 min: -15.0 max: -5.0 sum: -20.0",
12591259
stats[7].toString());
12601260

12611261
assertEquals("count: 2 hasNull: false bytesOnDisk: " +
@@ -3961,7 +3961,7 @@ public void testEncryptMerge(Version fileFormat) throws Exception {
39613961
// test reading with no keys
39623962
Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf));
39633963
assertEquals(9 * 1024, reader.getNumberOfRows());
3964-
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
3964+
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
39653965
assertEquals(1000, reader.getRowIndexStride());
39663966
assertEquals(0xc00, reader.getCompressionSize());
39673967
assertEquals(fileFormat, reader.getFileVersion());
@@ -4107,7 +4107,7 @@ public void testEncryptMerge(Version fileFormat) throws Exception {
41074107

41084108
reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf));
41094109
assertEquals(2 * 3 * 1024, reader.getNumberOfRows());
4110-
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
4110+
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
41114111
assertEquals(0x800, reader.getCompressionSize());
41124112
assertEquals(1000, reader.getRowIndexStride());
41134113
assertEquals(fileFormat, reader.getFileVersion());

java/tools/src/test/org/apache/orc/tools/TestFileDump.java

+1
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,7 @@ public void testHasNull() throws Exception {
588588
Writer writer = OrcFile.createWriter(testFilePath,
589589
OrcFile.writerOptions(conf)
590590
.setSchema(schema)
591+
.compress(CompressionKind.ZLIB)
591592
.rowIndexStride(1000)
592593
.stripeSize(10000)
593594
.bufferSize(10000));

site/_docs/core-java-config.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ permalink: /docs/core-java-config.html
6969
</tr>
7070
<tr>
7171
<td><code>orc.compress</code></td>
72-
<td>ZLIB</td>
72+
<td>ZSTD</td>
7373
<td>
7474
Define the default compression codec for ORC file
7575
</td>
@@ -396,4 +396,4 @@ permalink: /docs/core-java-config.html
396396
The maximum number of child elements to buffer before the ORC row writer writes the batch to the file.
397397
</td>
398398
</tr>
399-
</table>
399+
</table>

site/_docs/hive-config.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ with the same options.
1212

1313
Key | Default | Notes
1414
:----------------------- | :---------- | :------------------------
15-
orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
15+
orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
1616
orc.compress.size | 262,144 | compression chunk size
1717
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
1818
orc.row.index.stride | 10,000 | number of rows between index entries

site/_docs/spark-config.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ with the same options.
1212

1313
Key | Default | Notes
1414
:----------------------- | :---------- | :------------------------
15-
orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
15+
orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
1616
orc.compress.size | 262,144 | compression chunk size
1717
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
1818
orc.row.index.stride | 10,000 | number of rows between index entries

0 commit comments

Comments
 (0)