Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.FileUtils;
import org.apache.directory.api.util.Hex;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.io.ByteStreams;
Expand Down
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ project(':iceberg-aliyun') {
exclude module: 'logback-classic'
exclude module: 'spring-boot-starter-logging'
}
testImplementation 'commons-io:commons-io'
}
}

Expand Down
3 changes: 3 additions & 0 deletions core/src/main/java/org/apache/iceberg/TableProperties.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ private TableProperties() {}
public static final String DELETE_PARQUET_PAGE_ROW_LIMIT = "write.delete.parquet.page-row-limit";
public static final int PARQUET_PAGE_ROW_LIMIT_DEFAULT = 20_000;

public static final String PARQUET_DICT_ENABLED = "write.parquet.enable.dictionary";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't match the format of Iceberg options. It should be write.parquet.dict-enabled to match dict-size-bytes or should use enabled as the last word if you're using dictionary as a part of the hierarchy.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, the name does not match our pattern

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies for my oversight here — I intended to have `.enabled` as the suffix, which I mentioned in the table properties doc, but messed it up here.

public static final boolean PARQUET_DICT_ENABLED_DEFAULT = true;

public static final String PARQUET_DICT_SIZE_BYTES = "write.parquet.dict-size-bytes";
public static final String DELETE_PARQUET_DICT_SIZE_BYTES =
"write.delete.parquet.dict-size-bytes";
Expand Down
91 changes: 46 additions & 45 deletions docs/configuration.md

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_PAGE_ROW_LIMIT;
Expand Down Expand Up @@ -244,6 +246,7 @@ public <D> FileAppender<D> build() throws IOException {
int rowGroupSize = context.rowGroupSize();
int pageSize = context.pageSize();
int pageRowLimit = context.pageRowLimit();
boolean dictionaryEnabled = context.dictionaryEnabled();
int dictionaryPageSize = context.dictionaryPageSize();
String compressionLevel = context.compressionLevel();
CompressionCodecName codec = context.codec();
Expand Down Expand Up @@ -286,6 +289,7 @@ public <D> FileAppender<D> build() throws IOException {
.withWriterVersion(writerVersion)
.withPageSize(pageSize)
.withPageRowCountLimit(pageRowLimit)
.withDictionaryEncoding(dictionaryEnabled)
.withDictionaryPageSize(dictionaryPageSize)
.withMinRowCountForPageSizeCheck(rowGroupCheckMinRecordCount)
.withMaxRowCountForPageSizeCheck(rowGroupCheckMaxRecordCount)
Expand Down Expand Up @@ -323,6 +327,7 @@ public <D> FileAppender<D> build() throws IOException {
.withRowGroupSize(rowGroupSize)
.withPageSize(pageSize)
.withPageRowCountLimit(pageRowLimit)
.withDictionaryEncoding(dictionaryEnabled)
.withDictionaryPageSize(dictionaryPageSize);

for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
Expand All @@ -339,6 +344,8 @@ private static class Context {
private final int rowGroupSize;
private final int pageSize;
private final int pageRowLimit;

private final boolean dictionaryEnabled;
private final int dictionaryPageSize;
private final CompressionCodecName codec;
private final String compressionLevel;
Expand All @@ -351,6 +358,7 @@ private Context(
int rowGroupSize,
int pageSize,
int pageRowLimit,
boolean dictionaryEnabled,
int dictionaryPageSize,
CompressionCodecName codec,
String compressionLevel,
Expand All @@ -361,6 +369,7 @@ private Context(
this.rowGroupSize = rowGroupSize;
this.pageSize = pageSize;
this.pageRowLimit = pageRowLimit;
this.dictionaryEnabled = dictionaryEnabled;
this.dictionaryPageSize = dictionaryPageSize;
this.codec = codec;
this.compressionLevel = compressionLevel;
Expand All @@ -386,6 +395,10 @@ static Context dataContext(Map<String, String> config) {
config, PARQUET_PAGE_ROW_LIMIT, PARQUET_PAGE_ROW_LIMIT_DEFAULT);
Preconditions.checkArgument(pageRowLimit > 0, "Page row count limit must be > 0");

boolean dictionaryEnabled =
PropertyUtil.propertyAsBoolean(
config, PARQUET_DICT_ENABLED, PARQUET_DICT_ENABLED_DEFAULT);

int dictionaryPageSize =
PropertyUtil.propertyAsInt(
config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT);
Expand Down Expand Up @@ -429,6 +442,7 @@ static Context dataContext(Map<String, String> config) {
rowGroupSize,
pageSize,
pageRowLimit,
dictionaryEnabled,
dictionaryPageSize,
codec,
compressionLevel,
Expand Down Expand Up @@ -500,6 +514,7 @@ static Context deleteContext(Map<String, String> config) {
rowGroupSize,
pageSize,
pageRowLimit,
dataContext.dictionaryEnabled(),
dictionaryPageSize,
codec,
compressionLevel,
Expand Down Expand Up @@ -529,6 +544,10 @@ int pageRowLimit() {
return pageRowLimit;
}

boolean dictionaryEnabled() {
return dictionaryEnabled;
}

int dictionaryPageSize() {
return dictionaryPageSize;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.iceberg.parquet;

import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED;
import static org.apache.iceberg.avro.AvroSchemaUtil.convert;
import static org.apache.iceberg.expressions.Expressions.and;
import static org.apache.iceberg.expressions.Expressions.equal;
Expand Down Expand Up @@ -197,6 +198,7 @@ public void createInputFile() throws IOException {
try (FileAppender<Record> appender =
Parquet.write(outFile)
.schema(FILE_SCHEMA)
.set(PARQUET_DICT_ENABLED, "false")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@singhpk234 @Fokko, I don't think it makes sense to create an option that is public and must be supported by Iceberg moving forward just for this test case. Is it possible to set this with a Hadoop option or to do this some other way?

.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_id", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_double", "true")
Expand Down
3 changes: 2 additions & 1 deletion versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ org.apache.hadoop:* = 2.7.3
org.apache.hive:* = 2.3.9
org.apache.httpcomponents.client5:* = 5.2.1
org.apache.orc:* = 1.8.3
org.apache.parquet:* = 1.12.3
org.apache.parquet:* = 1.13.1
org.apache.pig:pig = 0.14.0
com.fasterxml.jackson.*:* = 2.14.1
com.google.code.findbugs:jsr305 = 3.0.2
Expand Down Expand Up @@ -49,3 +49,4 @@ org.eclipse.jetty:* = 9.4.43.v20210629
org.testcontainers:* = 1.17.6
io.delta:delta-core_* = 2.2.0
org.awaitility:awaitility = 4.2.0
commons-io:commons-io = 2.8.0