Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/src/main/sphinx/connector/hive.rst
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,12 @@ with Parquet files performed by the Hive connector.
definition. The equivalent catalog session property is
``parquet_use_column_names``.
- ``true``
* - ``parquet.optimized-writer.validation-percentage``
  - Percentage of Parquet files to validate after write by re-reading the whole file
    when ``parquet.experimental-optimized-writer.enabled`` is set to ``true``.
    The equivalent catalog session property is ``parquet_optimized_writer_validation_percentage``.
    Validation can be turned off by setting this property to ``0``.
  - ``5``


Metastore configuration properties
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.parquet;

import com.google.common.collect.ImmutableList;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.block.ColumnarArray;
import io.trino.spi.block.ColumnarMap;
import io.trino.spi.block.ColumnarRow;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.Type;

import java.util.List;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.block.ColumnarArray.toColumnarArray;
import static io.trino.spi.block.ColumnarMap.toColumnarMap;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;

/**
 * Accumulates per-leaf-column value counts from written blocks so they can be
 * compared against the counts recorded in a Parquet file during post-write
 * validation.
 * <p>
 * For nested types, the tree of {@code ColumnStatisticsValidation} instances
 * mirrors the structure of the {@link Type}; {@link #build()} flattens the
 * tree into one {@link ColumnStatistics} per leaf column.
 * <p>
 * Not thread-safe: callers are expected to feed blocks from a single thread.
 */
class ColumnStatisticsValidation
{
    private final Type type;
    // One child per type parameter; empty for primitive (leaf) types
    private final List<ColumnStatisticsValidation> fieldBuilders;

    // Running totals; only accumulated on leaf nodes (see addBlock below)
    private long valuesCount;
    private long nonLeafValuesCount;

    public ColumnStatisticsValidation(Type type)
    {
        this.type = requireNonNull(type, "type is null");
        this.fieldBuilders = type.getTypeParameters().stream()
                .map(ColumnStatisticsValidation::new)
                .collect(toImmutableList());
    }

    /**
     * Records counts for one top-level block of this column.
     */
    public void addBlock(Block block)
    {
        // A top-level block inherits no counts from enclosing nesting levels
        addBlock(block, new ColumnStatistics(0, 0));
    }

    /**
     * Returns the accumulated statistics, one entry per leaf column.
     */
    public List<ColumnStatistics> build()
    {
        if (fieldBuilders.isEmpty()) {
            return ImmutableList.of(new ColumnStatistics(valuesCount, nonLeafValuesCount));
        }
        // Non-leaf nodes keep no counts of their own; flatten the children's results
        return fieldBuilders.stream()
                .flatMap(builder -> builder.build().stream())
                .collect(toImmutableList());
    }

    /**
     * Recursively descends into {@code block}, carrying the counts contributed
     * by enclosing nesting levels (nulls and empty collections seen above this
     * level) down to the leaves, where they are added to the totals.
     *
     * @param columnStatistics counts inherited from enclosing nesting levels
     * @throws TrinoException if {@code type} has type parameters but is not an
     *         array, map, or row type
     */
    private void addBlock(Block block, ColumnStatistics columnStatistics)
    {
        if (fieldBuilders.isEmpty()) {
            // Leaf column: count this block's own positions and nulls, then
            // fold in everything inherited from the enclosing levels
            addPrimitiveBlock(block);
            valuesCount += columnStatistics.valuesCount();
            nonLeafValuesCount += columnStatistics.nonLeafValuesCount();
            return;
        }

        // Nested type: decompose the block into its child blocks and compute
        // this level's contribution (nulls/empties) to pass down
        List<Block> fields;
        ColumnStatistics mergedColumnStatistics;
        if (type instanceof ArrayType) {
            ColumnarArray columnarArray = toColumnarArray(block);
            fields = ImmutableList.of(columnarArray.getElementsBlock());
            mergedColumnStatistics = columnStatistics.merge(addArrayBlock(columnarArray));
        }
        else if (type instanceof MapType) {
            ColumnarMap columnarMap = toColumnarMap(block);
            // Key and value blocks correspond to the two fieldBuilders children
            fields = ImmutableList.of(columnarMap.getKeysBlock(), columnarMap.getValuesBlock());
            mergedColumnStatistics = columnStatistics.merge(addMapBlock(columnarMap));
        }
        else if (type instanceof RowType) {
            ColumnarRow columnarRow = ColumnarRow.toColumnarRow(block);
            ImmutableList.Builder<Block> fieldsBuilder = ImmutableList.builder();
            for (int index = 0; index < columnarRow.getFieldCount(); index++) {
                fieldsBuilder.add(columnarRow.getField(index));
            }
            fields = fieldsBuilder.build();
            mergedColumnStatistics = columnStatistics.merge(addRowBlock(columnarRow));
        }
        else {
            throw new TrinoException(NOT_SUPPORTED, format("Unsupported type: %s", type));
        }

        // Recurse into each child with the counts accumulated so far
        for (int i = 0; i < fieldBuilders.size(); i++) {
            fieldBuilders.get(i).addBlock(fields.get(i), mergedColumnStatistics);
        }
    }

    /**
     * Counts all positions of a leaf block, and its nulls as non-leaf values.
     */
    private void addPrimitiveBlock(Block block)
    {
        valuesCount += block.getPositionCount();
        if (!block.mayHaveNull()) {
            // No nulls possible, skip the per-position scan
            return;
        }
        int nullsCount = 0;
        for (int position = 0; position < block.getPositionCount(); position++) {
            nullsCount += block.isNull(position) ? 1 : 0;
        }
        nonLeafValuesCount += nullsCount;
    }

    /**
     * This level's contribution for a map block: each null or empty map adds
     * one value that never reaches the leaves (below the max definition level).
     */
    private static ColumnStatistics addMapBlock(ColumnarMap block)
    {
        if (!block.mayHaveNull()) {
            // Only empty maps can contribute when nulls are impossible
            int emptyEntriesCount = 0;
            for (int position = 0; position < block.getPositionCount(); position++) {
                emptyEntriesCount += block.getEntryCount(position) == 0 ? 1 : 0;
            }
            return new ColumnStatistics(emptyEntriesCount, emptyEntriesCount);
        }
        int nonLeafValuesCount = 0;
        for (int position = 0; position < block.getPositionCount(); position++) {
            nonLeafValuesCount += block.isNull(position) || block.getEntryCount(position) == 0 ? 1 : 0;
        }
        return new ColumnStatistics(nonLeafValuesCount, nonLeafValuesCount);
    }

    /**
     * This level's contribution for an array block: each null or empty array
     * adds one value that never reaches the leaves.
     */
    private static ColumnStatistics addArrayBlock(ColumnarArray block)
    {
        if (!block.mayHaveNull()) {
            // Only empty arrays can contribute when nulls are impossible
            int emptyEntriesCount = 0;
            for (int position = 0; position < block.getPositionCount(); position++) {
                emptyEntriesCount += block.getLength(position) == 0 ? 1 : 0;
            }
            return new ColumnStatistics(emptyEntriesCount, emptyEntriesCount);
        }
        int nonLeafValuesCount = 0;
        for (int position = 0; position < block.getPositionCount(); position++) {
            nonLeafValuesCount += block.isNull(position) || block.getLength(position) == 0 ? 1 : 0;
        }
        return new ColumnStatistics(nonLeafValuesCount, nonLeafValuesCount);
    }

    /**
     * This level's contribution for a row block: only nulls count, since a
     * non-null row always has all of its fields present at the next level.
     */
    private static ColumnStatistics addRowBlock(ColumnarRow block)
    {
        if (!block.mayHaveNull()) {
            return new ColumnStatistics(0, 0);
        }
        int nullsCount = 0;
        for (int position = 0; position < block.getPositionCount(); position++) {
            nullsCount += block.isNull(position) ? 1 : 0;
        }
        return new ColumnStatistics(nullsCount, nullsCount);
    }

    /**
     * @param valuesCount Count of values for a column field, including nulls, empty and defined values.
     * @param nonLeafValuesCount Count of non-leaf values for a column field, this is nulls count for primitives
     * and count of values below the max definition level for nested types
     */
    record ColumnStatistics(long valuesCount, long nonLeafValuesCount)
    {
        /**
         * Returns a new instance with the component-wise sums of both operands.
         */
        ColumnStatistics merge(ColumnStatistics other)
        {
            return new ColumnStatistics(
                    valuesCount + other.valuesCount(),
                    nonLeafValuesCount + other.nonLeafValuesCount());
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,14 @@ public ParquetCorruptionException(Throwable cause, String messageFormat, Object.
{
super(format(messageFormat, args), cause);
}

/**
 * Creates a corruption exception whose message carries the standard
 * "Malformed Parquet file" prefix and identifies the offending data source.
 *
 * @param dataSourceId identifier of the Parquet file being read, appended to the message
 * @param messageFormat {@link String#format} style format string for the failure details
 * @param args arguments for {@code messageFormat}
 */
public ParquetCorruptionException(ParquetDataSourceId dataSourceId, String messageFormat, Object... args)
{
    super(formatMessage(dataSourceId, messageFormat, args));
}

/**
 * Builds the standard corruption message: a fixed prefix, the formatted
 * failure details, and the data source id in square brackets.
 */
private static String formatMessage(ParquetDataSourceId dataSourceId, String messageFormat, Object[] args)
{
    String details = String.format(messageFormat, args);
    return String.format("Malformed Parquet file. %s [%s]", details, dataSourceId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,12 @@
*/
package io.trino.parquet;

import com.google.common.collect.ImmutableList;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.Type;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.io.ColumnIO;
Expand All @@ -31,10 +36,14 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static org.apache.parquet.io.ColumnIOUtil.columnDefinitionLevel;
import static org.apache.parquet.io.ColumnIOUtil.columnRepetitionLevel;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;

public final class ParquetTypeUtils
Expand Down Expand Up @@ -259,4 +268,50 @@ public static long getShortDecimalValue(byte[] bytes, int startOffset, int lengt

return value;
}

/**
 * Builds the reader {@link Field} descriptor for the given Trino {@code type}
 * backed by the given Parquet column structure, recursing into nested types.
 * <p>
 * Returns {@link Optional#empty()} when the column is absent from the file
 * ({@code columnIO == null}), when no field of a row type is present in the
 * file, or when a map/array group does not have the expected child count.
 *
 * @param type Trino type expected by the query
 * @param columnIO Parquet column structure for the same path, or {@code null} if missing
 */
public static Optional<Field> constructField(Type type, ColumnIO columnIO)
{
    if (columnIO == null) {
        return Optional.empty();
    }
    // REQUIRED and REPEATED both mean the value itself cannot be null
    boolean required = columnIO.getType().getRepetition() != OPTIONAL;
    int repetitionLevel = columnRepetitionLevel(columnIO);
    int definitionLevel = columnDefinitionLevel(columnIO);
    if (type instanceof RowType rowType) {
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        List<RowType.Field> fields = rowType.getFields();
        boolean structHasParameters = false;
        for (RowType.Field rowField : fields) {
            // Lowercased lookup — presumably Parquet field name matching is
            // case-insensitive here; confirm against lookupColumnByName
            String name = rowField.getName().orElseThrow().toLowerCase(Locale.ENGLISH);
            // A missing struct member stays as Optional.empty() in the list,
            // preserving positional alignment with the row type's fields
            Optional<Field> field = constructField(rowField.getType(), lookupColumnByName(groupColumnIO, name));
            structHasParameters |= field.isPresent();
            fieldsBuilder.add(field);
        }
        if (structHasParameters) {
            return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fieldsBuilder.build()));
        }
        // None of the struct's fields exist in the file: treat the whole struct as missing
        return Optional.empty();
    }
    if (type instanceof MapType mapType) {
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO);
        if (keyValueColumnIO.getChildrenCount() != 2) {
            // Malformed map group (expected exactly key + value children)
            return Optional.empty();
        }
        Optional<Field> keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0));
        Optional<Field> valueField = constructField(mapType.getValueType(), keyValueColumnIO.getChild(1));
        return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(keyField, valueField)));
    }
    if (type instanceof ArrayType arrayType) {
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        if (groupColumnIO.getChildrenCount() != 1) {
            // Malformed list group (expected a single repeated child)
            return Optional.empty();
        }
        Optional<Field> field = constructField(arrayType.getElementType(), getArrayElementColumn(groupColumnIO.getChild(0)));
        return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(field)));
    }
    // Leaf: any non-nested type maps directly onto a primitive Parquet column
    PrimitiveColumnIO primitiveColumnIO = (PrimitiveColumnIO) columnIO;
    return Optional.of(new PrimitiveField(type, required, primitiveColumnIO.getColumnDescriptor(), primitiveColumnIO.getId()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,12 @@ public static void validateParquet(boolean condition, String formatString, Objec
throw new ParquetCorruptionException(format(formatString, args));
}
}

/**
 * Verifies that {@code condition} holds, otherwise reports the Parquet file
 * identified by {@code dataSourceId} as corrupted.
 *
 * @param condition expected to be {@code true} for a well-formed file
 * @param dataSourceId identifier of the Parquet file being validated
 * @param formatString {@link String#format} style format string for the failure details
 * @param args arguments for {@code formatString}
 * @throws ParquetCorruptionException if {@code condition} is {@code false}
 */
public static void validateParquet(boolean condition, ParquetDataSourceId dataSourceId, String formatString, Object... args)
        throws ParquetCorruptionException
{
    if (condition) {
        return;
    }
    throw new ParquetCorruptionException(dataSourceId, formatString, args);
}
}
Loading