Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,12 @@ private void printColumnChunk(Logger console, int width, ColumnChunkMetaData col
if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()),
humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
minMaxAsString(stats, type.getOriginalType())));
} else {
console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s",
name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
stats == null ? "" : String.valueOf(stats.getNumNulls()),
stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
minMaxAsString(stats, type.getOriginalType())));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ public String visit(DataPageV1 page) {
String enc = encodingAsString(page.getValueEncoding(), false);
long totalSize = page.getCompressedSize();
int count = page.getValueCount();
long numNulls = page.getStatistics().getNumNulls();
String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls()) : "";
float perValue = ((float) totalSize) / count;
String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType());
return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,44 @@
*/
public abstract class Statistics<T extends Comparable<T>> {

/**
* Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
*/
public static class Builder {
private final PrimitiveType type;
private byte[] min;
private byte[] max;
private long numNulls = -1;

private Builder(PrimitiveType type) {
this.type = type;
}

public Builder withMin(byte[] min) {
this.min = min;
return this;
}

public Builder withMax(byte[] max) {
this.max = max;
return this;
}

public Builder withNumNulls(long numNulls) {
this.numNulls = numNulls;
return this;
}

public Statistics<?> build() {
Statistics<?> stats = createStats(type);
if (min != null && max != null) {
stats.setMinMaxFromBytes(min, max);
}
stats.num_nulls = this.numNulls;
return stats;
}
}

private final PrimitiveType type;
private final PrimitiveComparator<T> comparator;
private boolean hasNonNullValue;
Expand Down Expand Up @@ -109,6 +147,17 @@ public static Statistics<?> createStats(Type type) {
}
}

/**
* Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
*
* @param type
* type of the column
* @return builder to create new statistics object
*/
public static Builder getBuilder(PrimitiveType type) {
return new Builder(type);
}

/**
* updates statistics min and max using the passed value
* @param value value to use to update min and max
Expand Down Expand Up @@ -217,7 +266,9 @@ public void mergeStatistics(Statistics stats) {
* Abstract method to set min and max values from byte arrays.
* @param minBytes byte array to set the min value to
* @param maxBytes byte array to set the max value to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
*/
@Deprecated
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);

/**
Expand Down Expand Up @@ -310,9 +361,13 @@ public String maxAsString() {

@Override
public String toString() {
if (this.hasNonNullValue())
return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
else if (!this.isEmpty())
if (this.hasNonNullValue()) {
if (isNumNullsSet()) {
return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
} else {
return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
}
} else if (!this.isEmpty())
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
else
return "no stats for this column";
Expand All @@ -335,16 +390,20 @@ public void incrementNumNulls(long increment) {

/**
* Returns the null count
* @return null count
* @return null count or {@code -1} if the null count is not set
*/
public long getNumNulls() {
return num_nulls;
}

/**
* Sets the number of nulls to the parameter value
* @param nulls null count to set the count to
*
* @param nulls
* null count to set the count to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
*/
@Deprecated
public void setNumNulls(long nulls) {
num_nulls = nulls;
}
Expand All @@ -355,7 +414,7 @@ public void setNumNulls(long nulls) {
* @return true if object is empty, false otherwise
*/
public boolean isEmpty() {
return !hasNonNullValue && num_nulls == 0;
return !hasNonNullValue && !isNumNullsSet();
}

/**
Expand All @@ -365,6 +424,13 @@ public boolean hasNonNullValue() {
return hasNonNullValue;
}

/**
* @return whether numNulls is set and can be used
*/
public boolean isNumNullsSet() {
return num_nulls >= 0;
}

/**
* Sets the page/column as having a valid non-null value
* kind of misnomer here
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public class TestStatistics {
@Test
public void testNumNulls() {
IntStatistics stats = new IntStatistics();
assertTrue(stats.isNumNullsSet());
assertEquals(stats.getNumNulls(), 0);

stats.incrementNumNulls();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

import static org.apache.parquet.Preconditions.checkArgument;
import static org.apache.parquet.Preconditions.checkNotNull;

/**
Expand Down Expand Up @@ -122,6 +121,10 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
}

if (value == null) {
// We don't know anything about the nulls in this chunk
if (!stats.isNumNullsSet()) {
return BLOCK_MIGHT_MATCH;
}
// we are looking for records where v eq(null)
// so drop if there are no nulls in this chunk
return !hasNulls(meta);
Expand All @@ -133,6 +136,11 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

// drop if value < min || value > max
return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0;
}
Expand Down Expand Up @@ -166,12 +174,17 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
return isAllNulls(meta);
}

if (hasNulls(meta)) {
if (stats.isNumNullsSet() && hasNulls(meta)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(optional) You might consider adding the "stats.isNumNullsSet()" clause to the body of hasNulls() itself, but I don't really have a strong opinion on this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would not work correctly because hasNulls is called negated as well.

// we are looking for records where v notEq(someNonNull)
// but this chunk contains nulls, we cannot drop it
return BLOCK_MIGHT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

// drop if this is a column where min = max = value
return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0;
}
Expand Down Expand Up @@ -201,6 +214,11 @@ public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = lt.getValue();

// drop if value <= min
Expand Down Expand Up @@ -232,6 +250,11 @@ public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = ltEq.getValue();

// drop if value < min
Expand Down Expand Up @@ -263,6 +286,11 @@ public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = gt.getValue();

// drop if value >= max
Expand Down Expand Up @@ -294,6 +322,11 @@ public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) {
return BLOCK_CANNOT_MATCH;
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

T value = gtEq.getValue();

// drop if value > max
Expand Down Expand Up @@ -355,6 +388,11 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
}
}

if (!stats.hasNonNullValue()) {
// stats does not contain min/max values, we cannot drop any chunks
return BLOCK_MIGHT_MATCH;
}

org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(),
stats.comparator());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,17 +401,21 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
(String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) {
// create stats object based on the column type
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type);
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
org.apache.parquet.column.statistics.Statistics.getBuilder(type);

if (formatStats != null) {
// Use the new V2 min-max statistics over the former one if it is filled
if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) {
byte[] min = formatStats.min_value.array();
byte[] max = formatStats.max_value.array();
if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) {
stats.setMinMaxFromBytes(min, max);
statsBuilder.withMin(min);
statsBuilder.withMax(max);
}
if (formatStats.isSetNull_count()) {
statsBuilder.withNumNulls(formatStats.null_count);
}
stats.setNumNulls(formatStats.null_count);
} else {
boolean isSet = formatStats.isSetMax() && formatStats.isSetMin();
boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false;
Expand All @@ -424,13 +428,16 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) &&
(sortOrdersMatch || maxEqualsMin)) {
if (isSet) {
stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array());
statsBuilder.withMin(formatStats.min.array());
statsBuilder.withMax(formatStats.max.array());
}
if (formatStats.isSetNull_count()) {
statsBuilder.withNumNulls(formatStats.null_count);
}
stats.setNumNulls(formatStats.null_count);
}
}
}
return stats;
return statsBuilder.build();
}

public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
Expand Down
Loading