From eddc8a40379d99fe45d62a11d2c000df4df46719 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 26 Jun 2020 17:23:44 -0700 Subject: [PATCH] Use Java reflection to get ORC date stats. --- .../org/apache/iceberg/orc/OrcMetrics.java | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java index e1cba36d5737..86b76972a346 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.sql.Date; import java.sql.Timestamp; import java.util.List; import java.util.Map; @@ -29,9 +28,11 @@ import java.util.Queue; import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.Metrics; import org.apache.iceberg.Schema; +import org.apache.iceberg.common.DynFields; import org.apache.iceberg.exceptions.RuntimeIOException; import org.apache.iceberg.hadoop.HadoopInputFile; import org.apache.iceberg.io.InputFile; @@ -43,7 +44,6 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; import org.apache.orc.BooleanColumnStatistics; import org.apache.orc.ColumnStatistics; import org.apache.orc.DateColumnStatistics; @@ -55,6 +55,7 @@ import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; +import org.apache.orc.impl.ColumnStatisticsImpl; public class OrcMetrics { @@ -157,9 +158,7 @@ private static Optional fromOrcMin(Types.NestedField column, .setScale(((Types.DecimalType) column.type()).scale())) .orElse(null); } else if (columnStats instanceof DateColumnStatistics) { - min = 
Optional.ofNullable(((DateColumnStatistics) columnStats).getMinimum()) - .map(minStats -> DateTimeUtil.daysFromDate(((Date) minStats).toLocalDate())) - .orElse(null); + min = Optional.ofNullable(minDayFromEpoch((DateColumnStatistics) columnStats)).orElse(null); } else if (columnStats instanceof TimestampColumnStatistics) { TimestampColumnStatistics tColStats = (TimestampColumnStatistics) columnStats; Timestamp minValue = tColStats.getMinimumUTC(); @@ -195,9 +194,7 @@ private static Optional fromOrcMax(Types.NestedField column, .setScale(((Types.DecimalType) column.type()).scale())) .orElse(null); } else if (columnStats instanceof DateColumnStatistics) { - max = Optional.ofNullable(((DateColumnStatistics) columnStats).getMaximum()) - .map(maxStats -> DateTimeUtil.daysFromDate(((Date) maxStats).toLocalDate())) - .orElse(null); + max = Optional.ofNullable(maxDayFromEpoch((DateColumnStatistics) columnStats)).orElse(null); } else if (columnStats instanceof TimestampColumnStatistics) { TimestampColumnStatistics tColStats = (TimestampColumnStatistics) columnStats; Timestamp maxValue = tColStats.getMaximumUTC(); @@ -272,4 +269,27 @@ public TypeDescription primitive(Type.PrimitiveType iPrimitive, TypeDescription return primitive; } } + + private static final Class DATE_STATS_IMPL = Stream.of(ColumnStatisticsImpl.class.getDeclaredClasses()) + .filter(statsClass -> "DateStatisticsImpl".equals(statsClass.getSimpleName())) + .findFirst() + .orElse(null); + + private static final DynFields.UnboundField DATE_MINIMUM = DynFields.builder() + .hiddenImpl(DATE_STATS_IMPL, "minimum") + .defaultAlwaysNull() // if the minimum field isn't found, don't add a value + .build(); + + private static final DynFields.UnboundField DATE_MAXIMUM = DynFields.builder() + .hiddenImpl(DATE_STATS_IMPL, "maximum") + .defaultAlwaysNull() // if the maximum field isn't found, don't add a value + .build(); + + private static Integer minDayFromEpoch(DateColumnStatistics stats) { + return 
DATE_MINIMUM.get(stats); + } + + private static Integer maxDayFromEpoch(DateColumnStatistics stats) { + return DATE_MAXIMUM.get(stats); + } }