-
Notifications
You must be signed in to change notification settings - Fork 3k
Avro metrics support: track metrics in Avro value writers #1963
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
91cf508
39efa9f
70821cc
c25cb9c
63434f9
422dc24
19ab7d2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,10 +95,9 @@ public void close() throws IOException { | |
|
|
||
| @SuppressWarnings("unchecked") | ||
| private static <D> DataFileWriter<D> newAvroWriter( | ||
| Schema schema, PositionOutputStream stream, DatumWriter<?> metricsAwareDatumWriter, | ||
| Schema schema, PositionOutputStream stream, DatumWriter<?> datumWriter, | ||
| CodecFactory codec, Map<String, String> metadata) throws IOException { | ||
| DataFileWriter<D> writer = new DataFileWriter<>( | ||
| (DatumWriter<D>) metricsAwareDatumWriter); | ||
| DataFileWriter<D> writer = new DataFileWriter<>((DatumWriter<D>) datumWriter); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this rename needed? While I support cleaning up names, I would generally opt to leave these as-is to have smaller commits that are less likely to cause conflicts. |
||
|
|
||
| writer.setCodec(codec); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,10 +19,21 @@ | |
|
|
||
| package org.apache.iceberg.avro; | ||
|
|
||
| import java.nio.ByteBuffer; | ||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
| import java.util.Optional; | ||
| import org.apache.avro.io.DatumWriter; | ||
| import org.apache.iceberg.FieldMetrics; | ||
| import org.apache.iceberg.Metrics; | ||
| import org.apache.iceberg.MetricsConfig; | ||
| import org.apache.iceberg.MetricsModes; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.expressions.Literal; | ||
| import org.apache.iceberg.types.Conversions; | ||
| import org.apache.iceberg.types.Type; | ||
| import org.apache.iceberg.util.BinaryUtil; | ||
| import org.apache.iceberg.util.UnicodeUtil; | ||
|
|
||
| public class AvroMetrics { | ||
|
|
||
|
|
@@ -31,7 +42,112 @@ private AvroMetrics() { | |
|
|
||
| static Metrics fromWriter(DatumWriter<?> datumWriter, Schema schema, long numRecords, | ||
| MetricsConfig inputMetricsConfig) { | ||
| // TODO will populate in following PRs if datum writer is a MetricsAwareDatumWriter | ||
| return new Metrics(numRecords, null, null, null); | ||
| if (!(datumWriter instanceof MetricsAwareDatumWriter)) { | ||
| return new Metrics(numRecords, null, null, null, null); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is |
||
| } | ||
|
|
||
| MetricsAwareDatumWriter<?> metricsAwareDatumWriter = (MetricsAwareDatumWriter<?>) datumWriter; | ||
| MetricsConfig metricsConfig; | ||
| if (inputMetricsConfig == null) { | ||
| metricsConfig = MetricsConfig.getDefault(); | ||
| } else { | ||
| metricsConfig = inputMetricsConfig; | ||
| } | ||
|
|
||
| Map<Integer, Long> valueCounts = new HashMap<>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: we typically prefer |
||
| Map<Integer, Long> nullValueCounts = new HashMap<>(); | ||
| Map<Integer, Long> nanValueCounts = new HashMap<>(); | ||
| Map<Integer, ByteBuffer> lowerBounds = new HashMap<>(); | ||
| Map<Integer, ByteBuffer> upperBounds = new HashMap<>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thing to consider is that quite a bit of this method and the helper methods below is generic and could be written for |
||
|
|
||
| metricsAwareDatumWriter.metrics().forEach(metrics -> { | ||
| String columnName = schema.findColumnName(metrics.id()); | ||
| MetricsModes.MetricsMode metricsMode = metricsConfig.columnMode(columnName); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add a method to look up metrics mode by ID? |
||
| if (metricsMode == MetricsModes.None.get()) { | ||
| return; | ||
| } | ||
|
|
||
| valueCounts.put(metrics.id(), metrics.valueCount()); | ||
| nullValueCounts.put(metrics.id(), metrics.nullValueCount()); | ||
| Type type = schema.findType(metrics.id()); | ||
|
|
||
| if (type.typeId() == Type.TypeID.FLOAT || type.typeId() == Type.TypeID.DOUBLE) { | ||
| nanValueCounts.put(metrics.id(), metrics.nanValueCount()); | ||
| } | ||
|
|
||
| if (metricsMode == MetricsModes.Counts.get()) { | ||
| return; | ||
| } | ||
|
|
||
| updateLowerBound(metrics, type, metricsMode).ifPresent(lowerBound -> lowerBounds.put(metrics.id(), lowerBound)); | ||
| updateUpperBound(metrics, type, metricsMode).ifPresent(upperBound -> upperBounds.put(metrics.id(), upperBound)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't quite understand the decision to return an option instead of just passing |
||
| }); | ||
|
|
||
| return new Metrics(numRecords, null, | ||
| valueCounts, nullValueCounts, nanValueCounts, lowerBounds, upperBounds); | ||
| } | ||
|
|
||
| private static Optional<ByteBuffer> updateLowerBound(FieldMetrics metrics, Type type, | ||
| MetricsModes.MetricsMode metricsMode) { | ||
| if (metrics.lowerBound() == null) { | ||
| return Optional.empty(); | ||
| } | ||
|
|
||
| Object lowerBound = metrics.lowerBound(); | ||
| if (metricsMode instanceof MetricsModes.Truncate) { | ||
| MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode; | ||
| int truncateLength = truncateMode.length(); | ||
| switch (type.typeId()) { | ||
| case STRING: | ||
| lowerBound = UnicodeUtil.truncateStringMin( | ||
| Literal.of((CharSequence) metrics.lowerBound()), truncateLength).value(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see that the |
||
| break; | ||
| case FIXED: | ||
| case BINARY: | ||
| lowerBound = BinaryUtil.truncateBinaryMin( | ||
| Literal.of((ByteBuffer) metrics.lowerBound()), truncateLength).value(); | ||
| break; | ||
| default: | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| return Optional.ofNullable(Conversions.toByteBuffer(type, lowerBound)); | ||
| } | ||
|
|
||
| private static Optional<ByteBuffer> updateUpperBound(FieldMetrics metrics, Type type, | ||
| MetricsModes.MetricsMode metricsMode) { | ||
| if (metrics.upperBound() == null) { | ||
| return Optional.empty(); | ||
| } | ||
|
|
||
| Object upperBound = null; | ||
| if (metricsMode instanceof MetricsModes.Truncate) { | ||
| MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode; | ||
| int truncateLength = truncateMode.length(); | ||
| switch (type.typeId()) { | ||
| case STRING: | ||
| upperBound = Optional.ofNullable( | ||
| UnicodeUtil.truncateStringMax(Literal.of((CharSequence) metrics.upperBound()), truncateLength)) | ||
| .map(Literal::value) | ||
| .orElse(null); | ||
| break; | ||
| case FIXED: | ||
| case BINARY: | ||
| upperBound = Optional.ofNullable( | ||
| BinaryUtil.truncateBinaryMax(Literal.of((ByteBuffer) metrics.upperBound()), truncateLength)) | ||
| .map(Literal::value) | ||
| .orElse(null); | ||
| break; | ||
| default: | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (upperBound == null) { | ||
| upperBound = metrics.upperBound(); | ||
| } | ||
|
|
||
| return Optional.ofNullable(Conversions.toByteBuffer(type, upperBound)); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,7 +41,11 @@ public static <T> T visit(Schema schema, AvroSchemaVisitor<T> visitor) { | |
| List<T> results = Lists.newArrayListWithExpectedSize(fields.size()); | ||
| for (Schema.Field field : schema.getFields()) { | ||
| names.add(field.name()); | ||
| visitor.beforeField(field.name(), field.schema(), schema); | ||
|
|
||
| T result = visitWithName(field.name(), field.schema(), visitor); | ||
| visitor.afterField(field.name(), field.schema(), schema); | ||
|
|
||
| results.add(result); | ||
| } | ||
|
|
||
|
|
@@ -59,13 +63,22 @@ public static <T> T visit(Schema schema, AvroSchemaVisitor<T> visitor) { | |
|
|
||
| case ARRAY: | ||
| if (schema.getLogicalType() instanceof LogicalMap) { | ||
| return visitor.array(schema, visit(schema.getElementType(), visitor)); | ||
| T result = visit(schema.getElementType(), visitor); | ||
| return visitor.array(schema, result); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't look like this change is needed? |
||
| } else { | ||
| return visitor.array(schema, visitWithName("element", schema.getElementType(), visitor)); | ||
| visitor.beforeListElement("element", schema.getElementType(), schema); | ||
| T result = visitWithName("element", schema.getElementType(), visitor); | ||
| visitor.afterListElement("element", schema.getElementType(), schema); | ||
|
|
||
| return visitor.array(schema, result); | ||
| } | ||
|
|
||
| case MAP: | ||
| return visitor.map(schema, visitWithName("value", schema.getValueType(), visitor)); | ||
| visitor.beforeMapValue("value", schema.getValueType(), schema); | ||
| T result = visitWithName("value", schema.getValueType(), visitor); | ||
| visitor.afterMapValue("value", schema.getValueType(), schema); | ||
|
|
||
| return visitor.map(schema, result); | ||
|
|
||
| default: | ||
| return visitor.primitive(schema); | ||
|
|
@@ -107,4 +120,26 @@ public T map(Schema map, T value) { | |
| public T primitive(Schema primitive) { | ||
| return null; | ||
| } | ||
|
|
||
| public void beforeField(String name, Schema type, Schema parentSchema) { | ||
| } | ||
|
|
||
| public void afterField(String name, Schema type, Schema parentSchema) { | ||
| } | ||
|
|
||
| public void beforeListElement(String name, Schema type, Schema parentSchema) { | ||
| beforeField(name, type, parentSchema); | ||
| } | ||
|
|
||
| public void afterListElement(String name, Schema type, Schema parentSchema) { | ||
| afterField(name, type, parentSchema); | ||
| } | ||
|
|
||
| public void beforeMapValue(String name, Schema type, Schema parentSchema) { | ||
| beforeField(name, type, parentSchema); | ||
| } | ||
|
|
||
| public void afterMapValue(String name, Schema type, Schema parentSchema) { | ||
| afterField(name, type, parentSchema); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After reviewing #2464, I think I understand why
Numberis used here instead ofFloatorDouble, but I think it would be better to make eachFieldMetricsclass specific to a value type. This will probably be done when #2464 is merged and this is rebased for float, but in the mean time you may want to update this for any other type metrics you're introducing in this PR.